mkvtoolnix/rake.d/iso3166.rb
Moritz Bunkus 65752aedcc
BCP47: regions: include fact whether entries are deprecated
Part of the implementation of #3307.
2022-03-26 13:40:09 +01:00

206 lines
5.5 KiB
Ruby
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Regenerates "src/common/iso3166_country_list.cpp" (relative to $source_dir),
# the C++ table of ISO 3166 countries and UN M.49 regions.
#
# Data is merged from four sources, in this order:
#
#   1. the Wikipedia "List of ISO 3166 country codes" table,
#   2. the UN M.49 overview table (regions plus countries/areas),
#   3. a fixed list of user-assigned alpha-2 codes,
#   4. the "region" records of the IANA language subtag registry, which also
#      decide the per-entry "deprecated" flag.
#
# Entries are collected in one hash that is keyed both by alpha-2 code
# (String) and by numeric code (Integer). A country reachable through both
# keys shares a single entry hash, which is why the values are passed through
# `uniq` before the output rows are built.
#
# Takes no arguments. Downloads the source documents and writes the generated
# file via `runq`.
def create_iso3166_country_list_file
  countries_regions = {}

  # Single place that defines the shape of an entry so that all data sources
  # produce identical keys.
  new_entry = lambda do |number, alpha_2_code, alpha_3_code, name, official_name = ""|
    {
      :number        => number,
      :alpha_2_code  => alpha_2_code,
      :alpha_3_code  => alpha_3_code,
      :name          => name,
      :official_name => official_name,
    }
  end

  # ---- 1. Wikipedia ------------------------------------------------------

  iso3166_content = Mtx::OnlineFile.download("https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes")

  # The first two extracted rows are skipped (presumably table headings) and
  # only rows with at least the eight columns used below are kept.
  #
  # Each cell is reduced to plain text: styles, images and remaining tags are
  # removed, whitespace runs are collapsed and the result is trimmed.
  #
  # NOTE(review): the regex on the line directly after the tag-stripping
  # `<[^>]+>` substitution apparently contains an invisible/non-ASCII
  # character in the upstream file (the hosting UI warns about invisible
  # Unicode characters). If it shows up here as a plain ASCII space it was
  # flattened in transit and must be restored before committing, otherwise
  # every space would be deleted. Consider spelling it as a "\u…" escape.
  parse_html_extract_table_data(iso3166_content, %r{^.*?<table[^>]*>}i).
    drop(2).
    reject { |row| row.length < 8 }.
    each do |row|
      row = row.map do |column|
        column.
          gsub(%r{<style>.*?</style>}, '').
          gsub(%r{<a><img></a>}, ', ').
          gsub(%r{<img>}, '').
          gsub(%r{>_<}, '><').
          gsub(%r{<[^>]+>}, '').
          gsub(%r{ }, '').
          gsub(%r{^, +}, '').
          gsub(%r{[[:space:]]+}, ' ').
          gsub(%r{^ +| +$}, '')
      end

      # 0 ["Albania",
      # 1 "The Republic of Albania",
      # 2 "UN member state ",
      # 3 "AL",
      # 4 "ALB",
      # 5 "008",
      # 6 "ISO 3166-2:AL",
      # 7 ".al"],

      m49_code = row[5].to_i

      # Drop bracketed footnote markers and a trailing "(the)"; turn the last
      # comma of a comma-separated list into " and".
      name = row[0].
        gsub(%r{ *\[.*?\]}, '').
        gsub(%r{ *\(the?\)$}i, '').
        gsub(%r{(,.+?),([^,]+)$}, '\1 and\2')

      # Drop bracketed footnote markers, a trailing parenthesised remark and a
      # leading "The".
      official_name = row[1].
        gsub(%r{ *\[.*?\]}, '').
        gsub(%r{ *\(.*?\)$}i, '').
        gsub(%r{^The +}, '')

      # The official name is only stored if it differs from the short name.
      entry = new_entry.call(m49_code, row[3], row[4], name, name == official_name ? "" : official_name)

      countries_regions[row[3]]   = entry
      countries_regions[m49_code] = entry
    end

  # pp(countries_regions); exit 42

  # ---- 2. UN M.49 --------------------------------------------------------

  m49_content = Mtx::OnlineFile.download("https://unstats.un.org/unsd/methodology/m49/overview/", "m49_list.txt")
  m49_data    = parse_html_extract_table_data(m49_content, %r{^.*?<table[^>]+downloadTableEN[^>]*>}i)

  # The first row holds the column titles. They are normalized to
  # lower-case, underscore-separated identifiers and indexed by column
  # position. `shift` also removes that row from `m49_data`.
  headers = m49_data.
    shift.
    each_with_index.
    map { |text, idx| [ idx, text.downcase.gsub(%r{[^a-z0-9]+}, '_').gsub(%r{^_|_$}, '') ] }.
    to_h

  # Registers the region named by the "<type>_code"/"<type>_name" columns of
  # a row unless the name is blank or the code is already known.
  # NOTE(review): `blank?` is not core Ruby; presumably provided by the
  # project's own extensions.
  maybe_add = lambda do |row, type|
    code = row["#{type}_code"].to_i
    name = row["#{type}_name"]

    return if name.blank? || countries_regions[code]

    countries_regions[code] = new_entry.call(code, "", "", name)
  end

  m49_data.each do |cells|
    # Turn the positional cells into a hash keyed by normalized column title.
    row = cells.each_with_index.map { |text, idx| [ headers[idx], text ] }.to_h

    %w{global region sub_region intermediate_region}.each { |type| maybe_add.call(row, type) }

    code  = row["m49_code"].to_i
    entry = new_entry.call(code, row["iso_alpha2_code"], row["iso_alpha3_code"], row["country_or_area"])

    # Entries already taken from Wikipedia win over the M.49 ones.
    countries_regions[code]                   ||= entry
    countries_regions[row["iso_alpha2_code"]] ||= entry
  end

  # ---- 3. User-assigned alpha-2 codes ------------------------------------

  # AA, ZZ, QM–QZ and XA–XZ; these unconditionally replace existing entries
  # stored under the same alpha-2 key.
  user_assigned = [ 'AA', 'ZZ' ] \
    + ('M'..'Z').map { |letter| "Q#{letter}" } \
    + ('A'..'Z').map { |letter| "X#{letter}" }

  user_assigned.each do |code|
    countries_regions[code] = new_entry.call(0, code, "", "User-assigned")
  end

  # ---- 4. IANA language subtag registry ----------------------------------

  # Range records (subtags containing "..") are ignored. Purely numeric
  # subtags are looked up by number, all others by code. Unknown regions are
  # added; every region present in the registry gets its deprecation state
  # recorded.
  Mtx::IANALanguageSubtagRegistry.
    fetch_registry["region"].
    reject { |entry| %r{\.\.}.match?(entry[:subtag]) }.
    each do |entry|
      subtag = entry[:subtag]

      if %r{\A[0-9]+\z}.match?(subtag)
        number = subtag.to_i # String#to_i ignores leading zeros
        code   = ""
        idx    = number
      else
        number = 0
        code   = subtag
        idx    = code
      end

      unless countries_regions.key?(idx)
        countries_regions[idx] = new_entry.call(number, code, "", entry[:description])
      end

      countries_regions[idx][:deprecated] = entry.key?(:deprecated)
    end

  # ---- Output ------------------------------------------------------------

  # `uniq` collapses entries that are stored under both their alpha-2 and
  # their numeric key. Entries that never appeared in the IANA registry have
  # no :deprecated key and are emitted as "false".
  #
  # NOTE(review): the rows are fully re-sorted further down, so this first
  # sort looks redundant; it is kept so the generated output cannot change.
  rows = countries_regions.
    values.
    uniq.
    sort_by { |entry| [ entry[:alpha_2_code], entry[:alpha_3_code], entry[:number] ] }.
    map do |entry|
      [ entry[:alpha_2_code].upcase.to_cpp_string,
        entry[:alpha_3_code].upcase.to_cpp_string,
        sprintf('%3d', entry[:number]),
        entry[:name].to_u8_cpp_string,
        entry[:official_name].to_u8_cpp_string,
        (entry[:deprecated] || false).to_s,
      ]
    end

  # The heredoc bodies are emitted verbatim into the generated file and
  # therefore start in column 0.
  header = <<EOT
/*
mkvmerge -- utility for splicing together matroska files
from component media subtypes
Distributed under the GPL v2
see the file COPYING for details
or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
ISO 3166 countries & UN M.49 regions
Written by Moritz Bunkus <moritz@bunkus.org>.
*/
// ------------------------------------------------------------------------
// NOTE: this file is auto-generated by the "dev:iso3166_list" rake target.
// ------------------------------------------------------------------------
#include "common/common_pch.h"
#include "common/iso3166.h"
namespace mtx::iso3166 {
std::vector<region_t> g_regions;
void
init() {
g_regions.reserve(#{rows.size});
EOT

  footer = <<EOT
}
} // namespace mtx::iso3166
EOT

  # Final order: alpha-2 code, alpha-3 code, then name, compared as the
  # already converted C++ string literals.
  rows          = rows.sort_by { |row| [ row[0], row[1], row[3] ].join('::') }
  content       = header + format_table(rows, :column_suffix => ',', :row_prefix => " g_regions.emplace_back(", :row_suffix => ");").join("\n") + "\n" + footer
  cpp_file_name = "src/common/iso3166_country_list.cpp"

  # The block returns 0 after writing the file.
  runq("write", cpp_file_name) { File.write("#{$source_dir}/#{cpp_file_name}", content); 0 }
end