mkvtoolnix/rake.d/iso3166.rb
2021-07-19 14:15:03 +02:00

173 lines
4.6 KiB
Ruby
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Regenerates the C++ table of ISO 3166 countries and UN M.49 regions.
#
# Steps, in order:
#   1. Download Wikipedia's "List of ISO 3166 country codes" and record one
#      entry per table row, keyed by the numeric code.
#   2. Download the UN M49 overview table and add every region and
#      country/area whose numeric code has not been recorded yet; entries
#      from step 1 are never overwritten.
#   3. Append the user-assigned alpha-2 codes (AA, ZZ, QM..QZ, XA..XZ), all
#      with number 0.
#   4. Render the entries as `g_regions.emplace_back(...)` lines, sorted by
#      alpha-2 code, alpha-3 code and name, and write the result to
#      src/common/iso3166_country_list.cpp below $source_dir.
#
# Returns whatever the final `runq` call returns.
def create_iso3166_country_list_file
  # All sources end up as hashes with the same five fields.
  make_entry = lambda do |number, alpha_2_code, alpha_3_code, name, official_name|
    {
      number:        number,
      alpha_2_code:  alpha_2_code,
      alpha_3_code:  alpha_3_code,
      name:          name,
      official_name: official_name,
    }
  end

  # Substitutions applied, in this exact order, to each cell of the Wikipedia
  # table: markup is removed first, whitespace is normalized last.
  cell_cleanups = [
    [ %r{<style>.*?</style>}, '' ],
    [ %r{<a><img></a>}, ', ' ],
    [ %r{<img>}, '' ],
    [ %r{>_<}, '><' ],
    [ %r{<[^>]+>}, '' ],
    # NOTE(review): the next pattern is a single character that renders as a
    # blank; confirm which code point it is before editing this line.
    [ %r{ }, '' ],
    [ %r{^, +}, '' ],
    [ %r{[[:space:]]+}, ' ' ],
    [ %r{^ +| +$}, '' ],
  ]

  countries_regions = {}

  # --- Step 1: ISO 3166 countries from Wikipedia ---------------------------
  iso3166_content = Mtx::OnlineFile.download("https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes")
  iso3166_rows    = parse_html_extract_table_data(iso3166_content, %r{^.*?<table[^>]*>}i).drop(2)

  iso3166_rows.each do |raw_row|
    # Rows with fewer than eight columns are not country rows.
    next if raw_row.length < 8

    cells = raw_row.map do |cell|
      cell_cleanups.inject(cell) { |text, (pattern, replacement)| text.gsub(pattern, replacement) }
    end

    # Example of a cleaned row:
    #   0 "Albania"
    #   1 "The Republic of Albania"
    #   2 "UN member state"
    #   3 "AL"
    #   4 "ALB"
    #   5 "008"
    #   6 "ISO 3166-2:AL"
    #   7 ".al"
    m49_code = cells[5].to_i

    # Drop bracketed footnotes and a trailing "(the)"/"(th)", then turn the
    # last comma of a comma-separated list into " and".
    name = cells[0].gsub(%r{ *\[.*?\]}, '')
    name = name.gsub(%r{ *\(the?\)$}i, '')
    name = name.gsub(%r{(,.+?),([^,]+)$}, '\1 and\2')

    # Drop bracketed footnotes, a trailing parenthesized part and a leading "The".
    official_name = cells[1].gsub(%r{ *\[.*?\]}, '')
    official_name = official_name.gsub(%r{ *\(.*?\)$}i, '')
    official_name = official_name.gsub(%r{^The +}, '')

    # An official name identical to the short name is stored as empty.
    stored_official_name =
      if name == official_name
        ""
      else
        official_name
      end

    countries_regions[m49_code] = make_entry.call(m49_code, cells[3], cells[4], name, stored_official_name)
  end

  # Debugging aid: pp(countries_regions); exit 42

  # --- Step 2: UN M49 regions and areas ------------------------------------
  # NOTE(review): the second argument is presumably a local cache file name;
  # confirm against Mtx::OnlineFile.download.
  m49_content = Mtx::OnlineFile.download("https://unstats.un.org/unsd/methodology/m49/overview/", "m49_list.txt")
  m49_data    = parse_html_extract_table_data(m49_content, %r{^.*?<table[^>]+downloadTableEN[^>]*>}i)

  # The first row holds the column titles; normalize them to snake_case keys.
  # The array position is the column index.
  column_keys = m49_data.shift.map do |title|
    title.downcase.gsub(%r{[^a-z0-9]+}, '_').gsub(%r{^_|_$}, '')
  end

  m49_rows = m49_data.map do |cells|
    cells.each_with_index.map { |text, idx| [ column_keys[idx], text ] }.to_h
  end

  m49_rows.each do |row|
    # Each row names the regions containing it; add those not yet known.
    %w{global region sub_region intermediate_region}.each do |type|
      region_code = row["#{type}_code"].to_i
      region_name = row["#{type}_name"]

      next if region_name.blank? || countries_regions[region_code]

      countries_regions[region_code] = make_entry.call(region_code, "", "", region_name, "")
    end

    # The country/area itself, unless its code has already been recorded.
    code = row["m49_code"].to_i
    countries_regions[code] ||= make_entry.call(code, row["iso_alpha2_code"], row["iso_alpha3_code"], row["country_or_area"], "")
  end

  # --- Step 3: user-assigned alpha-2 codes ---------------------------------
  q_codes       = ('M'..'Z').map { |letter| "Q#{letter}" }
  x_codes       = ('A'..'Z').map { |letter| "X#{letter}" }
  user_assigned = [ 'AA', 'ZZ' ] + q_codes + x_codes

  entries = countries_regions.values + user_assigned.map { |code| make_entry.call(0, code, "", "User-assigned", "") }

  # --- Step 4: render and write the C++ source -----------------------------
  rows = entries.map do |entry|
    [
      entry[:alpha_2_code].upcase.to_cpp_string,
      entry[:alpha_3_code].upcase.to_cpp_string,
      sprintf('%3d', entry[:number]),
      entry[:name].to_u8_cpp_string,
      entry[:official_name].to_u8_cpp_string,
    ]
  end

  rows = rows.sort_by { |row| row.values_at(0, 1, 3).join('::') }

  # The heredoc bodies are emitted verbatim, hence no indentation below.
  header = <<EOT
/*
mkvmerge -- utility for splicing together matroska files
from component media subtypes
Distributed under the GPL v2
see the file COPYING for details
or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
ISO 3166 countries & UN M.49 regions
Written by Moritz Bunkus <moritz@bunkus.org>.
*/
// ------------------------------------------------------------------------
// NOTE: this file is auto-generated by the "dev:iso3166_list" rake target.
// ------------------------------------------------------------------------
#include "common/common_pch.h"
#include "common/iso3166.h"
namespace mtx::iso3166 {
std::vector<region_t> g_regions;
void
init() {
g_regions.reserve(#{rows.size});
EOT

  footer = <<EOT
}
} // namespace mtx::iso3166
EOT

  table_lines = format_table(rows, column_suffix: ',', row_prefix: " g_regions.emplace_back(", row_suffix: ");")
  content     = header + table_lines.join("\n") + "\n" + footer

  cpp_file_name = "src/common/iso3166_country_list.cpp"

  runq("write", cpp_file_name) do
    IO.write("#{$source_dir}/#{cpp_file_name}", content)
    0
  end
end