mkvtoolnix/rake.d/iso3166.rb
Moritz Bunkus 39529c226b
languages/scripts/regions/IANA lists: use different method of initialization
The prior method was to generate one line of
`g_container.emplace_back(…)` per entry in the list & letting the
compiler chew on that. Each string argument in that call was written as
`u8"Some Name"s`, i.e. as a `std::string` instance.

Drawbacks:

• takes the compiler ages to compile, even forcing me to drop all
  optimizations for the ISO-639 language list file

• even smaller files such as the IANA language subtag registry lists
  take more than 30s to compile

• because optimizations are disabled, initialization is actually not as
  fast as it could be

The new method uses a plain C-style array of structs with `char
const *` entries for the initial list. The initialization method then
copies the entries from that list to the actual container, again using
`std::emplace_back(…)`.

This yields sub-1s compilation times even with the longest file, the
ISO-639 language list, and the runtime initialization is actually
faster.
2022-04-23 00:00:15 +02:00

218 lines
5.9 KiB
Ruby
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Generates "src/common/iso3166_country_list.cpp", the C++ source holding the
# table of ISO 3166 countries and UN M.49 regions.
#
# Entries are collected into a single hash, in this order:
#   1. the Wikipedia "List of ISO 3166 country codes" table,
#   2. the UN M.49 overview table,
#   3. the user-assigned alpha-2 codes,
#   4. the "region" records of the IANA language subtag registry.
#
# The hash is indexed by alpha-2 code (String) and/or numeric code (Integer).
# One entry may be stored under both keys; duplicates are removed before the
# rows are rendered. Each entry is a hash with the keys :number,
# :alpha_2_code, :alpha_3_code, :name, :official_name and, for entries seen in
# the IANA registry, :deprecated.
def create_iso3166_country_list_file
  countries_regions = {}

  iso3166_add_wikipedia_countries(countries_regions)
  iso3166_add_m49_entries(countries_regions)
  iso3166_add_user_assigned_codes(countries_regions)
  iso3166_add_iana_regions(countries_regions)

  rows          = iso3166_build_rows(countries_regions)
  content       = iso3166_render_cpp_source(rows)
  cpp_file_name = "src/common/iso3166_country_list.cpp"

  runq("write", cpp_file_name) { IO.write("#{$source_dir}/#{cpp_file_name}", content); 0 }
end

# Reduces the content of one HTML table cell to plain, single-spaced text.
#
# Steps, in order: drop <style> blocks, turn linked images into ", "
# separators, drop remaining images, drop a lone "_" between two tags, strip
# all remaining tags, remove non-breaking spaces, remove a leading ", ",
# collapse whitespace runs into one space and trim both ends.
#
# NOTE(review): the non-breaking space used to be an invisible literal
# character inside the regex; it is written as \u00A0 here on the assumption
# that the literal was U+00A0 -- confirm against the upstream file.
def iso3166_clean_table_cell(column)
  column.
    gsub(%r{<style>.*?</style>}, '').
    gsub(%r{<a><img></a>}, ', ').
    gsub(%r{<img>}, '').
    gsub(%r{>_<}, '><').
    gsub(%r{<[^>]+>}, '').
    gsub(%r{\u00A0}, '').
    gsub(%r{^, +}, '').
    gsub(%r{[[:space:]]+}, ' ').
    gsub(%r{^ +| +$}, '')
end

# Downloads the Wikipedia list of ISO 3166 country codes and stores one entry
# per table row in countries_regions, under both the alpha-2 code and the
# numeric code. Existing entries with the same keys are overwritten.
def iso3166_add_wikipedia_countries(countries_regions)
  iso3166_content = Mtx::OnlineFile.download("https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes")

  # The first two rows are dropped unconditionally; rows with fewer than
  # eight columns are ignored as well.
  parse_html_extract_table_data(iso3166_content, %r{^.*?<table[^>]*>}i).
    drop(2).
    reject { |row| row.length < 8 }.
    each do |row|
      cells = row.map { |column| iso3166_clean_table_cell(column) }

      # Example of a cleaned row:
      # 0 ["Albania",
      # 1  "The Republic of Albania",
      # 2  "UN member state ",
      # 3  "AL",
      # 4  "ALB",
      # 5  "008",
      # 6  "ISO 3166-2:AL",
      # 7  ".al"],

      m49_code = cells[5].to_i

      # Remove bracketed annotations and a trailing "(the)"; if the name
      # contains at least two commas the last one becomes " and".
      name = cells[0].
        gsub(%r{ *\[.*?\]}, '').
        gsub(%r{ *\(the?\)$}i, '').
        gsub(%r{(,.+?),([^,]+)$}, '\1 and\2')

      # Remove bracketed annotations, a trailing parenthesised remark and a
      # leading "The".
      official_name = cells[1].
        gsub(%r{ *\[.*?\]}, '').
        gsub(%r{ *\(.*?\)$}i, '').
        gsub(%r{^The +}, '')

      entry = {
        :number        => m49_code,
        :alpha_2_code  => cells[3],
        :alpha_3_code  => cells[4],
        :name          => name,
        # An official name identical to the short name is not stored.
        :official_name => name == official_name ? "" : official_name,
      }

      countries_regions[cells[3]] = entry
      countries_regions[m49_code] = entry
    end
end

# Downloads the UN M.49 overview table and adds the regions it mentions as
# well as those countries/areas that are not present in countries_regions yet.
# Entries that already exist are left untouched.
def iso3166_add_m49_entries(countries_regions)
  m49_content = Mtx::OnlineFile.download("https://unstats.un.org/unsd/methodology/m49/overview/", "m49_list.txt")
  m49_data    = parse_html_extract_table_data(m49_content, %r{^.*?<table[^>]+downloadTableEN[^>]*>}i)

  # The first row holds the column titles. They are normalised into
  # identifiers such as "sub_region_code" and mapped by column index. Note
  # that `shift` also removes that row from m49_data.
  headers = m49_data.
    shift.
    each_with_index.
    map { |text, idx| [ idx, text.downcase.gsub(%r{[^a-z0-9]+}, '_').gsub(%r{^_|_$}, '') ] }.
    to_h

  # Adds the region named by the "<type>_code"/"<type>_name" columns of `row`
  # unless the name is blank or the numeric code is already known.
  maybe_add = lambda do |row, type|
    code = row["#{type}_code"].to_i
    name = row["#{type}_name"]

    return if name.blank? || countries_regions[code]

    countries_regions[code] = {
      :number        => code,
      :alpha_2_code  => "",
      :alpha_3_code  => "",
      :name          => name,
      :official_name => "",
    }
  end

  m49_data.
    map { |row| row.each_with_index.map { |text, idx| [ headers[idx], text ] }.to_h }.
    each do |row|
      %w{global region sub_region intermediate_region}.each { |type| maybe_add.call(row, type) }

      code  = row["m49_code"].to_i
      entry = {
        :number        => code,
        :alpha_2_code  => row["iso_alpha2_code"],
        :alpha_3_code  => row["iso_alpha3_code"],
        :name          => row["country_or_area"],
        :official_name => "",
      }

      # Only fill slots that are still empty so that earlier data wins.
      countries_regions[code]                   ||= entry
      countries_regions[row["iso_alpha2_code"]] ||= entry
    end
end

# Stores a "User-assigned" entry for each of the alpha-2 codes AA, ZZ, QM-QZ
# and XA-XZ, overwriting any entry already stored under such a code.
def iso3166_add_user_assigned_codes(countries_regions)
  user_assigned = [ 'AA', 'ZZ' ] +
    ('M'..'Z').map { |letter| "Q#{letter}" } +
    ('A'..'Z').map { |letter| "X#{letter}" }

  user_assigned.each do |code|
    countries_regions[code] = {
      :number        => 0,
      :alpha_2_code  => code,
      :alpha_3_code  => "",
      :name          => "User-assigned",
      :official_name => "",
    }
  end
end

# Merges the "region" records of the IANA language subtag registry into
# countries_regions: unknown subtags get a new entry, and every matching
# entry (new or existing) has its :deprecated flag set from the record.
# Records whose subtag contains ".." are skipped.
def iso3166_add_iana_regions(countries_regions)
  Mtx::IANALanguageSubtagRegistry.
    fetch_registry["region"].
    reject { |entry| %r{\.\.}.match(entry[:subtag]) }.
    each do |entry|
      # All-digit subtags are indexed by their integer value, all other
      # subtags by the subtag string itself.
      if %r{^[0-9]+$}.match(entry[:subtag])
        number = entry[:subtag].gsub(%r{^0+}, '').to_i
        code   = ""
        idx    = number
      else
        number = 0
        code   = entry[:subtag]
        idx    = code
      end

      if !countries_regions.key?(idx)
        countries_regions[idx] = {
          :number        => number,
          :alpha_2_code  => code,
          :alpha_3_code  => "",
          :name          => entry[:description],
          :official_name => "",
        }
      end

      countries_regions[idx][:deprecated] = entry.key?(:deprecated)
    end
end

# Converts the collected entries into table rows, one array of six strings
# per unique entry: alpha-2 code, alpha-3 code, number, name, official name
# and the deprecation flag ("true"/"false").
def iso3166_build_rows(countries_regions)
  rows = countries_regions.
    values.
    uniq.
    sort_by { |entry| [ entry[:alpha_2_code], entry[:alpha_3_code], entry[:number] ] }.
    map do |entry|
      [ entry[:alpha_2_code].upcase.to_c_string,
        entry[:alpha_3_code].upcase.to_c_string,
        sprintf('%3d', entry[:number]),
        entry[:name].to_u8_c_string,
        entry[:official_name].to_u8_c_string,
        (entry[:deprecated] || false).to_s,
      ]
    end

  # The final order is determined by the rendered alpha-2 code, alpha-3 code
  # and name.
  rows.sort_by { |row| [ row[0], row[1], row[3] ].join('::') }
end

# Returns the complete text of the generated C++ file: a fixed header, one
# formatted initializer line per row and a footer with the init() function.
# The row count is interpolated into the footer.
def iso3166_render_cpp_source(rows)
  header = <<EOT
/*
mkvmerge -- utility for splicing together matroska files
from component media subtypes
Distributed under the GPL v2
see the file COPYING for details
or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
ISO 3166 countries & UN M.49 regions
Written by Moritz Bunkus <moritz@bunkus.org>.
*/
// ------------------------------------------------------------------------
// NOTE: this file is auto-generated by the "dev:iso3166_list" rake target.
// ------------------------------------------------------------------------
#include "common/common_pch.h"
#include "common/iso3166.h"
namespace mtx::iso3166 {
std::vector<region_t> g_regions;
struct region_init_t {
char const *alpha_2_code, *alpha_3_code;
unsigned int number;
char const *name, *official_name;
bool is_deprecated;
};
static region_init_t const s_regions_init[] = {
EOT

  footer = <<EOT
};
void
init() {
g_regions.reserve(#{rows.size});
for (region_init_t const *region = s_regions_init, *end = region + #{rows.size}; region < end; ++region)
g_regions.emplace_back(region->alpha_2_code, region->alpha_3_code, region->number, region->name, region->official_name, region->is_deprecated);
}
} // namespace mtx::iso3166
EOT

  header + format_table(rows, :column_suffix => ',', :row_prefix => " { ", :row_suffix => " },").join("\n") + "\n" + footer
end