2020-06-02 17:33:10 +00:00
|
|
|
# Regenerates "src/common/iso639_language_list.cpp" (relative to $source_dir).
#
# Three online sources are merged into one table keyed by three-letter code:
#
# 1. the ISO 639-2 code list published by the Library of Congress,
# 2. the ISO 639-3 tab-separated code table published by SIL; an entry from
#    this table replaces an ISO 639-2 entry stored under the same key, and
# 3. the ISO 639-5 list on Wikipedia; only codes not present yet are added.
#
# Four "qaa".."qad" rows ("Reserved for local use") are appended, the rows
# are sorted, and each row becomes one `g_languages.emplace_back(...)` line
# between a fixed C++ header and footer.
#
# Takes no arguments. The return value is whatever `runq` returns.
def create_iso639_language_list_file
  content = Mtx::OnlineFile.download("https://www.loc.gov/standards/iso639-2/php/code_list.php", "iso-639-2.html")

  entries_by_alpha_3 = {}

  # --- ISO 639-2 -----------------------------------------------------------
  # The regex skips everything up to and including the second <table ...> tag.
  # drop(1) discards the first extracted row (presumably the table's heading
  # row -- confirm against parse_html_extract_table_data).
  parse_html_extract_table_data(content, %r{^.*?<table[^>]+>.*?<table[^>]+>}im).
    drop(1).
    each do |row|
      # Some cells carry two codes, the first one tagged "(B)" or "(T)",
      # followed by markup and the second code. The variable names suggest
      # B = bibliographic and T = terminology.
      match = %r{^([a-z]{3}) *\(([bt])\)<.*?>([a-z]{3})}.match(row[0].downcase)

      if match
        first_code, kind, second_code = match.captures

        alpha_3_b = kind == 'b' ? first_code  : second_code
        alpha_3_t = kind == 'b' ? second_code : first_code
      else
        alpha_3_b = row[0]
        alpha_3_t = row[0]
      end

      entries_by_alpha_3[alpha_3_b] = {
        "name"           => row[2],
        "bibliographic"  => alpha_3_b == alpha_3_t ? nil : alpha_3_b,
        "alpha_2"        => row[1],
        "alpha_3"        => alpha_3_t,
        "alpha_3_to_use" => alpha_3_b,
        "has_639_2"      => true,
      }
    end

  # --- ISO 639-3 -----------------------------------------------------------
  lines = Mtx::OnlineFile.download("https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab").
    split(%r{\n+}).
    map(&:chomp)

  # The first line holds the column names; they are lower-cased so that they
  # can be used as hash keys below ("id", "part2b", "ref_name", ...).
  headers = lines.shift.split(%r{\t}).map(&:downcase)

  lines.each do |line|
    parts = line.split(%r{\t})

    # One key per header column. Missing or empty fields become nil. The
    # previous implementation walked the inclusive range 0..parts.size and
    # therefore visited one index past the last field, which could add a
    # stray nil key to the hash.
    entry = Hash[
      headers.zip(parts).map do |column, value|
        [ column, (value.nil? || value.empty?) ? nil : value ]
      end
    ]

    # Constructed, Living & Special
    next unless %r{^[CLS]$}.match(entry["language_type"])

    alpha_3_to_use = entry["part2b"] || entry["id"]

    # Must be evaluated before the assignment below replaces the entry.
    known_from_639_2 = entries_by_alpha_3.key?(alpha_3_to_use)

    entries_by_alpha_3[alpha_3_to_use] = {
      "name"           => entry["ref_name"],
      "bibliographic"  => entry["part2b"] && (entry["part2b"] != entry["part2t"]) ? entry["part2b"] : nil,
      "alpha_2"        => entry["part1"],
      "alpha_3"        => entry["part2t"] || entry["id"],
      "alpha_3_to_use" => alpha_3_to_use,
      "has_639_2"      => known_from_639_2,
    }
  end

  # --- ISO 639-5 -----------------------------------------------------------
  content = Mtx::OnlineFile.download("https://en.wikipedia.org/wiki/List_of_ISO_639-5_codes")

  # Only the first <table ...> is skipped to here; again the first extracted
  # row is dropped.
  parse_html_extract_table_data(content, %r{^.*?<table[^>]+>}im).
    drop(1).
    each do |row|
      alpha_3 = row[1]

      # ||= keeps entries already provided by the two sources above.
      entries_by_alpha_3[alpha_3] ||= {
        "name"           => row[3].gsub(%r{<[^>]+>}, ''), # strip markup from the name cell
        "bibliographic"  => nil,
        "alpha_2"        => nil,
        "alpha_3"        => alpha_3,
        "alpha_3_to_use" => alpha_3,
        "has_639_2"      => false,
      }
    end

  # --- Table rows ----------------------------------------------------------
  # Column order: name, code to use, two-letter code, second three-letter
  # code (only if a distinct bibliographic code exists), ISO 639-2 flag.
  rows = entries_by_alpha_3.values.map do |entry|
    [ entry["name"].to_u8_cpp_string,
      entry["alpha_3_to_use"].to_cpp_string,
      (entry["alpha_2"] || '').to_cpp_string,
      entry["bibliographic"] ? entry["alpha_3"].to_cpp_string : '""s',
      entry["has_639_2"].to_s,
    ]
  end

  # The trailing blank in 'true ' is kept verbatim from the original code.
  rows += ("a".."d").map do |letter|
    [ %Q{u8"Reserved for local use: qa#{letter}"s},
      %Q{u8"qa#{letter}"s},
      '""s',
      '""s',
      'true ',
    ]
  end

  # NOTE(review): the "from component media subtypes" line in the banner
  # below looks like it was copied from another generator -- confirm before
  # changing, as the text ends up in the generated file.
  header = <<EOT
/*
mkvmerge -- utility for splicing together matroska files
from component media subtypes

Distributed under the GPL v2
see the file COPYING for details
or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html

ISO 639 language definitions, lookup functions

Written by Moritz Bunkus <moritz@bunkus.org>.
*/

// -----------------------------------------------------------------------
// NOTE: this file is auto-generated by the "dev:iso639_list" rake target.
// -----------------------------------------------------------------------

#include "common/iso639_types.h"

using namespace std::string_literals;

namespace mtx::iso639 {

std::vector<language_t> g_languages;

void
init() {
g_languages.reserve(#{rows.size});

EOT

  footer = <<EOT
}

} // namespace mtx::iso639
EOT

  content       = header + format_table(rows.sort, :column_suffix => ',', :row_prefix => "  g_languages.emplace_back(", :row_suffix => ");").join("\n") + "\n" + footer
  cpp_file_name = "src/common/iso639_language_list.cpp"

  # The block returns 0 after writing.
  runq("write", cpp_file_name) { IO.write("#{$source_dir}/#{cpp_file_name}", content); 0 }
end
|