# Generates "src/common/iso639_language_list.cpp" (below $source_dir).
#
# The language table is assembled from several sources, in this order:
#
# 1. the ISO 639-2 code list published by the Library of Congress,
# 2. the ISO 639-3 tab-separated table from SIL (replaces entries that
#    share a key with step 1 and remembers whether the key was already
#    known from ISO 639-2),
# 3. the Wikipedia list of ISO 639-5 codes (only adds missing keys),
# 4. the codes "qaa" through "qtz", which are marked as reserved for
#    local use,
# 5. the "language" records of the IANA language subtag registry (adds
#    missing subtags and sets the "deprecated" flag).
#
# Entries are keyed by the alpha-3 code to use (the bibliographic code
# where one exists). The result is written out as a C++ initializer
# table.
def create_iso639_language_list_file
  # Names that replace the downloaded ones, keyed by alpha-3 code.
  name_overrides = {
    "grc" => "Greek (ancient, -1453)",
    "gre" => "Greek (modern, 1453-)",
  }

  content            = Mtx::OnlineFile.download("https://www.loc.gov/standards/iso639-2/php/code_list.php", "iso-639-2.html")
  entries_by_alpha_3 = {}

  # NOTE(review): the "<table[^>" parts of this pattern were not visible
  # in the reviewed copy of this file (tag-like text had been stripped);
  # they are reconstructed -- confirm against version control.
  parse_html_extract_table_data(content, %r{^.*?<table[^>]+>.*?<table[^>]+>}im).
    drop(1).
    each do |row|
    # The first column is either a single code or a pair such as
    # "xxx (B)<br>yyy (T)" naming the bibliographic & terminology codes.
    if %r{^([a-z]{3}) *\(([bt])\)<.*?>([a-z]{3})}.match(row[0].downcase)
      alpha_3_b = $2 == 'b' ? $1 : $3
      alpha_3_t = $2 == 'b' ? $3 : $1
    else
      alpha_3_b = row[0]
      alpha_3_t = row[0]
    end

    entries_by_alpha_3[alpha_3_b] = {
      "name"           => row[2],
      "bibliographic"  => alpha_3_b == alpha_3_t ? nil : alpha_3_b,
      "alpha_2"        => row[1],
      "alpha_3"        => alpha_3_t,
      "alpha_3_to_use" => alpha_3_b,
      "has_639_2"      => true,
    }
  end

  lines = Mtx::OnlineFile.download("https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab").
    split(%r{\n+}).
    map(&:chomp)

  # The first line holds the column names; map column index => name.
  headers = lines.
    shift.
    split(%r{\t}).
    map(&:downcase).
    each_with_index.
    map { |name, index| [ index, name ] }.
    to_h

  lines.
    map do |line|
      # Turn each line into a hash keyed by column name; empty fields
      # become nil. Only fields actually present are visited so that no
      # entry is created for an index past the end of the line.
      line.
        split(%r{\t}).
        each_with_index.
        map { |value, idx| [ headers[idx], value.empty? ? nil : value ] }.
        to_h
    end.
    each do |entry|
      alpha_3_to_use = entry["part2b"] || entry["id"]
      entry_639_2    = entries_by_alpha_3[alpha_3_to_use]

      entries_by_alpha_3[alpha_3_to_use] = {
        "name"           => entry["ref_name"],
        "bibliographic"  => entry["part2b"] && (entry["part2b"] != entry["part2t"]) ? entry["part2b"] : nil,
        "alpha_2"        => entry["part1"],
        "alpha_3"        => entry["part2t"] || entry["id"],
        "alpha_3_to_use" => alpha_3_to_use,
        "has_639_2"      => !!entry_639_2,
      }
    end

  content = Mtx::OnlineFile.download("https://en.wikipedia.org/wiki/List_of_ISO_639-5_codes")

  # NOTE(review): "<table[^>" is reconstructed here as well -- confirm.
  parse_html_extract_table_data(content, %r{^.*?<table[^>]+>}im).
    drop(1).
    each do |row|
    alpha_3 = row[1]

    # Codes already known from the earlier sources are left untouched.
    entries_by_alpha_3[alpha_3] ||= {
      "name"           => row[3].gsub(%r{<[^>]+>}, ''),
      "bibliographic"  => nil,
      "alpha_2"        => nil,
      "alpha_3"        => alpha_3,
      "alpha_3_to_use" => alpha_3,
      "has_639_2"      => false,
    }
  end

  # "qaa" through "qtz"
  ("a".."t").each do |letter1|
    ("a".."z").each do |letter2|
      alpha_3 = "q#{letter1}#{letter2}"

      entries_by_alpha_3[alpha_3] = {
        "name"           => "Reserved for local use: #{alpha_3}",
        "bibliographic"  => nil,
        "alpha_2"        => nil,
        "alpha_3"        => alpha_3,
        "alpha_3_to_use" => alpha_3,
        "has_639_2"      => true,
      }
    end
  end

  # Secondary index sharing the same entry hashes, so flags set through
  # it below are visible via entries_by_alpha_3, too.
  entries_by_alpha_2 = entries_by_alpha_3.
    values.
    map { |entry| [ entry["alpha_2"], entry ] }.
    to_h

  Mtx::IANALanguageSubtagRegistry.
    fetch_registry["language"].
    reject { |entry| %r{\.\.}.match(entry[:subtag]) }. # skip subtags containing ".."
    each do |entry|
    subtag      = entry[:subtag]
    is_alpha_2  = subtag.length == 2
    entries_map = is_alpha_2 ? entries_by_alpha_2 : entries_by_alpha_3

    if !entries_map.key?(subtag)
      entries_map[subtag] = {
        "name"           => entry[:description],
        "bibliographic"  => nil,
        "alpha_2"        => is_alpha_2 ? subtag : nil,
        "alpha_3"        => is_alpha_2 ? nil    : subtag,
        "alpha_3_to_use" => subtag,
        "has_639_2"      => is_alpha_2,
      }
    end

    entries_map[subtag]["deprecated"] = entry.key?(:deprecated)
  end

  # One row of C++ initializer values per entry.
  rows = entries_by_alpha_3.values.map do |entry|
    name = name_overrides[ entry["alpha_3_to_use"] ] || entry["name"]

    [ name.to_u8_c_string,
      entry["alpha_3_to_use"].to_c_string,
      (entry["alpha_2"] || '').to_c_string,
      entry["bibliographic"] ? entry["alpha_3"].to_c_string : '""',
      entry["has_639_2"].to_s,
      (entry["deprecated"] || false).to_s,
    ]
  end

  # NOTE(review): the banner comment at the top of this heredoc (all
  # text up to and including "Written by ...") and the template argument
  # of std::vector were not visible in the reviewed copy of this file.
  # They are reconstructed here -- confirm against version control.
  header = <<EOT
/*
   mkvmerge -- utility for splicing together matroska files
   from component media subtypes

   Distributed under the GPL v2
   see the file COPYING for details
   or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html

   ISO 639 language definitions, lookup functions

   Written by Moritz Bunkus <moritz@bunkus.org>.
*/

// -----------------------------------------------------------------------
// NOTE: this file is auto-generated by the "dev:iso639_list" rake target.
// -----------------------------------------------------------------------

#include "common/iso639_types.h"

using namespace std::string_literals;

namespace mtx::iso639 {

std::vector<language_t> g_languages;

struct language_init_t {
  char const *english_name, *alpha_3_code, *alpha_2_code, *terminology_abbrev;
  bool is_part_of_iso639_2, is_deprecated;
};

static language_init_t const s_languages_init[] = {
EOT

  # NOTE(review): everything in this heredoc before "lang->english_name"
  # was not visible in the reviewed copy of this file and is
  # reconstructed -- confirm against version control.
  footer = <<EOT
};

void
init() {
  g_languages.reserve(#{rows.size});

  for (language_init_t const *lang = s_languages_init, *end = lang + #{rows.size}; lang < end; ++lang)
    g_languages.emplace_back(lang->english_name, lang->alpha_3_code, lang->alpha_2_code, lang->terminology_abbrev, lang->is_part_of_iso639_2, lang->is_deprecated);
}

} // namespace mtx::iso639
EOT

  content       = header + format_table(rows.sort, :column_suffix => ',', :row_prefix => " { ", :row_suffix => " },").join("\n") + "\n" + footer
  cpp_file_name = "src/common/iso639_language_list.cpp"

  runq("write", cpp_file_name) { IO.write("#{$source_dir}/#{cpp_file_name}", content); 0 }
end

# Looks up the ISO 639-1 code for an English language name in the
# "iso-codes" XML data file.
#
# language::    English language name as found in the "name" attribute
#               of an "iso_639_entry" element, or a code of the form
#               "xx" or "xx_YY".
# iso639_file:: path of the XML file to search.
#
# Returns the entry's "iso_639_1_code" attribute if an entry with that
# name exists and the attribute is not blank. Otherwise +language+
# itself is returned if it has the form "xx" or "xx_YY". Fails with a
# RuntimeError in all other cases.
def look_up_iso_639_1(language, iso639_file = "/usr/share/xml/iso-codes/iso_639.xml")
  require 'rexml/document'

  # The block form closes the file again once the document is parsed.
  document = File.open(iso639_file) { |file| REXML::Document.new(file) }

  # Compare the attribute in Ruby instead of interpolating the name into
  # the XPath expression: names containing quotes (e.g. "N'Ko") would
  # otherwise produce a broken or altered query.
  node   = REXML::XPath.match(document, "//iso_639_entry").find { |entry| entry.attributes['name'] == language }
  locale = node ? node.attributes['iso_639_1_code'] : nil

  return locale unless locale.blank?

  # \A and \z anchor the whole string; ^ and $ would accept any
  # multi-line string containing one line that looks like a code.
  return language if /\A [a-z]{2} (?: _ [A-Z]{2} )? \z/x.match(language)

  fail "Unknown language/ISO-639-1 code not found in #{iso639_file}"
end