diff --git a/rake.d/iso639.rb b/rake.d/iso639.rb index ab6b91891..799233271 100644 --- a/rake.d/iso639.rb +++ b/rake.d/iso639.rb @@ -1,31 +1,47 @@ def create_iso639_language_list_file - cpp_file_name = "src/common/iso639_language_list.cpp" - iso639_2 = JSON.parse(IO.readlines("/usr/share/iso-codes/json/iso_639-2.json").join('')) \ - ["639-2"]. - reject { |entry| %r{^qaa}.match(entry["alpha_3"]) }. - map do |entry| - entry["has_639_2"] = true - entry["alpha_3_to_use"] = entry["bibliographic"] || entry["alpha_3"] - entry - end - - used_codes = Hash[ *iso639_2.map { |entry| [ entry["alpha_3"], true, entry["bibliographic"], true ] }.flatten ] - - JSON.parse(IO.readlines("/usr/share/iso-codes/json/iso_639-3.json").join('')) \ - ["639-3"]. - reject { |entry| entry["type"] != "L" }. - reject { |entry| used_codes.include?(entry["alpha_3"]) }. - each do |entry| - iso639_2 << { - "name" => entry["name"], - "alpha_3" => entry["alpha_3"], - "alpha_3_to_use" => entry["alpha_3"], - "has_639_2" => false, - } - end - - rows = iso639_2. + list_file = "iso-639-3" + list_downloaded = false + + if !FileTest.exists?(list_file) + url = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab" + runq "wget", url, "wget --quiet -O #{list_file} #{url}" + + list_downloaded = true + end + + lines = IO.readlines(list_file).map(&:chomp) + headers = Hash[ * + lines. + shift. + split(%r{\t}). + map(&:downcase). + each_with_index. + map { |name, index| [ index, name ] }. + flatten + ] + + rows = lines. + map do |line| + parts = line.split(%r{\t}) + entry = Hash[ * + (0..parts.size). + map { |idx| [ headers[idx], !parts[idx] || parts[idx].empty? ? nil : parts[idx] ] }. + flatten + ] + + entry + end. + reject { |entry| !%r{^[CLS]$}.match(entry["language_type"]) }. # Constructed, Living & Special map do |entry| + { + "name" => entry["ref_name"], + "bibliographic" => entry["part2b"] && (entry["part2b"] != entry["part2t"]) ? entry["part2b"] : nil, + "alpha_2" => entry["part1"], + "alpha_3" => entry["part2t"] || entry["id"], + "alpha_3_to_use" => entry["part2b"] || entry["id"], + "has_639_2" => !entry["part2b"].nil?, + } + end.map do |entry| [ entry["name"].to_u8_cpp_string, entry["alpha_3_to_use"].to_cpp_string, (entry["alpha_2"] || '').to_cpp_string, @@ -81,7 +97,11 @@ EOT } // namespace mtx::iso639 EOT - content = header + format_table(rows.sort, :column_suffix => ',', :row_prefix => " g_languages.emplace_back(", :row_suffix => ");").join("\n") + "\n" + footer + content = header + format_table(rows.sort, :column_suffix => ',', :row_prefix => " g_languages.emplace_back(", :row_suffix => ");").join("\n") + "\n" + footer + cpp_file_name = "src/common/iso639_language_list.cpp" runq("write", cpp_file_name) { IO.write("#{$source_dir}/#{cpp_file_name}", content); 0 } + +ensure + File.unlink(list_file) if list_downloaded && FileTest.exists?(list_file) end