Skip to content

Instantly share code, notes, and snippets.

@josemarcosrf
Forked from flaviussn/language_to_iso_code.py
Last active November 14, 2019 15:10
Show Gist options
  • Save josemarcosrf/71aafa07440d92199ecc3d1993e9ea9e to your computer and use it in GitHub Desktop.
Save josemarcosrf/71aafa07440d92199ecc3d1993e9ea9e to your computer and use it in GitHub Desktop.
From language name to ISO code
import pycountry
from pprint import pformat
from data.model_languages import bert_languages, xlm_lang_codes
def get_codes(language):
    """Look up the ISO codes pycountry knows for a language name.

    Args:
        language: Language name, passed as ``name=`` to
            ``pycountry.languages.get``.

    Returns:
        A dict with the keys ``"language_name"`` (the input, unchanged),
        ``"alpha_2"`` and ``"alpha_3"``. A code is ``None`` when the lookup
        yields no entry or the entry does not carry that attribute.
    """
    try:
        lang = pycountry.languages.get(name=language)
    except LookupError:
        # NOTE(review): some pycountry releases appear to raise KeyError
        # (a LookupError) for unknown names instead of returning None;
        # treat both outcomes as "not found".
        lang = None

    # getattr with a default covers both ``lang is None`` and entries that
    # lack a two-letter code, without a catch-all ``except``.
    return {
        "language_name": language,
        "alpha_2": getattr(lang, "alpha_2", None),
        "alpha_3": getattr(lang, "alpha_3", None),
    }
if __name__ == "__main__":
    # Transform mBERT languages to lang. ISO codes
    bert_codes = [get_codes(lang) for lang in bert_languages]
    print(pformat(bert_codes))

    # Check intersection between mBERT and XLM.
    # One code per language: the two-letter code when present, otherwise the
    # three-letter one. Names with neither code are collected separately so
    # they do not enter the sets below as a single ``None`` element.
    bert_alpha_2_codes = []
    unresolved = []
    for c in bert_codes:
        code = c.get("alpha_2") or c.get("alpha_3")
        if code:
            bert_alpha_2_codes.append(code)
        else:
            unresolved.append(c.get("language_name"))

    # Build each set once and reuse it for all three comparisons.
    bert_code_set = set(bert_alpha_2_codes)
    xlm_code_set = set(xlm_lang_codes)

    # in Both
    intersection = xlm_code_set & bert_code_set
    print("Present in both:")
    print(intersection)
    print("({})".format(len(intersection)))

    # Only in mBERT
    only_bert = bert_code_set - xlm_code_set
    print("\n\nPresent only in mBERT:")
    print(only_bert)
    print("({})".format(len(only_bert)))

    # Only in XLM
    only_xlm = xlm_code_set - bert_code_set
    print("\n\nPresent only in XLM:")
    print(only_xlm)
    print("({})".format(len(only_xlm)))

    # mBERT language names for which no code could be looked up
    print("\n\nNo ISO code found for:")
    print(unresolved)
    print("({})".format(len(unresolved)))
# Language names for mBERT. The companion script passes each entry to a
# pycountry lookup by name, so the spelling of every entry matters.
bert_languages = [
    "Afrikaans", "Albanian", "Arabic", "Aragonese", "Armenian", "Asturian",
    "Azerbaijani", "Bashkir", "Basque", "Bavarian", "Belarusian", "Bengali",
    "Bishnupriya", "Bosnian", "Breton", "Bulgarian", "Burmese", "Catalan",
    "Cebuano", "Chechen", "Chinese (Simplified)", "Chinese (Traditional)",
    "Chuvash", "Croatian", "Czech", "Danish", "Dutch", "English", "Estonian",
    "Finnish", "French", "Galician", "Georgian", "German",
    "Modern Greek (1453-)", "Gujarati", "Haitian", "Hebrew", "Hindi",
    "Hungarian", "Icelandic", "Ido", "Indonesian", "Irish", "Italian",
    "Japanese", "Javanese", "Kannada", "Kazakh", "Kirghiz", "Korean",
    "Latin", "Latvian", "Lithuanian", "Lombard", "Low German",
    "Luxembourgish", "Macedonian", "Malagasy", "Malay (macrolanguage)",
    "Malayalam", "Marathi", "Minangkabau", "Nepali (macrolanguage)",
    "Newari", "Norwegian Bokmål", "Norwegian Nynorsk",
    "Occitan (post 1500)", "Persian", "Piemontese", "Polish", "Portuguese",
    "Panjabi", "Romanian", "Russian", "Scots", "Serbian", "Serbo-Croatian",
    "Sicilian", "Slovak", "Slovenian", "South Azerbaijani", "Spanish",
    "Sundanese", "Swahili (macrolanguage)", "Swedish", "Tagalog", "Tajik",
    "Tamil", "Tatar", "Telugu", "Turkish", "Ukrainian", "Urdu", "Uzbek",
    "Vietnamese", "Volapük", "Waray (Philippines)", "Welsh",
    "Western Frisian", "Western Panjabi", "Yoruba",
]
# Language codes for XLM, compared against the mBERT codes by the companion
# script. Not every entry is a plain ISO code (e.g. "simple", "zh_yue").
xlm_lang_codes = [
    "en", "es", "fr", "de", "zh", "ru", "pt", "it", "ar", "ja",
    "id", "tr", "nl", "pl", "simple", "fa", "vi", "sv", "ko", "he",
    "ro", "no", "hi", "uk", "cs", "fi", "hu", "th", "da", "ca",
    "el", "bg", "sr", "ms", "bn", "hr", "sl", "zh_yue", "az", "sk",
    "eo", "ta", "sh", "lt", "et", "ml", "la", "bs", "sq", "arz",
    "af", "ka", "mr", "eu", "tl", "ang", "gl", "nn", "ur", "kk",
    "be", "hy", "te", "lv", "mk", "zh_classical", "als", "is", "wuu", "my",
    "sco", "mn", "ceb", "ast", "cy", "kn", "br", "an", "gu", "bar",
    "uz", "lb", "ne", "si", "war", "jv", "ga", "zh_min_nan", "oc", "ku",
    "sw", "nds", "ckb", "ia", "yi", "fy", "scn", "gan", "tt", "am",
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment