avans06's picture
Added support for translation models (NLLB, NLLB-CT2, MT5)
8077be2
raw
history blame
11.9 kB
class NllbLang:
    """One NLLB language entry, optionally paired with a Whisper language.

    Attributes are plain public fields:
        code: NLLB language code, e.g. ``'eng_Latn'``.
        name: Display name for the NLLB language.
        code_whisper: Corresponding Whisper language code, or ``None``.
        name_whisper: Corresponding Whisper language name, or ``None``.
    """

    def __init__(self, code, name, code_whisper=None, name_whisper=None):
        # Store the NLLB identity first, then the optional Whisper pairing.
        self.code, self.name = code, name
        self.code_whisper, self.name_whisper = code_whisper, name_whisper

    def __str__(self):
        """Return a short description built from the NLLB code and name."""
        return f"Language(code={self.code}, name={self.name})"
# Table of NLLB languages. Each entry is
# NllbLang(nllb_code, nllb_name[, whisper_code, whisper_name]); the Whisper
# fields are omitted for languages with no Whisper counterpart listed here.
# Several NLLB entries may share one Whisper code (e.g. the Arabic varieties
# all use 'ar', the Chinese entries all use 'zh').
NLLB_LANGS = [
    NllbLang('ace_Arab', 'Acehnese (Arabic script)'),
    NllbLang('ace_Latn', 'Acehnese (Latin script)'),
    NllbLang('acm_Arab', 'Mesopotamian Arabic', 'ar', 'Arabic'),
    NllbLang('acq_Arab', 'Ta’izzi-Adeni Arabic', 'ar', 'Arabic'),
    NllbLang('aeb_Arab', 'Tunisian Arabic'),
    # Afrikaans is Whisper 'af'; it was previously mis-mapped to 'am' (Amharic).
    NllbLang('afr_Latn', 'Afrikaans', 'af', 'Afrikaans'),
    NllbLang('ajp_Arab', 'South Levantine Arabic', 'ar', 'Arabic'),
    NllbLang('aka_Latn', 'Akan'),
    # Amharic owns Whisper code 'am'.
    NllbLang('amh_Ethi', 'Amharic', 'am', 'Amharic'),
    NllbLang('apc_Arab', 'North Levantine Arabic', 'ar', 'Arabic'),
    NllbLang('arb_Arab', 'Modern Standard Arabic', 'ar', 'Arabic'),
    NllbLang('arb_Latn', 'Modern Standard Arabic (Romanized)'),
    NllbLang('ars_Arab', 'Najdi Arabic', 'ar', 'Arabic'),
    NllbLang('ary_Arab', 'Moroccan Arabic', 'ar', 'Arabic'),
    NllbLang('arz_Arab', 'Egyptian Arabic', 'ar', 'Arabic'),
    NllbLang('asm_Beng', 'Assamese', 'as', 'Assamese'),
    NllbLang('ast_Latn', 'Asturian'),
    NllbLang('awa_Deva', 'Awadhi'),
    NllbLang('ayr_Latn', 'Central Aymara'),
    NllbLang('azb_Arab', 'South Azerbaijani', 'az', 'Azerbaijani'),
    NllbLang('azj_Latn', 'North Azerbaijani', 'az', 'Azerbaijani'),
    NllbLang('bak_Cyrl', 'Bashkir', 'ba', 'Bashkir'),
    NllbLang('bam_Latn', 'Bambara'),
    NllbLang('ban_Latn', 'Balinese'),
    NllbLang('bel_Cyrl', 'Belarusian', 'be', 'Belarusian'),
    NllbLang('bem_Latn', 'Bemba'),
    NllbLang('ben_Beng', 'Bengali', 'bn', 'Bengali'),
    NllbLang('bho_Deva', 'Bhojpuri'),
    NllbLang('bjn_Arab', 'Banjar (Arabic script)'),
    NllbLang('bjn_Latn', 'Banjar (Latin script)'),
    NllbLang('bod_Tibt', 'Standard Tibetan', 'bo', 'Tibetan'),
    NllbLang('bos_Latn', 'Bosnian', 'bs', 'Bosnian'),
    NllbLang('bug_Latn', 'Buginese'),
    NllbLang('bul_Cyrl', 'Bulgarian', 'bg', 'Bulgarian'),
    NllbLang('cat_Latn', 'Catalan', 'ca', 'Catalan'),
    NllbLang('ceb_Latn', 'Cebuano'),
    NllbLang('ces_Latn', 'Czech', 'cs', 'Czech'),
    NllbLang('cjk_Latn', 'Chokwe'),
    NllbLang('ckb_Arab', 'Central Kurdish'),
    NllbLang('crh_Latn', 'Crimean Tatar'),
    NllbLang('cym_Latn', 'Welsh', 'cy', 'Welsh'),
    NllbLang('dan_Latn', 'Danish', 'da', 'Danish'),
    NllbLang('deu_Latn', 'German', 'de', 'German'),
    NllbLang('dik_Latn', 'Southwestern Dinka'),
    NllbLang('dyu_Latn', 'Dyula'),
    NllbLang('dzo_Tibt', 'Dzongkha'),
    NllbLang('ell_Grek', 'Greek', 'el', 'Greek'),
    NllbLang('eng_Latn', 'English', 'en', 'English'),
    NllbLang('epo_Latn', 'Esperanto'),
    NllbLang('est_Latn', 'Estonian', 'et', 'Estonian'),
    NllbLang('eus_Latn', 'Basque', 'eu', 'Basque'),
    NllbLang('ewe_Latn', 'Ewe'),
    NllbLang('fao_Latn', 'Faroese', 'fo', 'Faroese'),
    NllbLang('fij_Latn', 'Fijian'),
    NllbLang('fin_Latn', 'Finnish', 'fi', 'Finnish'),
    NllbLang('fon_Latn', 'Fon'),
    NllbLang('fra_Latn', 'French', 'fr', 'French'),
    NllbLang('fur_Latn', 'Friulian'),
    NllbLang('fuv_Latn', 'Nigerian Fulfulde'),
    NllbLang('gla_Latn', 'Scottish Gaelic'),
    NllbLang('gle_Latn', 'Irish'),
    NllbLang('glg_Latn', 'Galician', 'gl', 'Galician'),
    NllbLang('grn_Latn', 'Guarani'),
    NllbLang('guj_Gujr', 'Gujarati', 'gu', 'Gujarati'),
    NllbLang('hat_Latn', 'Haitian Creole', 'ht', 'Haitian creole'),
    NllbLang('hau_Latn', 'Hausa', 'ha', 'Hausa'),
    NllbLang('heb_Hebr', 'Hebrew', 'he', 'Hebrew'),
    NllbLang('hin_Deva', 'Hindi', 'hi', 'Hindi'),
    NllbLang('hne_Deva', 'Chhattisgarhi'),
    NllbLang('hrv_Latn', 'Croatian', 'hr', 'Croatian'),
    NllbLang('hun_Latn', 'Hungarian', 'hu', 'Hungarian'),
    NllbLang('hye_Armn', 'Armenian', 'hy', 'Armenian'),
    NllbLang('ibo_Latn', 'Igbo'),
    NllbLang('ilo_Latn', 'Ilocano'),
    NllbLang('ind_Latn', 'Indonesian', 'id', 'Indonesian'),
    NllbLang('isl_Latn', 'Icelandic', 'is', 'Icelandic'),
    NllbLang('ita_Latn', 'Italian', 'it', 'Italian'),
    NllbLang('jav_Latn', 'Javanese', 'jw', 'Javanese'),
    NllbLang('jpn_Jpan', 'Japanese', 'ja', 'Japanese'),
    NllbLang('kab_Latn', 'Kabyle'),
    NllbLang('kac_Latn', 'Jingpho'),
    NllbLang('kam_Latn', 'Kamba'),
    NllbLang('kan_Knda', 'Kannada', 'kn', 'Kannada'),
    NllbLang('kas_Arab', 'Kashmiri (Arabic script)'),
    NllbLang('kas_Deva', 'Kashmiri (Devanagari script)'),
    NllbLang('kat_Geor', 'Georgian', 'ka', 'Georgian'),
    NllbLang('knc_Arab', 'Central Kanuri (Arabic script)'),
    NllbLang('knc_Latn', 'Central Kanuri (Latin script)'),
    NllbLang('kaz_Cyrl', 'Kazakh', 'kk', 'Kazakh'),
    NllbLang('kbp_Latn', 'Kabiyè'),
    NllbLang('kea_Latn', 'Kabuverdianu'),
    NllbLang('khm_Khmr', 'Khmer', 'km', 'Khmer'),
    NllbLang('kik_Latn', 'Kikuyu'),
    NllbLang('kin_Latn', 'Kinyarwanda'),
    NllbLang('kir_Cyrl', 'Kyrgyz'),
    NllbLang('kmb_Latn', 'Kimbundu'),
    NllbLang('kmr_Latn', 'Northern Kurdish'),
    NllbLang('kon_Latn', 'Kikongo'),
    NllbLang('kor_Hang', 'Korean', 'ko', 'Korean'),
    NllbLang('lao_Laoo', 'Lao', 'lo', 'Lao'),
    NllbLang('lij_Latn', 'Ligurian'),
    NllbLang('lim_Latn', 'Limburgish'),
    NllbLang('lin_Latn', 'Lingala', 'ln', 'Lingala'),
    NllbLang('lit_Latn', 'Lithuanian', 'lt', 'Lithuanian'),
    NllbLang('lmo_Latn', 'Lombard'),
    NllbLang('ltg_Latn', 'Latgalian'),
    NllbLang('ltz_Latn', 'Luxembourgish', 'lb', 'Luxembourgish'),
    NllbLang('lua_Latn', 'Luba-Kasai'),
    NllbLang('lug_Latn', 'Ganda'),
    NllbLang('luo_Latn', 'Luo'),
    NllbLang('lus_Latn', 'Mizo'),
    NllbLang('lvs_Latn', 'Standard Latvian', 'lv', 'Latvian'),
    NllbLang('mag_Deva', 'Magahi'),
    NllbLang('mai_Deva', 'Maithili'),
    NllbLang('mal_Mlym', 'Malayalam', 'ml', 'Malayalam'),
    NllbLang('mar_Deva', 'Marathi', 'mr', 'Marathi'),
    NllbLang('min_Arab', 'Minangkabau (Arabic script)'),
    NllbLang('min_Latn', 'Minangkabau (Latin script)'),
    NllbLang('mkd_Cyrl', 'Macedonian', 'mk', 'Macedonian'),
    NllbLang('plt_Latn', 'Plateau Malagasy', 'mg', 'Malagasy'),
    NllbLang('mlt_Latn', 'Maltese', 'mt', 'Maltese'),
    NllbLang('mni_Beng', 'Meitei (Bengali script)'),
    NllbLang('khk_Cyrl', 'Halh Mongolian', 'mn', 'Mongolian'),
    NllbLang('mos_Latn', 'Mossi'),
    NllbLang('mri_Latn', 'Maori', 'mi', 'Maori'),
    NllbLang('mya_Mymr', 'Burmese', 'my', 'Myanmar'),
    NllbLang('nld_Latn', 'Dutch', 'nl', 'Dutch'),
    NllbLang('nno_Latn', 'Norwegian Nynorsk', 'nn', 'Nynorsk'),
    NllbLang('nob_Latn', 'Norwegian Bokmål', 'no', 'Norwegian'),
    NllbLang('npi_Deva', 'Nepali', 'ne', 'Nepali'),
    NllbLang('nso_Latn', 'Northern Sotho'),
    NllbLang('nus_Latn', 'Nuer'),
    NllbLang('nya_Latn', 'Nyanja'),
    NllbLang('oci_Latn', 'Occitan', 'oc', 'Occitan'),
    NllbLang('gaz_Latn', 'West Central Oromo'),
    NllbLang('ory_Orya', 'Odia'),
    NllbLang('pag_Latn', 'Pangasinan'),
    NllbLang('pan_Guru', 'Eastern Panjabi', 'pa', 'Punjabi'),
    NllbLang('pap_Latn', 'Papiamento'),
    NllbLang('pes_Arab', 'Western Persian', 'fa', 'Persian'),
    NllbLang('pol_Latn', 'Polish', 'pl', 'Polish'),
    NllbLang('por_Latn', 'Portuguese', 'pt', 'Portuguese'),
    NllbLang('prs_Arab', 'Dari'),
    NllbLang('pbt_Arab', 'Southern Pashto', 'ps', 'Pashto'),
    NllbLang('quy_Latn', 'Ayacucho Quechua'),
    NllbLang('ron_Latn', 'Romanian', 'ro', 'Romanian'),
    NllbLang('run_Latn', 'Rundi'),
    NllbLang('rus_Cyrl', 'Russian', 'ru', 'Russian'),
    NllbLang('sag_Latn', 'Sango'),
    NllbLang('san_Deva', 'Sanskrit', 'sa', 'Sanskrit'),
    NllbLang('sat_Olck', 'Santali'),
    NllbLang('scn_Latn', 'Sicilian'),
    NllbLang('shn_Mymr', 'Shan'),
    NllbLang('sin_Sinh', 'Sinhala', 'si', 'Sinhala'),
    NllbLang('slk_Latn', 'Slovak', 'sk', 'Slovak'),
    NllbLang('slv_Latn', 'Slovenian', 'sl', 'Slovenian'),
    NllbLang('smo_Latn', 'Samoan'),
    NllbLang('sna_Latn', 'Shona', 'sn', 'Shona'),
    NllbLang('snd_Arab', 'Sindhi', 'sd', 'Sindhi'),
    NllbLang('som_Latn', 'Somali', 'so', 'Somali'),
    NllbLang('sot_Latn', 'Southern Sotho'),
    NllbLang('spa_Latn', 'Spanish', 'es', 'Spanish'),
    NllbLang('als_Latn', 'Tosk Albanian', 'sq', 'Albanian'),
    NllbLang('srd_Latn', 'Sardinian'),
    NllbLang('srp_Cyrl', 'Serbian', 'sr', 'Serbian'),
    NllbLang('ssw_Latn', 'Swati'),
    NllbLang('sun_Latn', 'Sundanese', 'su', 'Sundanese'),
    NllbLang('swe_Latn', 'Swedish', 'sv', 'Swedish'),
    NllbLang('swh_Latn', 'Swahili', 'sw', 'Swahili'),
    NllbLang('szl_Latn', 'Silesian'),
    NllbLang('tam_Taml', 'Tamil', 'ta', 'Tamil'),
    NllbLang('tat_Cyrl', 'Tatar', 'tt', 'Tatar'),
    NllbLang('tel_Telu', 'Telugu', 'te', 'Telugu'),
    NllbLang('tgk_Cyrl', 'Tajik', 'tg', 'Tajik'),
    NllbLang('tgl_Latn', 'Tagalog', 'tl', 'Tagalog'),
    NllbLang('tha_Thai', 'Thai', 'th', 'Thai'),
    NllbLang('tir_Ethi', 'Tigrinya'),
    NllbLang('taq_Latn', 'Tamasheq (Latin script)'),
    NllbLang('taq_Tfng', 'Tamasheq (Tifinagh script)'),
    NllbLang('tpi_Latn', 'Tok Pisin'),
    NllbLang('tsn_Latn', 'Tswana'),
    NllbLang('tso_Latn', 'Tsonga'),
    NllbLang('tuk_Latn', 'Turkmen', 'tk', 'Turkmen'),
    NllbLang('tum_Latn', 'Tumbuka'),
    NllbLang('tur_Latn', 'Turkish', 'tr', 'Turkish'),
    NllbLang('twi_Latn', 'Twi'),
    NllbLang('tzm_Tfng', 'Central Atlas Tamazight'),
    NllbLang('uig_Arab', 'Uyghur'),
    NllbLang('ukr_Cyrl', 'Ukrainian', 'uk', 'Ukrainian'),
    NllbLang('umb_Latn', 'Umbundu'),
    NllbLang('urd_Arab', 'Urdu', 'ur', 'Urdu'),
    NllbLang('uzn_Latn', 'Northern Uzbek', 'uz', 'Uzbek'),
    NllbLang('vec_Latn', 'Venetian'),
    NllbLang('vie_Latn', 'Vietnamese', 'vi', 'Vietnamese'),
    NllbLang('war_Latn', 'Waray'),
    NllbLang('wol_Latn', 'Wolof'),
    NllbLang('xho_Latn', 'Xhosa'),
    NllbLang('ydd_Hebr', 'Eastern Yiddish', 'yi', 'Yiddish'),
    NllbLang('yor_Latn', 'Yoruba', 'yo', 'Yoruba'),
    NllbLang('yue_Hant', 'Yue Chinese', 'zh', 'Chinese'),
    NllbLang('zho_Hans', 'Chinese (Simplified)', 'zh', 'Chinese'),
    NllbLang('zho_Hant', 'Chinese (Traditional)', 'zh', 'Chinese'),
    NllbLang('zsm_Latn', 'Standard Malay', 'ms', 'Malay'),
    NllbLang('zul_Latn', 'Zulu'),
]
# Lookup indexes over NLLB_LANGS. Every key is lower-cased. When several
# entries produce the same key (e.g. the Arabic varieties that share Whisper
# code 'ar'), the entry that appears last in NLLB_LANGS is the one kept.

# NLLB code -> entry.
_TO_NLLB_LANG_CODE = {
    entry.code.lower(): entry
    for entry in NLLB_LANGS
    if entry.code is not None
}

# NLLB display name -> entry.
_TO_NLLB_LANG_NAME = {
    entry.name.lower(): entry
    for entry in NLLB_LANGS
    if entry.name is not None
}

# Whisper code -> entry (entries without a Whisper code are skipped).
_TO_NLLB_LANG_WHISPER_CODE = {
    entry.code_whisper.lower(): entry
    for entry in NLLB_LANGS
    if entry.code_whisper is not None
}

# Whisper language name -> entry (entries without a Whisper name are skipped).
_TO_NLLB_LANG_WHISPER_NAME = {
    entry.name_whisper.lower(): entry
    for entry in NLLB_LANGS
    if entry.name_whisper is not None
}
def get_nllb_lang_from_code(lang_code, default=None) -> NllbLang:
    """Look up a language by its NLLB code, ignoring case.

    The ``_TO_NLLB_LANG_CODE`` index is keyed by lower-cased codes, so the
    query is lower-cased as well; without that, canonical mixed-case codes
    such as ``'eng_Latn'`` would never match.

    Args:
        lang_code: NLLB code, e.g. ``'eng_Latn'``.
        default: Value returned when no language matches.

    Returns:
        The matching ``NllbLang``, or ``default`` when ``lang_code`` is
        unknown or is not a string (e.g. ``None``).
    """
    if not isinstance(lang_code, str):
        return default
    return _TO_NLLB_LANG_CODE.get(lang_code.lower(), default)
def get_nllb_lang_from_name(lang_name, default=None) -> NllbLang:
    """Look up a language by its NLLB display name, ignoring case.

    Returns ``default`` when ``lang_name`` is empty/``None`` or unknown.
    """
    if not lang_name:
        # The index has only string keys, so a missing name can never match.
        return default
    key = lang_name.lower()
    return _TO_NLLB_LANG_NAME.get(key, default)
def get_nllb_lang_from_code_whisper(lang_code_whisper, default=None) -> NllbLang:
    """Look up a language by its Whisper language code, ignoring case.

    The ``_TO_NLLB_LANG_WHISPER_CODE`` index is keyed by lower-cased codes,
    so the query is lower-cased to match, consistent with the name lookups.

    Args:
        lang_code_whisper: Whisper code, e.g. ``'en'``.
        default: Value returned when no language matches.

    Returns:
        The matching ``NllbLang``, or ``default`` when the code is unknown
        or is not a string (e.g. ``None``).
    """
    if not isinstance(lang_code_whisper, str):
        return default
    return _TO_NLLB_LANG_WHISPER_CODE.get(lang_code_whisper.lower(), default)
def get_nllb_lang_from_name_whisper(lang_name_whisper, default=None) -> NllbLang:
    """Look up a language by its Whisper language name, ignoring case.

    Returns ``default`` when ``lang_name_whisper`` is empty/``None`` or unknown.
    """
    if not lang_name_whisper:
        # The index has only string keys, so a missing name can never match.
        return default
    key = lang_name_whisper.lower()
    return _TO_NLLB_LANG_WHISPER_NAME.get(key, default)
def get_nllb_lang_names():
    """Return the NLLB display names, in the same order as ``NLLB_LANGS``."""
    names = []
    for entry in NLLB_LANGS:
        names.append(entry.name)
    return names
if __name__ == "__main__":
    # Manual smoke test: one lookup by NLLB code, one by name, then the
    # full list of names.
    by_code = get_nllb_lang_from_code('eng_Latn')
    print(by_code)
    by_name = get_nllb_lang_from_name('English')
    print(by_name)
    all_names = get_nllb_lang_names()
    print(all_names)