OcTra / configurations /get_constants.py
arcan3's picture
adde revision
fc5ed00
import os
class constantConfig:
    """Bundle of constants (lookup tables, file names, folders) set at construction.

    Creating an instance populates the attributes listed below and, as a side
    effect, makes sure the ``language_directory`` and ``temp_audio_folder``
    directories exist relative to the current working directory.

    Attributes:
        flores_codes: language display name -> code of the form
            ``<3-letter language>_<4-letter script>`` (e.g. ``'eng_Latn'``).
        model_name_dict: model size label -> model identifier string.
        whisper_codes_to_flores_codes: two-letter code -> entry of
            ``flores_codes`` (only ``de``, ``en``, ``pl`` and ``hi``).
        flores_codes_to_tts_codes: language display name (without any script
            qualifier) -> three-letter code.
        language_directory, uroman_directory, temp_audio_folder: folder names.
        language_download_web: base URL string.
        language_vocab_text, language_vocab_configuration,
        language_vocab_model: file names.
        text2speech_wavfile, enhanced_speech_file, input_speech_file: file
            paths located inside ``temp_audio_folder``.
    """

    def __init__(self):
        """Fill in every constant and create the working directories."""
        # Language display name -> "<language>_<script>" code.
        # Languages listed with several scripts get one entry per script.
        self.flores_codes = {
            'Acehnese (Arabic script)': 'ace_Arab',
            'Acehnese (Latin script)': 'ace_Latn',
            'Mesopotamian Arabic': 'acm_Arab',
            'Ta’izzi-Adeni Arabic': 'acq_Arab',
            'Tunisian Arabic': 'aeb_Arab',
            'Afrikaans': 'afr_Latn',
            'South Levantine Arabic': 'ajp_Arab',
            'Akan': 'aka_Latn',
            'Amharic': 'amh_Ethi',
            'North Levantine Arabic': 'apc_Arab',
            'Modern Standard Arabic': 'arb_Arab',
            'Modern Standard Arabic (Romanized)': 'arb_Latn',
            'Najdi Arabic': 'ars_Arab',
            'Moroccan Arabic': 'ary_Arab',
            'Egyptian Arabic': 'arz_Arab',
            'Assamese': 'asm_Beng',
            'Asturian': 'ast_Latn',
            'Awadhi': 'awa_Deva',
            'Central Aymara': 'ayr_Latn',
            'South Azerbaijani': 'azb_Arab',
            'North Azerbaijani': 'azj_Latn',
            'Bashkir': 'bak_Cyrl',
            'Bambara': 'bam_Latn',
            'Balinese': 'ban_Latn',
            'Belarusian': 'bel_Cyrl',
            'Bemba': 'bem_Latn',
            'Bengali': 'ben_Beng',
            'Bhojpuri': 'bho_Deva',
            'Banjar (Arabic script)': 'bjn_Arab',
            'Banjar (Latin script)': 'bjn_Latn',
            'Standard Tibetan': 'bod_Tibt',
            'Bosnian': 'bos_Latn',
            'Buginese': 'bug_Latn',
            'Bulgarian': 'bul_Cyrl',
            'Catalan': 'cat_Latn',
            'Cebuano': 'ceb_Latn',
            'Czech': 'ces_Latn',
            'Chokwe': 'cjk_Latn',
            'Central Kurdish': 'ckb_Arab',
            'Crimean Tatar': 'crh_Latn',
            'Welsh': 'cym_Latn',
            'Danish': 'dan_Latn',
            'German': 'deu_Latn',
            'Southwestern Dinka': 'dik_Latn',
            'Dyula': 'dyu_Latn',
            'Dzongkha': 'dzo_Tibt',
            'Greek': 'ell_Grek',
            'English': 'eng_Latn',
            'Esperanto': 'epo_Latn',
            'Estonian': 'est_Latn',
            'Basque': 'eus_Latn',
            'Ewe': 'ewe_Latn',
            'Faroese': 'fao_Latn',
            'Fijian': 'fij_Latn',
            'Finnish': 'fin_Latn',
            'Fon': 'fon_Latn',
            'French': 'fra_Latn',
            'Friulian': 'fur_Latn',
            'Nigerian Fulfulde': 'fuv_Latn',
            'Scottish Gaelic': 'gla_Latn',
            'Irish': 'gle_Latn',
            'Galician': 'glg_Latn',
            'Guarani': 'grn_Latn',
            'Gujarati': 'guj_Gujr',
            'Haitian Creole': 'hat_Latn',
            'Hausa': 'hau_Latn',
            'Hebrew': 'heb_Hebr',
            'Hindi': 'hin_Deva',
            'Chhattisgarhi': 'hne_Deva',
            'Croatian': 'hrv_Latn',
            'Hungarian': 'hun_Latn',
            'Armenian': 'hye_Armn',
            'Igbo': 'ibo_Latn',
            'Ilocano': 'ilo_Latn',
            'Indonesian': 'ind_Latn',
            'Icelandic': 'isl_Latn',
            'Italian': 'ita_Latn',
            'Javanese': 'jav_Latn',
            'Japanese': 'jpn_Jpan',
            'Kabyle': 'kab_Latn',
            'Jingpho': 'kac_Latn',
            'Kamba': 'kam_Latn',
            'Kannada': 'kan_Knda',
            'Kashmiri (Arabic script)': 'kas_Arab',
            'Kashmiri (Devanagari script)': 'kas_Deva',
            'Georgian': 'kat_Geor',
            'Central Kanuri (Arabic script)': 'knc_Arab',
            'Central Kanuri (Latin script)': 'knc_Latn',
            'Kazakh': 'kaz_Cyrl',
            'Kabiyè': 'kbp_Latn',
            'Kabuverdianu': 'kea_Latn',
            'Khmer': 'khm_Khmr',
            'Kikuyu': 'kik_Latn',
            'Kinyarwanda': 'kin_Latn',
            'Kyrgyz': 'kir_Cyrl',
            'Kimbundu': 'kmb_Latn',
            'Northern Kurdish': 'kmr_Latn',
            'Kikongo': 'kon_Latn',
            'Korean': 'kor_Hang',
            'Lao': 'lao_Laoo',
            'Ligurian': 'lij_Latn',
            'Limburgish': 'lim_Latn',
            'Lingala': 'lin_Latn',
            'Lithuanian': 'lit_Latn',
            'Lombard': 'lmo_Latn',
            'Latgalian': 'ltg_Latn',
            'Luxembourgish': 'ltz_Latn',
            'Luba-Kasai': 'lua_Latn',
            'Ganda': 'lug_Latn',
            'Luo': 'luo_Latn',
            'Mizo': 'lus_Latn',
            'Standard Latvian': 'lvs_Latn',
            'Magahi': 'mag_Deva',
            'Maithili': 'mai_Deva',
            'Malayalam': 'mal_Mlym',
            'Marathi': 'mar_Deva',
            'Minangkabau (Arabic script)': 'min_Arab',
            'Minangkabau (Latin script)': 'min_Latn',
            'Macedonian': 'mkd_Cyrl',
            'Plateau Malagasy': 'plt_Latn',
            'Maltese': 'mlt_Latn',
            'Meitei (Bengali script)': 'mni_Beng',
            'Halh Mongolian': 'khk_Cyrl',
            'Mossi': 'mos_Latn',
            'Maori': 'mri_Latn',
            'Burmese': 'mya_Mymr',
            'Dutch': 'nld_Latn',
            'Norwegian Nynorsk': 'nno_Latn',
            'Norwegian Bokmål': 'nob_Latn',
            'Nepali': 'npi_Deva',
            'Northern Sotho': 'nso_Latn',
            'Nuer': 'nus_Latn',
            'Nyanja': 'nya_Latn',
            'Occitan': 'oci_Latn',
            'West Central Oromo': 'gaz_Latn',
            'Odia': 'ory_Orya',
            'Pangasinan': 'pag_Latn',
            'Eastern Panjabi': 'pan_Guru',
            'Papiamento': 'pap_Latn',
            'Western Persian': 'pes_Arab',
            'Polish': 'pol_Latn',
            'Portuguese': 'por_Latn',
            'Dari': 'prs_Arab',
            'Southern Pashto': 'pbt_Arab',
            'Ayacucho Quechua': 'quy_Latn',
            'Romanian': 'ron_Latn',
            'Rundi': 'run_Latn',
            'Russian': 'rus_Cyrl',
            'Sango': 'sag_Latn',
            'Sanskrit': 'san_Deva',
            'Santali': 'sat_Olck',
            'Sicilian': 'scn_Latn',
            'Shan': 'shn_Mymr',
            'Sinhala': 'sin_Sinh',
            'Slovak': 'slk_Latn',
            'Slovenian': 'slv_Latn',
            'Samoan': 'smo_Latn',
            'Shona': 'sna_Latn',
            'Sindhi': 'snd_Arab',
            'Somali': 'som_Latn',
            'Southern Sotho': 'sot_Latn',
            'Spanish': 'spa_Latn',
            'Tosk Albanian': 'als_Latn',
            'Sardinian': 'srd_Latn',
            'Serbian': 'srp_Cyrl',
            'Swati': 'ssw_Latn',
            'Sundanese': 'sun_Latn',
            'Swedish': 'swe_Latn',
            'Swahili': 'swh_Latn',
            'Silesian': 'szl_Latn',
            'Tamil': 'tam_Taml',
            'Tatar': 'tat_Cyrl',
            'Telugu': 'tel_Telu',
            'Tajik': 'tgk_Cyrl',
            'Tagalog': 'tgl_Latn',
            'Thai': 'tha_Thai',
            'Tigrinya': 'tir_Ethi',
            'Tamasheq (Latin script)': 'taq_Latn',
            'Tamasheq (Tifinagh script)': 'taq_Tfng',
            'Tok Pisin': 'tpi_Latn',
            'Tswana': 'tsn_Latn',
            'Tsonga': 'tso_Latn',
            'Turkmen': 'tuk_Latn',
            'Tumbuka': 'tum_Latn',
            'Turkish': 'tur_Latn',
            'Twi': 'twi_Latn',
            'Central Atlas Tamazight': 'tzm_Tfng',
            'Uyghur': 'uig_Arab',
            'Ukrainian': 'ukr_Cyrl',
            'Umbundu': 'umb_Latn',
            'Urdu': 'urd_Arab',
            'Northern Uzbek': 'uzn_Latn',
            'Venetian': 'vec_Latn',
            'Vietnamese': 'vie_Latn',
            'Waray': 'war_Latn',
            'Wolof': 'wol_Latn',
            'Xhosa': 'xho_Latn',
            'Eastern Yiddish': 'ydd_Hebr',
            'Yoruba': 'yor_Latn',
            'Yue Chinese': 'yue_Hant',
            'Chinese (Simplified)': 'zho_Hans',
            'Chinese (Traditional)': 'zho_Hant',
            'Standard Malay': 'zsm_Latn',
            'Zulu': 'zul_Latn',
        }

        # Size label -> model identifier string.
        # NOTE(review): these look like model-hub ids for NLLB-200 checkpoints;
        # confirm against the code that loads them.
        self.model_name_dict = {
            '0.6B': 'facebook/nllb-200-distilled-600M',
            '1.3B': 'facebook/nllb-200-distilled-1.3B',
            '3.3B': 'facebook/nllb-200-3.3B',
        }

        # Two-letter code -> value looked up in `flores_codes`.
        # Only these four languages are mapped.
        self.whisper_codes_to_flores_codes = {
            "de": self.flores_codes['German'],
            "en": self.flores_codes['English'],
            "pl": self.flores_codes['Polish'],
            "hi": self.flores_codes['Hindi'],
        }

        # Despite the attribute name, the keys here are language display names
        # (with the "(... script)" qualifier dropped, e.g. 'Banjar', 'Chinese'),
        # and the values are three-letter codes.
        self.flores_codes_to_tts_codes = {
            'Acehnese': 'ace', 'Mesopotamian Arabic': 'acm',
            'Ta’izzi-Adeni Arabic': 'acq', 'Tunisian Arabic': 'aeb',
            'Afrikaans': 'afr', 'South Levantine Arabic': 'ajp',
            'Akan': 'aka', 'Amharic': 'amh',
            'North Levantine Arabic': 'apc', 'Modern Standard Arabic': 'arb',
            'Najdi Arabic': 'ars', 'Moroccan Arabic': 'ary',
            'Egyptian Arabic': 'arz', 'Assamese': 'asm',
            'Asturian': 'ast', 'Awadhi': 'awa',
            'Central Aymara': 'ayr', 'South Azerbaijani': 'azb',
            'North Azerbaijani': 'azj', 'Bashkir': 'bak',
            'Bambara': 'bam', 'Balinese': 'ban',
            'Belarusian': 'bel', 'Bemba': 'bem',
            'Bengali': 'ben', 'Bhojpuri': 'bho',
            'Banjar': 'bjn', 'Standard Tibetan': 'bod',
            'Bosnian': 'bos', 'Buginese': 'bug',
            'Bulgarian': 'bul', 'Catalan': 'cat',
            'Cebuano': 'ceb', 'Czech': 'ces',
            'Chokwe': 'cjk', 'Central Kurdish': 'ckb',
            'Crimean Tatar': 'crh', 'Welsh': 'cym',
            'Danish': 'dan', 'German': 'deu',
            'Southwestern Dinka': 'dik', 'Dyula': 'dyu',
            'Dzongkha': 'dzo', 'Greek': 'ell',
            'English': 'eng', 'Esperanto': 'epo',
            'Estonian': 'est', 'Basque': 'eus',
            'Ewe': 'ewe', 'Faroese': 'fao',
            'Fijian': 'fij', 'Finnish': 'fin',
            'Fon': 'fon', 'French': 'fra',
            'Friulian': 'fur', 'Nigerian Fulfulde': 'fuv',
            'Scottish Gaelic': 'gla', 'Irish': 'gle',
            'Galician': 'glg', 'Guarani': 'grn',
            'Gujarati': 'guj', 'Haitian Creole': 'hat',
            'Hausa': 'hau', 'Hebrew': 'heb',
            'Hindi': 'hin', 'Chhattisgarhi': 'hne',
            'Croatian': 'hrv', 'Hungarian': 'hun',
            'Armenian': 'hye', 'Igbo': 'ibo',
            'Ilocano': 'ilo', 'Indonesian': 'ind',
            'Icelandic': 'isl', 'Italian': 'ita',
            'Javanese': 'jav', 'Japanese': 'jpn',
            'Kabyle': 'kab', 'Jingpho': 'kac',
            'Kamba': 'kam', 'Kannada': 'kan',
            'Kashmiri': 'kas', 'Georgian': 'kat',
            'Central Kanuri': 'knc', 'Kazakh': 'kaz',
            'Kabiyè': 'kbp', 'Kabuverdianu': 'kea',
            'Khmer': 'khm', 'Kikuyu': 'kik',
            'Kinyarwanda': 'kin', 'Kyrgyz': 'kir',
            'Kimbundu': 'kmb', 'Northern Kurdish': 'kmr',
            'Kikongo': 'kon', 'Korean': 'kor',
            'Lao': 'lao', 'Ligurian': 'lij',
            'Limburgish': 'lim', 'Lingala': 'lin',
            'Lithuanian': 'lit', 'Lombard': 'lmo',
            'Latgalian': 'ltg', 'Luxembourgish': 'ltz',
            'Luba-Kasai': 'lua', 'Ganda': 'lug',
            'Luo': 'luo', 'Mizo': 'lus',
            'Standard Latvian': 'lvs', 'Magahi': 'mag',
            'Maithili': 'mai', 'Malayalam': 'mal',
            'Marathi': 'mar', 'Minangkabau': 'min',
            'Macedonian': 'mkd', 'Plateau Malagasy': 'plt',
            'Maltese': 'mlt', 'Meitei': 'mni',
            'Halh Mongolian': 'khk', 'Mossi': 'mos',
            'Maori': 'mri', 'Burmese': 'mya',
            'Dutch': 'nld', 'Norwegian Nynorsk': 'nno',
            'Norwegian Bokmål': 'nob', 'Nepali': 'npi',
            'Northern Sotho': 'nso', 'Nuer': 'nus',
            'Nyanja': 'nya', 'Occitan': 'oci',
            'West Central Oromo': 'gaz', 'Odia': 'ory',
            'Pangasinan': 'pag', 'Eastern Panjabi': 'pan',
            'Papiamento': 'pap', 'Western Persian': 'pes',
            'Polish': 'pol', 'Portuguese': 'por',
            'Dari': 'prs', 'Southern Pashto': 'pbt',
            'Ayacucho Quechua': 'quy', 'Romanian': 'ron',
            'Rundi': 'run', 'Russian': 'rus',
            'Sango': 'sag', 'Sanskrit': 'san',
            'Santali': 'sat', 'Sicilian': 'scn',
            'Shan': 'shn', 'Sinhala': 'sin',
            'Slovak': 'slk', 'Slovenian': 'slv',
            'Samoan': 'smo', 'Shona': 'sna',
            'Sindhi': 'snd', 'Somali': 'som',
            'Southern Sotho': 'sot', 'Spanish': 'spa',
            'Tosk Albanian': 'als', 'Sardinian': 'srd',
            'Serbian': 'srp', 'Swati': 'ssw',
            'Sundanese': 'sun', 'Swedish': 'swe',
            'Swahili': 'swh', 'Silesian': 'szl',
            'Tamil': 'tam', 'Tatar': 'tat',
            'Telugu': 'tel', 'Tajik': 'tgk',
            'Tagalog': 'tgl', 'Thai': 'tha',
            'Tigrinya': 'tir', 'Tamasheq': 'taq',
            'Tok Pisin': 'tpi', 'Tswana': 'tsn',
            'Tsonga': 'tso', 'Turkmen': 'tuk',
            'Tumbuka': 'tum', 'Turkish': 'tur',
            'Twi': 'twi', 'Central Atlas Tamazight': 'tzm',
            'Uyghur': 'uig', 'Ukrainian': 'ukr',
            'Umbundu': 'umb', 'Urdu': 'urd',
            'Northern Uzbek': 'uzn', 'Venetian': 'vec',
            'Vietnamese': 'vie', 'Waray': 'war',
            'Wolof': 'wol', 'Xhosa': 'xho',
            'Eastern Yiddish': 'ydd', 'Yoruba': 'yor',
            'Yue Chinese': 'yue', 'Chinese': 'zho',
            'Standard Malay': 'zsm', 'Zulu': 'zul',
        }

        # Folder names and remote location.
        # NOTE(review): presumably used for downloaded per-language TTS model
        # files and the uroman helper; only `language_directory` is created here.
        self.language_directory = 'Languages'
        self.uroman_directory = 'aux_files'
        self.language_download_web = 'https://dl.fbaipublicfiles.com/mms/tts'

        # File names expected for each language (usage is outside this class).
        self.language_vocab_text = "vocab.txt"
        self.language_vocab_configuration = "config.json"
        self.language_vocab_model = "G_100000.pth"

        # Temporary audio files, all located inside `temp_audio_folder`.
        self.temp_audio_folder = 'Temp_Audios'
        self.text2speech_wavfile = f'{self.temp_audio_folder}/text2speech.wav'
        self.enhanced_speech_file = f"{self.temp_audio_folder}/enhanced.mp3"
        self.input_speech_file = f'{self.temp_audio_folder}/output.wav'

        # Make sure the working folders exist before anything writes into them.
        for directory in (self.language_directory, self.temp_audio_folder):
            self._ensure_directory(directory)

    @staticmethod
    def _ensure_directory(path):
        """Create ``path`` (and parents) if missing; warn instead of raising.

        An already-existing directory is not an error. Any other ``OSError``
        (e.g. permission denied, or a regular file occupying ``path``) is
        reported as a ``RuntimeWarning`` so that construction stays
        best-effort without hiding the problem or swallowing
        ``KeyboardInterrupt``/``SystemExit`` the way a bare ``except`` would.
        """
        import warnings  # local import: keeps the file's import block untouched

        try:
            os.makedirs(path, exist_ok=True)
        except OSError as err:
            warnings.warn(
                f"could not create directory {path!r}: {err}",
                RuntimeWarning,
                stacklevel=3,
            )