ilhamsyahids committed on
Commit
afb6c6a
1 Parent(s): 2cc827f

init gradio application

Browse files

Signed-off-by: Ilham Syahid S <ilhamsyahids@gmail.com>

Files changed (4) hide show
  1. .vscode/settings.json +6 -0
  2. app.py +95 -0
  3. flores200_codes.py +213 -0
  4. requirements.txt +3 -0
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "[python]": {
3
+ "editor.defaultFormatter": "ms-python.black-formatter"
4
+ },
5
+ "python.formatting.provider": "none"
6
+ }
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import time
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
4
+ from flores200_codes import flores_codes
5
+
6
+
7
def load_models():
    """Load every supported NLLB checkpoint together with its tokenizer.

    Returns:
        dict: for each short model name ``<name>`` the key ``<name>_model``
        holds the seq2seq model and ``<name>_tokenizer`` holds the matching
        tokenizer.
    """
    # Short name (used as the key prefix) -> identifier given to from_pretrained.
    checkpoints = {
        "nllb-distilled-600M": "facebook/nllb-200-distilled-600M",
        "nllb-distilled-1.3B": "facebook/nllb-200-distilled-1.3B",
        "nllb-1.3B": "facebook/nllb-200-1.3B",
        "nllb-3.3B": "facebook/nllb-200-3.3B",
    }

    loaded = {}
    for short_name in checkpoints:
        hub_id = checkpoints[short_name]
        print("\tLoading model: %s" % short_name)
        # Dict displays evaluate in order, so the model is loaded before
        # the tokenizer, exactly one pair per checkpoint.
        loaded.update(
            {
                short_name + "_model": AutoModelForSeq2SeqLM.from_pretrained(hub_id),
                short_name + "_tokenizer": AutoTokenizer.from_pretrained(hub_id),
            }
        )

    return loaded
26
+
27
+
28
def translation(model_name, source, target, text):
    """Translate ``text`` with the selected NLLB model and report timing.

    Args:
        model_name: Short model name; the module-level ``model_dict`` must
            contain ``<model_name>_model`` and ``<model_name>_tokenizer``.
        source: Human-readable source language name (a key of ``flores_codes``).
        target: Human-readable target language name (a key of ``flores_codes``).
        text: The text to translate.

    Returns:
        dict with the keys ``inference_time`` (elapsed seconds as a float,
        including pipeline construction), ``source`` and ``target`` (the
        looked-up language codes), ``result`` (the ``translation_text`` of the
        first pipeline output) and ``full_output`` (the raw pipeline output).

    Raises:
        KeyError: If a language name is not in ``flores_codes`` or the model
            name has no entries in ``model_dict``.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the system wall clock is adjusted mid-request.
    started = time.perf_counter()

    source = flores_codes[source]
    target = flores_codes[target]

    model = model_dict[model_name + "_model"]
    tokenizer = model_dict[model_name + "_tokenizer"]

    translator = pipeline(
        "translation",
        model=model,
        tokenizer=tokenizer,
        src_lang=source,
        tgt_lang=target,
    )
    full_output = translator(text, max_length=400)

    elapsed = time.perf_counter() - started

    return {
        "inference_time": elapsed,
        "source": source,
        "target": target,
        "result": full_output[0]["translation_text"],
        "full_output": full_output,
    }
57
+
58
+
59
if __name__ == "__main__":
    print("\tinit models")

    # A plain module-level assignment already creates the global that
    # translation() reads; a ``global`` statement at module scope is a no-op.
    model_dict = load_models()

    # Define the Gradio demo.
    # The gr.inputs / gr.outputs namespaces and the ``default=`` keyword were
    # deprecated in Gradio 3 and removed in Gradio 4; the top-level components
    # with ``value=`` work on both.
    lang_codes = list(flores_codes.keys())
    inputs = [
        gr.Radio(
            ["nllb-distilled-600M", "nllb-distilled-1.3B", "nllb-1.3B", "nllb-3.3B"],
            label="NLLB Model",
            value="nllb-distilled-600M",
        ),
        gr.Dropdown(lang_codes, value="Najdi Arabic", label="Source"),
        gr.Dropdown(lang_codes, value="English", label="Target"),
        gr.Textbox(lines=5, label="Input text"),
    ]

    # translation() returns a dict, which is rendered as JSON.
    outputs = gr.JSON()

    title = "NLLB (No Language Left Behind) demo"

    demo_status = "Demo is running on CPU"
    description = f"Using NLLB model, details: https://github.com/facebookresearch/fairseq/tree/nllb. {demo_status}"
    # One example row, in the same order as ``inputs``.
    examples = [["nllb-3.3B", "Najdi Arabic", "English", "جلست اطفال"]]

    gr.Interface(
        translation,
        inputs,
        outputs,
        title=title,
        description=description,
        examples=examples,
        examples_per_page=50,
    ).launch()
flores200_codes.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Description: Contains a dictionary of the languages in the Flores dataset and their corresponding codes.
2
+
3
+ codes_as_string = """Acehnese (Arabic script) ace_Arab
4
+ Acehnese (Latin script) ace_Latn
5
+ Mesopotamian Arabic acm_Arab
6
+ Ta’izzi-Adeni Arabic acq_Arab
7
+ Tunisian Arabic aeb_Arab
8
+ Afrikaans afr_Latn
9
+ South Levantine Arabic ajp_Arab
10
+ Akan aka_Latn
11
+ Amharic amh_Ethi
12
+ North Levantine Arabic apc_Arab
13
+ Modern Standard Arabic arb_Arab
14
+ Modern Standard Arabic (Romanized) arb_Latn
15
+ Najdi Arabic ars_Arab
16
+ Moroccan Arabic ary_Arab
17
+ Egyptian Arabic arz_Arab
18
+ Assamese asm_Beng
19
+ Asturian ast_Latn
20
+ Awadhi awa_Deva
21
+ Central Aymara ayr_Latn
22
+ South Azerbaijani azb_Arab
23
+ North Azerbaijani azj_Latn
24
+ Bashkir bak_Cyrl
25
+ Bambara bam_Latn
26
+ Balinese ban_Latn
27
+ Belarusian bel_Cyrl
28
+ Bemba bem_Latn
29
+ Bengali ben_Beng
30
+ Bhojpuri bho_Deva
31
+ Banjar (Arabic script) bjn_Arab
32
+ Banjar (Latin script) bjn_Latn
33
+ Standard Tibetan bod_Tibt
34
+ Bosnian bos_Latn
35
+ Buginese bug_Latn
36
+ Bulgarian bul_Cyrl
37
+ Catalan cat_Latn
38
+ Cebuano ceb_Latn
39
+ Czech ces_Latn
40
+ Chokwe cjk_Latn
41
+ Central Kurdish ckb_Arab
42
+ Crimean Tatar crh_Latn
43
+ Welsh cym_Latn
44
+ Danish dan_Latn
45
+ German deu_Latn
46
+ Southwestern Dinka dik_Latn
47
+ Dyula dyu_Latn
48
+ Dzongkha dzo_Tibt
49
+ Greek ell_Grek
50
+ English eng_Latn
51
+ Esperanto epo_Latn
52
+ Estonian est_Latn
53
+ Basque eus_Latn
54
+ Ewe ewe_Latn
55
+ Faroese fao_Latn
56
+ Fijian fij_Latn
57
+ Finnish fin_Latn
58
+ Fon fon_Latn
59
+ French fra_Latn
60
+ Friulian fur_Latn
61
+ Nigerian Fulfulde fuv_Latn
62
+ Scottish Gaelic gla_Latn
63
+ Irish gle_Latn
64
+ Galician glg_Latn
65
+ Guarani grn_Latn
66
+ Gujarati guj_Gujr
67
+ Haitian Creole hat_Latn
68
+ Hausa hau_Latn
69
+ Hebrew heb_Hebr
70
+ Hindi hin_Deva
71
+ Chhattisgarhi hne_Deva
72
+ Croatian hrv_Latn
73
+ Hungarian hun_Latn
74
+ Armenian hye_Armn
75
+ Igbo ibo_Latn
76
+ Ilocano ilo_Latn
77
+ Indonesian ind_Latn
78
+ Icelandic isl_Latn
79
+ Italian ita_Latn
80
+ Javanese jav_Latn
81
+ Japanese jpn_Jpan
82
+ Kabyle kab_Latn
83
+ Jingpho kac_Latn
84
+ Kamba kam_Latn
85
+ Kannada kan_Knda
86
+ Kashmiri (Arabic script) kas_Arab
87
+ Kashmiri (Devanagari script) kas_Deva
88
+ Georgian kat_Geor
89
+ Central Kanuri (Arabic script) knc_Arab
90
+ Central Kanuri (Latin script) knc_Latn
91
+ Kazakh kaz_Cyrl
92
+ Kabiyè kbp_Latn
93
+ Kabuverdianu kea_Latn
94
+ Khmer khm_Khmr
95
+ Kikuyu kik_Latn
96
+ Kinyarwanda kin_Latn
97
+ Kyrgyz kir_Cyrl
98
+ Kimbundu kmb_Latn
99
+ Northern Kurdish kmr_Latn
100
+ Kikongo kon_Latn
101
+ Korean kor_Hang
102
+ Lao lao_Laoo
103
+ Ligurian lij_Latn
104
+ Limburgish lim_Latn
105
+ Lingala lin_Latn
106
+ Lithuanian lit_Latn
107
+ Lombard lmo_Latn
108
+ Latgalian ltg_Latn
109
+ Luxembourgish ltz_Latn
110
+ Luba-Kasai lua_Latn
111
+ Ganda lug_Latn
112
+ Luo luo_Latn
113
+ Mizo lus_Latn
114
+ Standard Latvian lvs_Latn
115
+ Magahi mag_Deva
116
+ Maithili mai_Deva
117
+ Malayalam mal_Mlym
118
+ Marathi mar_Deva
119
+ Minangkabau (Arabic script) min_Arab
120
+ Minangkabau (Latin script) min_Latn
121
+ Macedonian mkd_Cyrl
122
+ Plateau Malagasy plt_Latn
123
+ Maltese mlt_Latn
124
+ Meitei (Bengali script) mni_Beng
125
+ Halh Mongolian khk_Cyrl
126
+ Mossi mos_Latn
127
+ Maori mri_Latn
128
+ Burmese mya_Mymr
129
+ Dutch nld_Latn
130
+ Norwegian Nynorsk nno_Latn
131
+ Norwegian Bokmål nob_Latn
132
+ Nepali npi_Deva
133
+ Northern Sotho nso_Latn
134
+ Nuer nus_Latn
135
+ Nyanja nya_Latn
136
+ Occitan oci_Latn
137
+ West Central Oromo gaz_Latn
138
+ Odia ory_Orya
139
+ Pangasinan pag_Latn
140
+ Eastern Panjabi pan_Guru
141
+ Papiamento pap_Latn
142
+ Western Persian pes_Arab
143
+ Polish pol_Latn
144
+ Portuguese por_Latn
145
+ Dari prs_Arab
146
+ Southern Pashto pbt_Arab
147
+ Ayacucho Quechua quy_Latn
148
+ Romanian ron_Latn
149
+ Rundi run_Latn
150
+ Russian rus_Cyrl
151
+ Sango sag_Latn
152
+ Sanskrit san_Deva
153
+ Santali sat_Olck
154
+ Sicilian scn_Latn
155
+ Shan shn_Mymr
156
+ Sinhala sin_Sinh
157
+ Slovak slk_Latn
158
+ Slovenian slv_Latn
159
+ Samoan smo_Latn
160
+ Shona sna_Latn
161
+ Sindhi snd_Arab
162
+ Somali som_Latn
163
+ Southern Sotho sot_Latn
164
+ Spanish spa_Latn
165
+ Tosk Albanian als_Latn
166
+ Sardinian srd_Latn
167
+ Serbian srp_Cyrl
168
+ Swati ssw_Latn
169
+ Sundanese sun_Latn
170
+ Swedish swe_Latn
171
+ Swahili swh_Latn
172
+ Silesian szl_Latn
173
+ Tamil tam_Taml
174
+ Tatar tat_Cyrl
175
+ Telugu tel_Telu
176
+ Tajik tgk_Cyrl
177
+ Tagalog tgl_Latn
178
+ Thai tha_Thai
179
+ Tigrinya tir_Ethi
180
+ Tamasheq (Latin script) taq_Latn
181
+ Tamasheq (Tifinagh script) taq_Tfng
182
+ Tok Pisin tpi_Latn
183
+ Tswana tsn_Latn
184
+ Tsonga tso_Latn
185
+ Turkmen tuk_Latn
186
+ Tumbuka tum_Latn
187
+ Turkish tur_Latn
188
+ Twi twi_Latn
189
+ Central Atlas Tamazight tzm_Tfng
190
+ Uyghur uig_Arab
191
+ Ukrainian ukr_Cyrl
192
+ Umbundu umb_Latn
193
+ Urdu urd_Arab
194
+ Northern Uzbek uzn_Latn
195
+ Venetian vec_Latn
196
+ Vietnamese vie_Latn
197
+ Waray war_Latn
198
+ Wolof wol_Latn
199
+ Xhosa xho_Latn
200
+ Eastern Yiddish ydd_Hebr
201
+ Yoruba yor_Latn
202
+ Yue Chinese yue_Hant
203
+ Chinese (Simplified) zho_Hans
204
+ Chinese (Traditional) zho_Hant
205
+ Standard Malay zsm_Latn
206
+ Zulu zul_Latn"""
207
+
208
# Break the raw table into one "<language name> <code>" entry per line.
# NOTE: the name is rebound from str to list here, as the module always did.
codes_as_string = codes_as_string.splitlines()

# Map each human-readable language name to its code,
# e.g. "English" -> "eng_Latn".
flores_codes = {}
for code in codes_as_string:
    if not code.strip():
        continue  # tolerate blank lines instead of failing on unpack
    # The code is always the last whitespace-free token, so splitting once
    # from the right works whether the separator is a tab or spaces and keeps
    # multi-word names such as "Acehnese (Arabic script)" intact.
    lang, lang_code = code.rsplit(maxsplit=1)
    flores_codes[lang] = lang_code
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/huggingface/transformers
2
+ gradio
3
+ torch