511KeV committed on
Commit
b2c8a1e
0 Parent(s):

Duplicate from 511KeV/nllb-translation-demo

Browse files
Files changed (6) hide show
  1. .gitattributes +27 -0
  2. README.md +13 -0
  3. app.py +88 -0
  4. flores200_codes.py +210 -0
  5. imp.js +2 -0
  6. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Nllb Translation Demo
3
+ emoji: 👀
4
+ colorFrom: indigo
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.0.26
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: 511KeV/nllb-translation-demo
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ import time
5
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
6
+ from flores200_codes import flores_codes
7
+
8
+ #print(f"Is CUDA available: {torch.cuda.is_available()}")
9
+ # True
10
+ #print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
11
+
12
def load_models():
    """Load every configured NLLB checkpoint together with its tokenizer.

    Returns:
        A flat dict holding two entries per checkpoint alias:
        ``'<alias>_model'`` (the seq2seq model) and ``'<alias>_tokenizer'``.
    """
    # Alias used inside the app -> Hugging Face Hub repository id.
    # Larger checkpoints that are currently disabled:
    #   'nllb-1.3B': 'facebook/nllb-200-1.3B'
    #   'nllb-distilled-1.3B': 'facebook/nllb-200-distilled-1.3B'
    #   'nllb-3.3B': 'facebook/nllb-200-3.3B'
    checkpoints = {
        'nllb-distilled-600M': 'facebook/nllb-200-distilled-600M',
    }

    loaded = {}
    for alias in checkpoints:
        hub_id = checkpoints[alias]
        print('\tLoading model: %s' % alias)
        # Model first, then tokenizer, so the dict keeps the same key order.
        loaded[alias + '_model'] = AutoModelForSeq2SeqLM.from_pretrained(hub_id)
        loaded[alias + '_tokenizer'] = AutoTokenizer.from_pretrained(hub_id)

    return loaded
30
+
31
+
32
def translation(source, target, text):
    """Translate ``text`` from ``source`` to ``target`` with the loaded NLLB model.

    Args:
        source: Human-readable source language name; must be a key of
            ``flores_codes`` (e.g. ``'English'``).
        target: Human-readable target language name; must be a key of
            ``flores_codes``.
        text: The text to translate.

    Returns:
        A dict with the keys ``'inference_time'`` (elapsed seconds as a float),
        ``'source'`` and ``'target'`` (the FLORES-200 codes that were used) and
        ``'result'`` (the translated text).

    Raises:
        KeyError: If ``source`` or ``target`` is not a known language name, or
            if the model/tokenizer has not been loaded into ``model_dict``.
    """
    # The model name used to be bound only when ``len(model_dict) == 2``, so
    # any other dict size crashed with UnboundLocalError on the lookup below.
    # Only this checkpoint is served, so bind it unconditionally.
    model_name = 'nllb-distilled-600M'

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    source = flores_codes[source]
    target = flores_codes[target]

    model = model_dict[model_name + '_model']
    tokenizer = model_dict[model_name + '_tokenizer']

    # The pipeline is rebuilt per request because the language pair is
    # supplied when it is constructed.
    translator = pipeline(
        'translation',
        model=model,
        tokenizer=tokenizer,
        src_lang=source,
        tgt_lang=target,
    )
    # NOTE(review): 4098 is kept from the original; presumably 4096 was meant.
    output = translator(text, max_length=4098)

    end_time = time.perf_counter()

    return {
        'inference_time': end_time - start_time,
        'source': source,
        'target': target,
        'result': output[0]['translation_text'],
    }
54
+
55
+
56
if __name__ == '__main__':
    print('\tinit models')

    # translation() reads this module-level dict. Assigning it here, at module
    # scope, already makes it global, so no ``global`` statement is needed.
    model_dict = load_models()

    # Define the Gradio demo: two language pickers and a free-text box.
    # The top-level components replace the deprecated ``gr.inputs`` /
    # ``gr.outputs`` namespaces.
    lang_codes = list(flores_codes.keys())
    inputs = [
        gr.Dropdown(choices=lang_codes, value='English', label='Source'),
        gr.Dropdown(choices=lang_codes, value='Kashmiri', label='Target'),
        gr.Textbox(lines=5, label="Input text"),
    ]

    outputs = gr.JSON()

    title = "NLLB distilled 600M demo"

    demo_status = "Demo is running on CPU"
    description = f"Details: https://github.com/facebookresearch/fairseq/tree/nllb. {demo_status}"
    examples = [
        ['English', 'Kashmiri', 'Hi. nice to meet you']
    ]

    gr.Interface(
        translation,
        inputs,
        outputs,
        title=title,
        description=description,
        # The examples list used to be built but never handed to the interface.
        examples=examples,
        # Do not run the model at start-up just to pre-compute example outputs.
        cache_examples=False,
    ).launch()
87
+
88
+
flores200_codes.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ codes_as_string = '''Acehnese (Arabic script) ace_Arab
2
+ Acehnese (Latin script) ace_Latn
3
+ Mesopotamian Arabic acm_Arab
4
+ Ta’izzi-Adeni Arabic acq_Arab
5
+ Tunisian Arabic aeb_Arab
6
+ Afrikaans afr_Latn
7
+ South Levantine Arabic ajp_Arab
8
+ Akan aka_Latn
9
+ Amharic amh_Ethi
10
+ North Levantine Arabic apc_Arab
11
+ Modern Standard Arabic arb_Arab
12
+ Modern Standard Arabic (Romanized) arb_Latn
13
+ Najdi Arabic ars_Arab
14
+ Moroccan Arabic ary_Arab
15
+ Egyptian Arabic arz_Arab
16
+ Assamese asm_Beng
17
+ Asturian ast_Latn
18
+ Awadhi awa_Deva
19
+ Central Aymara ayr_Latn
20
+ South Azerbaijani azb_Arab
21
+ North Azerbaijani azj_Latn
22
+ Bashkir bak_Cyrl
23
+ Bambara bam_Latn
24
+ Balinese ban_Latn
25
+ Belarusian bel_Cyrl
26
+ Bemba bem_Latn
27
+ Bengali ben_Beng
28
+ Bhojpuri bho_Deva
29
+ Banjar (Arabic script) bjn_Arab
30
+ Banjar (Latin script) bjn_Latn
31
+ Standard Tibetan bod_Tibt
32
+ Bosnian bos_Latn
33
+ Buginese bug_Latn
34
+ Bulgarian bul_Cyrl
35
+ Catalan cat_Latn
36
+ Cebuano ceb_Latn
37
+ Czech ces_Latn
38
+ Chokwe cjk_Latn
39
+ Central Kurdish ckb_Arab
40
+ Crimean Tatar crh_Latn
41
+ Welsh cym_Latn
42
+ Danish dan_Latn
43
+ German deu_Latn
44
+ Southwestern Dinka dik_Latn
45
+ Dyula dyu_Latn
46
+ Dzongkha dzo_Tibt
47
+ Greek ell_Grek
48
+ English eng_Latn
49
+ Esperanto epo_Latn
50
+ Estonian est_Latn
51
+ Basque eus_Latn
52
+ Ewe ewe_Latn
53
+ Faroese fao_Latn
54
+ Fijian fij_Latn
55
+ Finnish fin_Latn
56
+ Fon fon_Latn
57
+ French fra_Latn
58
+ Friulian fur_Latn
59
+ Nigerian Fulfulde fuv_Latn
60
+ Scottish Gaelic gla_Latn
61
+ Irish gle_Latn
62
+ Galician glg_Latn
63
+ Guarani grn_Latn
64
+ Gujarati guj_Gujr
65
+ Haitian Creole hat_Latn
66
+ Hausa hau_Latn
67
+ Hebrew heb_Hebr
68
+ Hindi hin_Deva
69
+ Chhattisgarhi hne_Deva
70
+ Croatian hrv_Latn
71
+ Hungarian hun_Latn
72
+ Armenian hye_Armn
73
+ Igbo ibo_Latn
74
+ Ilocano ilo_Latn
75
+ Indonesian ind_Latn
76
+ Icelandic isl_Latn
77
+ Italian ita_Latn
78
+ Javanese jav_Latn
79
+ Japanese jpn_Jpan
80
+ Kabyle kab_Latn
81
+ Jingpho kac_Latn
82
+ Kamba kam_Latn
83
+ Kannada kan_Knda
84
+ Kashmiri kas_Arab
85
+ Georgian kat_Geor
86
+ Central Kanuri (Arabic script) knc_Arab
87
+ Central Kanuri (Latin script) knc_Latn
88
+ Kazakh kaz_Cyrl
89
+ Kabiyè kbp_Latn
90
+ Kabuverdianu kea_Latn
91
+ Khmer khm_Khmr
92
+ Kikuyu kik_Latn
93
+ Kinyarwanda kin_Latn
94
+ Kyrgyz kir_Cyrl
95
+ Kimbundu kmb_Latn
96
+ Northern Kurdish kmr_Latn
97
+ Kikongo kon_Latn
98
+ Korean kor_Hang
99
+ Lao lao_Laoo
100
+ Ligurian lij_Latn
101
+ Limburgish lim_Latn
102
+ Lingala lin_Latn
103
+ Lithuanian lit_Latn
104
+ Lombard lmo_Latn
105
+ Latgalian ltg_Latn
106
+ Luxembourgish ltz_Latn
107
+ Luba-Kasai lua_Latn
108
+ Ganda lug_Latn
109
+ Luo luo_Latn
110
+ Mizo lus_Latn
111
+ Standard Latvian lvs_Latn
112
+ Magahi mag_Deva
113
+ Maithili mai_Deva
114
+ Malayalam mal_Mlym
115
+ Marathi mar_Deva
116
+ Minangkabau (Arabic script) min_Arab
117
+ Minangkabau (Latin script) min_Latn
118
+ Macedonian mkd_Cyrl
119
+ Plateau Malagasy plt_Latn
120
+ Maltese mlt_Latn
121
+ Meitei (Bengali script) mni_Beng
122
+ Halh Mongolian khk_Cyrl
123
+ Mossi mos_Latn
124
+ Maori mri_Latn
125
+ Burmese mya_Mymr
126
+ Dutch nld_Latn
127
+ Norwegian Nynorsk nno_Latn
128
+ Norwegian Bokmål nob_Latn
129
+ Nepali npi_Deva
130
+ Northern Sotho nso_Latn
131
+ Nuer nus_Latn
132
+ Nyanja nya_Latn
133
+ Occitan oci_Latn
134
+ West Central Oromo gaz_Latn
135
+ Odia ory_Orya
136
+ Pangasinan pag_Latn
137
+ Eastern Panjabi pan_Guru
138
+ Papiamento pap_Latn
139
+ Western Persian pes_Arab
140
+ Polish pol_Latn
141
+ Portuguese por_Latn
142
+ Dari prs_Arab
143
+ Southern Pashto pbt_Arab
144
+ Ayacucho Quechua quy_Latn
145
+ Romanian ron_Latn
146
+ Rundi run_Latn
147
+ Russian rus_Cyrl
148
+ Sango sag_Latn
149
+ Sanskrit san_Deva
150
+ Santali sat_Olck
151
+ Sicilian scn_Latn
152
+ Shan shn_Mymr
153
+ Sinhala sin_Sinh
154
+ Slovak slk_Latn
155
+ Slovenian slv_Latn
156
+ Samoan smo_Latn
157
+ Shona sna_Latn
158
+ Sindhi snd_Arab
159
+ Somali som_Latn
160
+ Southern Sotho sot_Latn
161
+ Spanish spa_Latn
162
+ Tosk Albanian als_Latn
163
+ Sardinian srd_Latn
164
+ Serbian srp_Cyrl
165
+ Swati ssw_Latn
166
+ Sundanese sun_Latn
167
+ Swedish swe_Latn
168
+ Swahili swh_Latn
169
+ Silesian szl_Latn
170
+ Tamil tam_Taml
171
+ Tatar tat_Cyrl
172
+ Telugu tel_Telu
173
+ Tajik tgk_Cyrl
174
+ Tagalog tgl_Latn
175
+ Thai tha_Thai
176
+ Tigrinya tir_Ethi
177
+ Tamasheq (Latin script) taq_Latn
178
+ Tamasheq (Tifinagh script) taq_Tfng
179
+ Tok Pisin tpi_Latn
180
+ Tswana tsn_Latn
181
+ Tsonga tso_Latn
182
+ Turkmen tuk_Latn
183
+ Tumbuka tum_Latn
184
+ Turkish tur_Latn
185
+ Twi twi_Latn
186
+ Central Atlas Tamazight tzm_Tfng
187
+ Uyghur uig_Arab
188
+ Ukrainian ukr_Cyrl
189
+ Umbundu umb_Latn
190
+ Urdu urd_Arab
191
+ Northern Uzbek uzn_Latn
192
+ Venetian vec_Latn
193
+ Vietnamese vie_Latn
194
+ Waray war_Latn
195
+ Wolof wol_Latn
196
+ Xhosa xho_Latn
197
+ Eastern Yiddish ydd_Hebr
198
+ Yoruba yor_Latn
199
+ Yue Chinese yue_Hant
200
+ Chinese (Simplified) zho_Hans
201
+ Chinese (Traditional) zho_Hant
202
+ Standard Malay zsm_Latn
203
+ Zulu zul_Latn'''
204
+
205
# Build the lookup table "language name -> FLORES-200 code" from the raw
# listing above, which holds one "<language name><whitespace><code>" entry
# per line.
# NOTE: ``codes_as_string`` is rebound from str to a list of lines, exactly as
# before, so the module keeps exposing the same names.
codes_as_string = codes_as_string.split('\n')

flores_codes = {}
for code in codes_as_string:
    code = code.strip()
    if not code:
        # Tolerate blank lines (e.g. a trailing newline in the listing).
        continue
    # The code itself never contains whitespace while the language name may,
    # so split once from the right. This accepts a tab as well as spaces.
    fields = code.rsplit(None, 1)
    if len(fields) != 2:
        raise ValueError('Malformed FLORES-200 entry: %r' % code)
    lang, lang_code = fields
    flores_codes[lang] = lang_code
imp.js ADDED
@@ -0,0 +1,2 @@
 
 
1
+ import nllb-200-distilled-600M
2
+ import nllb-200-3.3B
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ git+https://github.com/huggingface/transformers
2
+ gradio
3
+ torch