pedrogengo committed on
Commit
dd0a0a2
1 Parent(s): ddfd18f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +277 -0
app.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import numpy as np
3
+ from transformers import AutoTokenizer
4
+ import gradio as gr
5
+
6
+ lang_codes = """Acehnese (Arabic script) | ace_Arab
7
+ Acehnese (Latin script) | ace_Latn
8
+ Mesopotamian Arabic | acm_Arab
9
+ Ta’izzi-Adeni Arabic | acq_Arab
10
+ Tunisian Arabic | aeb_Arab
11
+ Afrikaans | afr_Latn
12
+ South Levantine Arabic | ajp_Arab
13
+ Akan | aka_Latn
14
+ Amharic | amh_Ethi
15
+ North Levantine Arabic | apc_Arab
16
+ Modern Standard Arabic | arb_Arab
17
+ Modern Standard Arabic (Romanized) | arb_Latn
18
+ Najdi Arabic | ars_Arab
19
+ Moroccan Arabic | ary_Arab
20
+ Egyptian Arabic | arz_Arab
21
+ Assamese | asm_Beng
22
+ Asturian | ast_Latn
23
+ Awadhi | awa_Deva
24
+ Central Aymara | ayr_Latn
25
+ South Azerbaijani | azb_Arab
26
+ North Azerbaijani | azj_Latn
27
+ Bashkir | bak_Cyrl
28
+ Bambara | bam_Latn
29
+ Balinese | ban_Latn
30
+ Belarusian | bel_Cyrl
31
+ Bemba | bem_Latn
32
+ Bengali | ben_Beng
33
+ Bhojpuri | bho_Deva
34
+ Banjar (Arabic script) | bjn_Arab
35
+ Banjar (Latin script) | bjn_Latn
36
+ Standard Tibetan | bod_Tibt
37
+ Bosnian | bos_Latn
38
+ Buginese | bug_Latn
39
+ Bulgarian | bul_Cyrl
40
+ Catalan | cat_Latn
41
+ Cebuano | ceb_Latn
42
+ Czech | ces_Latn
43
+ Chokwe | cjk_Latn
44
+ Central Kurdish | ckb_Arab
45
+ Crimean Tatar | crh_Latn
46
+ Welsh | cym_Latn
47
+ Danish | dan_Latn
48
+ German | deu_Latn
49
+ Southwestern Dinka | dik_Latn
50
+ Dyula | dyu_Latn
51
+ Dzongkha | dzo_Tibt
52
+ Greek | ell_Grek
53
+ English | eng_Latn
54
+ Esperanto | epo_Latn
55
+ Estonian | est_Latn
56
+ Basque | eus_Latn
57
+ Ewe | ewe_Latn
58
+ Faroese | fao_Latn
59
+ Fijian | fij_Latn
60
+ Finnish | fin_Latn
61
+ Fon | fon_Latn
62
+ French | fra_Latn
63
+ Friulian | fur_Latn
64
+ Nigerian Fulfulde | fuv_Latn
65
+ Scottish Gaelic | gla_Latn
66
+ Irish | gle_Latn
67
+ Galician | glg_Latn
68
+ Guarani | grn_Latn
69
+ Gujarati | guj_Gujr
70
+ Haitian Creole | hat_Latn
71
+ Hausa | hau_Latn
72
+ Hebrew | heb_Hebr
73
+ Hindi | hin_Deva
74
+ Chhattisgarhi | hne_Deva
75
+ Croatian | hrv_Latn
76
+ Hungarian | hun_Latn
77
+ Armenian | hye_Armn
78
+ Igbo | ibo_Latn
79
+ Ilocano | ilo_Latn
80
+ Indonesian | ind_Latn
81
+ Icelandic | isl_Latn
82
+ Italian | ita_Latn
83
+ Javanese | jav_Latn
84
+ Japanese | jpn_Jpan
85
+ Kabyle | kab_Latn
86
+ Jingpho | kac_Latn
87
+ Kamba | kam_Latn
88
+ Kannada | kan_Knda
89
+ Kashmiri (Arabic script) | kas_Arab
90
+ Kashmiri (Devanagari script) | kas_Deva
91
+ Georgian | kat_Geor
92
+ Central Kanuri (Arabic script) | knc_Arab
93
+ Central Kanuri (Latin script) | knc_Latn
94
+ Kazakh | kaz_Cyrl
95
+ Kabiyè | kbp_Latn
96
+ Kabuverdianu | kea_Latn
97
+ Khmer | khm_Khmr
98
+ Kikuyu | kik_Latn
99
+ Kinyarwanda | kin_Latn
100
+ Kyrgyz | kir_Cyrl
101
+ Kimbundu | kmb_Latn
102
+ Northern Kurdish | kmr_Latn
103
+ Kikongo | kon_Latn
104
+ Korean | kor_Hang
105
+ Lao | lao_Laoo
106
+ Ligurian | lij_Latn
107
+ Limburgish | lim_Latn
108
+ Lingala | lin_Latn
109
+ Lithuanian | lit_Latn
110
+ Lombard | lmo_Latn
111
+ Latgalian | ltg_Latn
112
+ Luxembourgish | ltz_Latn
113
+ Luba-Kasai | lua_Latn
114
+ Ganda | lug_Latn
115
+ Luo | luo_Latn
116
+ Mizo | lus_Latn
117
+ Standard Latvian | lvs_Latn
118
+ Magahi | mag_Deva
119
+ Maithili | mai_Deva
120
+ Malayalam | mal_Mlym
121
+ Marathi | mar_Deva
122
+ Minangkabau (Arabic script) | min_Arab
123
+ Minangkabau (Latin script) | min_Latn
124
+ Macedonian | mkd_Cyrl
125
+ Plateau Malagasy | plt_Latn
126
+ Maltese | mlt_Latn
127
+ Meitei (Bengali script) | mni_Beng
128
+ Halh Mongolian | khk_Cyrl
129
+ Mossi | mos_Latn
130
+ Maori | mri_Latn
131
+ Burmese | mya_Mymr
132
+ Dutch | nld_Latn
133
+ Norwegian Nynorsk | nno_Latn
134
+ Norwegian Bokmål | nob_Latn
135
+ Nepali | npi_Deva
136
+ Northern Sotho | nso_Latn
137
+ Nuer | nus_Latn
138
+ Nyanja | nya_Latn
139
+ Occitan | oci_Latn
140
+ West Central Oromo | gaz_Latn
141
+ Odia | ory_Orya
142
+ Pangasinan | pag_Latn
143
+ Eastern Panjabi | pan_Guru
144
+ Papiamento | pap_Latn
145
+ Western Persian | pes_Arab
146
+ Polish | pol_Latn
147
+ Portuguese | por_Latn
148
+ Dari | prs_Arab
149
+ Southern Pashto | pbt_Arab
150
+ Ayacucho Quechua | quy_Latn
151
+ Romanian | ron_Latn
152
+ Rundi | run_Latn
153
+ Russian | rus_Cyrl
154
+ Sango | sag_Latn
155
+ Sanskrit | san_Deva
156
+ Santali | sat_Olck
157
+ Sicilian | scn_Latn
158
+ Shan | shn_Mymr
159
+ Sinhala | sin_Sinh
160
+ Slovak | slk_Latn
161
+ Slovenian | slv_Latn
162
+ Samoan | smo_Latn
163
+ Shona | sna_Latn
164
+ Sindhi | snd_Arab
165
+ Somali | som_Latn
166
+ Southern Sotho | sot_Latn
167
+ Spanish | spa_Latn
168
+ Tosk Albanian | als_Latn
169
+ Sardinian | srd_Latn
170
+ Serbian | srp_Cyrl
171
+ Swati | ssw_Latn
172
+ Sundanese | sun_Latn
173
+ Swedish | swe_Latn
174
+ Swahili | swh_Latn
175
+ Silesian | szl_Latn
176
+ Tamil | tam_Taml
177
+ Tatar | tat_Cyrl
178
+ Telugu | tel_Telu
179
+ Tajik | tgk_Cyrl
180
+ Tagalog | tgl_Latn
181
+ Thai | tha_Thai
182
+ Tigrinya | tir_Ethi
183
+ Tamasheq (Latin script) | taq_Latn
184
+ Tamasheq (Tifinagh script) | taq_Tfng
185
+ Tok Pisin | tpi_Latn
186
+ Tswana | tsn_Latn
187
+ Tsonga | tso_Latn
188
+ Turkmen | tuk_Latn
189
+ Tumbuka | tum_Latn
190
+ Turkish | tur_Latn
191
+ Twi | twi_Latn
192
+ Central Atlas Tamazight | tzm_Tfng
193
+ Uyghur | uig_Arab
194
+ Ukrainian | ukr_Cyrl
195
+ Umbundu | umb_Latn
196
+ Urdu | urd_Arab
197
+ Northern Uzbek | uzn_Latn
198
+ Venetian | vec_Latn
199
+ Vietnamese | vie_Latn
200
+ Waray | war_Latn
201
+ Wolof | wol_Latn
202
+ Xhosa | xho_Latn
203
+ Eastern Yiddish | ydd_Hebr
204
+ Yoruba | yor_Latn
205
+ Yue Chinese | yue_Hant
206
+ Chinese (Simplified) | zho_Hans
207
+ Chinese (Traditional) | zho_Hant
208
+ Standard Malay | zsm_Latn
209
+ Zulu | zul_Latn"""
210
+
211
# Turn the "Display name | code" table into a {display name: FLORES code} dict.
lang_codes = {
    fields[0]: fields[1]
    for fields in (row.split(" | ") for row in lang_codes.split("\n"))
}
212
+
213
# Load the FLORES "dev" split once at start-up. Each record is indexed below
# with one "sentence_<code>" key per language code in lang_codes.
dataset = load_dataset("facebook/flores", "all", trust_remote_code=True)["dev"]

# Display name -> list of that language's sentences, in dataset order.
# The lists are keyed by display name because that is what the UI dropdowns
# hand to get_results. (Reading with the code but writing with the display
# name, as before, kept only the last sentence per language.)
data_per_lang = {full: [] for full in lang_codes}
for d in dataset:
    for full, code in lang_codes.items():
        data_per_lang[full].append(d[f"sentence_{code}"])
219
+
220
+
221
def get_results(tokenizer, base_lang, comp_lang):
    """Report how many more (or less) tokens one language needs than another.

    Every parallel sentence pair of the two languages is tokenized and the
    total token counts are compared.

    Args:
        tokenizer: Either a tokenizer name/path (the string typed into the
            UI textbox), which is loaded with ``AutoTokenizer``, or an
            already-loaded tokenizer: any callable that maps a string to a
            mapping containing ``"input_ids"``.
        base_lang: Display name of the reference language; must be a key of
            ``data_per_lang``.
        comp_lang: Display name of the language compared against the
            reference; must be a key of ``data_per_lang``.

    Returns:
        A sentence stating the relative difference in token count as a
        percentage, e.g. ``"You need 25.00% more tokens ..."``.

    Raises:
        KeyError: If either language name is not in ``data_per_lang``.
    """
    # The Gradio textbox delivers a model name, not a tokenizer object.
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    base_data = data_per_lang[base_lang]
    comp_data = data_per_lang[comp_lang]

    base_total = 0
    comp_total = 0
    for base_d, comp_d in zip(base_data, comp_data):
        # "input_ids" is available for both fast and slow tokenizers, unlike
        # integer indexing of the encoding.
        base_total += len(tokenizer(base_d)["input_ids"])
        comp_total += len(tokenizer(comp_d)["input_ids"])

    if base_total == 0:
        # No sentences (or only empty encodings): a ratio is undefined.
        return f"No tokens were produced for {base_lang}, so no comparison can be made."

    # zip() gives both languages the same number of sentences, so the ratio
    # of the totals equals the ratio of the per-sentence means.
    token_ratio = comp_total / base_total

    if token_ratio < 1.0:
        adverb = "less"
        token_ratio = (1.0 - token_ratio) * 100
    else:
        adverb = "more"
        token_ratio = (token_ratio - 1.0) * 100

    return (
        f"You need {token_ratio:.2f}% {adverb} tokens to represent your text "
        f"in {comp_lang} than in {base_lang}."
    )
248
+
249
+
250
# Gradio UI: a tokenizer-name textbox, two language dropdowns, a submit
# button, and a Markdown area that shows the sentence from get_results.
with gr.Blocks() as demo:
    with gr.Column():
        with gr.Row():
            tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")

        with gr.Row():
            # Distinct labels so the user can tell which dropdown is the
            # reference and which is the language compared against it.
            with gr.Column():
                base_lang = gr.Dropdown(
                    list(lang_codes.keys()), label="Base language"
                )
            with gr.Column():
                comp_lang = gr.Dropdown(
                    list(lang_codes.keys()), label="Comparison language"
                )

        with gr.Row():
            btn = gr.Button("Submit")

        out_text = gr.Markdown()

    # The textbox supplies the tokenizer name as a string; the dropdowns
    # supply display names, i.e. keys of lang_codes.
    btn.click(
        get_results,
        inputs=[tokenizer, base_lang, comp_lang],
        outputs=[out_text],
        api_name=False,
    )

demo.launch()