# Hugging Face Space: compare tokenizer efficiency across FLORES-200 languages.
from datasets import load_dataset
import gradio as gr
import numpy as np
from transformers import AutoTokenizer
# FLORES-200 language table: "Full language name | code_Script", one entry per line.
# Parsed below into a {full name: FLORES code} mapping used to pick dataset columns.
lang_codes = """Acehnese (Arabic script) | ace_Arab
Acehnese (Latin script) | ace_Latn
Mesopotamian Arabic | acm_Arab
Ta’izzi-Adeni Arabic | acq_Arab
Tunisian Arabic | aeb_Arab
Afrikaans | afr_Latn
South Levantine Arabic | ajp_Arab
Akan | aka_Latn
Amharic | amh_Ethi
North Levantine Arabic | apc_Arab
Modern Standard Arabic | arb_Arab
Modern Standard Arabic (Romanized) | arb_Latn
Najdi Arabic | ars_Arab
Moroccan Arabic | ary_Arab
Egyptian Arabic | arz_Arab
Assamese | asm_Beng
Asturian | ast_Latn
Awadhi | awa_Deva
Central Aymara | ayr_Latn
South Azerbaijani | azb_Arab
North Azerbaijani | azj_Latn
Bashkir | bak_Cyrl
Bambara | bam_Latn
Balinese | ban_Latn
Belarusian | bel_Cyrl
Bemba | bem_Latn
Bengali | ben_Beng
Bhojpuri | bho_Deva
Banjar (Arabic script) | bjn_Arab
Banjar (Latin script) | bjn_Latn
Standard Tibetan | bod_Tibt
Bosnian | bos_Latn
Buginese | bug_Latn
Bulgarian | bul_Cyrl
Catalan | cat_Latn
Cebuano | ceb_Latn
Czech | ces_Latn
Chokwe | cjk_Latn
Central Kurdish | ckb_Arab
Crimean Tatar | crh_Latn
Welsh | cym_Latn
Danish | dan_Latn
German | deu_Latn
Southwestern Dinka | dik_Latn
Dyula | dyu_Latn
Dzongkha | dzo_Tibt
Greek | ell_Grek
English | eng_Latn
Esperanto | epo_Latn
Estonian | est_Latn
Basque | eus_Latn
Ewe | ewe_Latn
Faroese | fao_Latn
Fijian | fij_Latn
Finnish | fin_Latn
Fon | fon_Latn
French | fra_Latn
Friulian | fur_Latn
Nigerian Fulfulde | fuv_Latn
Scottish Gaelic | gla_Latn
Irish | gle_Latn
Galician | glg_Latn
Guarani | grn_Latn
Gujarati | guj_Gujr
Haitian Creole | hat_Latn
Hausa | hau_Latn
Hebrew | heb_Hebr
Hindi | hin_Deva
Chhattisgarhi | hne_Deva
Croatian | hrv_Latn
Hungarian | hun_Latn
Armenian | hye_Armn
Igbo | ibo_Latn
Ilocano | ilo_Latn
Indonesian | ind_Latn
Icelandic | isl_Latn
Italian | ita_Latn
Javanese | jav_Latn
Japanese | jpn_Jpan
Kabyle | kab_Latn
Jingpho | kac_Latn
Kamba | kam_Latn
Kannada | kan_Knda
Kashmiri (Arabic script) | kas_Arab
Kashmiri (Devanagari script) | kas_Deva
Georgian | kat_Geor
Central Kanuri (Arabic script) | knc_Arab
Central Kanuri (Latin script) | knc_Latn
Kazakh | kaz_Cyrl
Kabiyè | kbp_Latn
Kabuverdianu | kea_Latn
Khmer | khm_Khmr
Kikuyu | kik_Latn
Kinyarwanda | kin_Latn
Kyrgyz | kir_Cyrl
Kimbundu | kmb_Latn
Northern Kurdish | kmr_Latn
Kikongo | kon_Latn
Korean | kor_Hang
Lao | lao_Laoo
Ligurian | lij_Latn
Limburgish | lim_Latn
Lingala | lin_Latn
Lithuanian | lit_Latn
Lombard | lmo_Latn
Latgalian | ltg_Latn
Luxembourgish | ltz_Latn
Luba-Kasai | lua_Latn
Ganda | lug_Latn
Luo | luo_Latn
Mizo | lus_Latn
Standard Latvian | lvs_Latn
Magahi | mag_Deva
Maithili | mai_Deva
Malayalam | mal_Mlym
Marathi | mar_Deva
Minangkabau (Arabic script) | min_Arab
Minangkabau (Latin script) | min_Latn
Macedonian | mkd_Cyrl
Plateau Malagasy | plt_Latn
Maltese | mlt_Latn
Meitei (Bengali script) | mni_Beng
Halh Mongolian | khk_Cyrl
Mossi | mos_Latn
Maori | mri_Latn
Burmese | mya_Mymr
Dutch | nld_Latn
Norwegian Nynorsk | nno_Latn
Norwegian Bokmål | nob_Latn
Nepali | npi_Deva
Northern Sotho | nso_Latn
Nuer | nus_Latn
Nyanja | nya_Latn
Occitan | oci_Latn
West Central Oromo | gaz_Latn
Odia | ory_Orya
Pangasinan | pag_Latn
Eastern Panjabi | pan_Guru
Papiamento | pap_Latn
Western Persian | pes_Arab
Polish | pol_Latn
Portuguese | por_Latn
Dari | prs_Arab
Southern Pashto | pbt_Arab
Ayacucho Quechua | quy_Latn
Romanian | ron_Latn
Rundi | run_Latn
Russian | rus_Cyrl
Sango | sag_Latn
Sanskrit | san_Deva
Santali | sat_Olck
Sicilian | scn_Latn
Shan | shn_Mymr
Sinhala | sin_Sinh
Slovak | slk_Latn
Slovenian | slv_Latn
Samoan | smo_Latn
Shona | sna_Latn
Sindhi | snd_Arab
Somali | som_Latn
Southern Sotho | sot_Latn
Spanish | spa_Latn
Tosk Albanian | als_Latn
Sardinian | srd_Latn
Serbian | srp_Cyrl
Swati | ssw_Latn
Sundanese | sun_Latn
Swedish | swe_Latn
Swahili | swh_Latn
Silesian | szl_Latn
Tamil | tam_Taml
Tatar | tat_Cyrl
Telugu | tel_Telu
Tajik | tgk_Cyrl
Tagalog | tgl_Latn
Thai | tha_Thai
Tigrinya | tir_Ethi
Tamasheq (Latin script) | taq_Latn
Tamasheq (Tifinagh script) | taq_Tfng
Tok Pisin | tpi_Latn
Tswana | tsn_Latn
Tsonga | tso_Latn
Turkmen | tuk_Latn
Tumbuka | tum_Latn
Turkish | tur_Latn
Twi | twi_Latn
Central Atlas Tamazight | tzm_Tfng
Uyghur | uig_Arab
Ukrainian | ukr_Cyrl
Umbundu | umb_Latn
Urdu | urd_Arab
Northern Uzbek | uzn_Latn
Venetian | vec_Latn
Vietnamese | vie_Latn
Waray | war_Latn
Wolof | wol_Latn
Xhosa | xho_Latn
Eastern Yiddish | ydd_Hebr
Yoruba | yor_Latn
Yue Chinese | yue_Hant
Chinese (Simplified) | zho_Hans
Chinese (Traditional) | zho_Hant
Standard Malay | zsm_Latn
Zulu | zul_Latn"""
# Parse the table into {"Full language name": "flores_code"}.
# Each line splits into exactly (name, code); split once per line instead of twice.
lang_codes = dict(line.split(" | ")[:2] for line in lang_codes.split("\n"))
# FLORES "all" config: every row holds aligned translations, one column per
# language named "sentence_<code>". We use the dev split (~1000 rows).
dataset = load_dataset("facebook/flores", "all", trust_remote_code=True)["dev"]

# Map "Full language name" -> list of dev sentences in that language.
data_per_lang = {}
for row in dataset:
    for full_name, code in lang_codes.items():
        # FIX: the original did `data_per_lang.get(code, []) + [row[...]]` while
        # storing under `full_name`; the lookup by `code` always missed, so each
        # language's list was reset to a single sentence on every row.
        data_per_lang.setdefault(full_name, []).append(row[f"sentence_{code}"])
def get_results(tokenizer_name, base_lang, comp_lang, HF_token=""):
    """Compare tokenizer efficiency between two languages.

    Tokenizes the aligned FLORES dev sentences for ``base_lang`` and
    ``comp_lang`` with ``tokenizer_name`` and reports, as Markdown, the
    percentage of extra (or fewer) tokens needed for ``comp_lang``.

    Args:
        tokenizer_name: Hugging Face Hub tokenizer id (e.g. "bert-base-cased").
        base_lang: full language name, a key of ``data_per_lang``.
        comp_lang: full language name, a key of ``data_per_lang``.
        HF_token: optional HF access token for gated/private tokenizers;
            an empty string disables authentication.

    Returns:
        A Markdown string describing the token-count ratio.
    """
    # token=False explicitly disables authentication when no token is given.
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name, token=HF_token if HF_token != "" else False
    )
    base_data = data_per_lang[base_lang]
    comp_data = data_per_lang[comp_lang]
    base_lengths = []
    comp_lengths = []
    for base_text, comp_text in zip(base_data, comp_data):
        # FIX: use ["input_ids"] instead of integer indexing — indexing a
        # BatchEncoding with an int raises for slow (pure-Python) tokenizers,
        # and no tensor conversion is needed just to count tokens.
        base_lengths.append(len(tokenizer(base_text)["input_ids"]))
        comp_lengths.append(len(tokenizer(comp_text)["input_ids"]))
    agg_base = np.mean(base_lengths)
    agg_comp = np.mean(comp_lengths)
    token_ratio = agg_comp / agg_base
    # Express the ratio as a "% more/less tokens" figure relative to 1.0.
    if token_ratio < 1.0:
        adverb = "less"
        token_ratio = (1.0 - token_ratio) * 100
    else:
        adverb = "more"
        token_ratio = (token_ratio - 1.0) * 100
    return f"**You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}.**"
# Gradio UI: pick a tokenizer and two languages, then compare token counts.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(
            """<h1>Language tokenization comparison</h1>
This tool helps you calculate how many more or fewer tokens you need to tokenize text in different languages.
To perform this comparison we use the [FLORES](https://github.com/facebookresearch/flores/tree/main) dataset, developed by Meta, which provides aligned translations between English and low-resource languages.
We first tokenize around 1000 texts in the base language and in the language we want to compare. After that, we average the input_ids length."""
        )
    with gr.Row():
        with gr.Column():
            tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
        with gr.Column():
            # Needed only for gated/private tokenizers on the Hub.
            HF_token = gr.Textbox(label="your HF Token")
    with gr.Row():
        with gr.Column():
            # Distinct labels (originally both said "Languages") so users know
            # which dropdown is the baseline and which is being compared.
            base_lang = gr.Dropdown(list(lang_codes.keys()), label="Base language")
        with gr.Column():
            comp_lang = gr.Dropdown(list(lang_codes.keys()), label="Comparison language")
    with gr.Row():
        btn = gr.Button("Submit")
    out_text = gr.Markdown()
    btn.click(
        get_results,
        inputs=[tokenizer, base_lang, comp_lang, HF_token],
        outputs=[out_text],
        api_name=False,
    )
demo.launch()