small100 / app.py
alirezamsh's picture
Update app.py
f4c5c58
raw
history blame
3.19 kB
import gradio as gr
import os
# Best-effort runtime install of the packages this script needs. It runs
# before the `transformers` import below so the package is importable even if
# it was not preinstalled. Invoking pip as `<this interpreter> -m pip` with an
# argument list (no shell) guarantees the packages land in the environment
# that is actually running this script, rather than whichever `pip` is first
# on PATH.
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "pip", "install",
     "transformers", "sentencepiece", "torch"],
    check=False,  # keep the original behaviour: a failed install is not fatal here
)
from transformers import M2M100ForConditionalGeneration
from tokenization_small100 import SMALL100Tokenizer
# Comma-separated display labels for the selectable target languages, each of
# the form "Name (code)". The code inside the trailing parentheses is what
# small100_tr extracts and hands to the tokenizer, so every label must end
# with "(<code>)". Names that contain alternatives use ';' (never ',') so the
# comma split below stays unambiguous.
langs = """Afrikaans (af), Amharic (am), Arabic (ar), Asturian (ast), Azerbaijani (az), Bashkir (ba), Belarusian (be), Bulgarian (bg), Bengali (bn), Breton (br), Bosnian (bs), Catalan; Valencian (ca), Cebuano (ceb), Czech (cs), Welsh (cy), Danish (da), German (de), Greek (el), English (en), Spanish (es), Estonian (et), Persian (fa), Fulah (ff), Finnish (fi), French (fr), Western Frisian (fy), Irish (ga), Gaelic; Scottish Gaelic (gd), Galician (gl), Gujarati (gu), Hausa (ha), Hebrew (he), Hindi (hi), Croatian (hr), Haitian; Haitian Creole (ht), Hungarian (hu), Armenian (hy), Indonesian (id), Igbo (ig), Iloko (ilo), Icelandic (is), Italian (it), Japanese (ja), Javanese (jv), Georgian (ka), Kazakh (kk), Central Khmer (km), Kannada (kn),
Korean (ko), Luxembourgish; Letzeburgesch (lb), Ganda (lg), Lingala (ln), Lao (lo), Lithuanian (lt), Latvian (lv), Malagasy (mg), Macedonian (mk), Malayalam (ml), Mongolian (mn), Marathi (mr), Malay (ms), Burmese (my), Nepali (ne), Dutch; Flemish (nl), Norwegian (no), Northern Sotho (ns), Occitan (post 1500) (oc), Oriya (or), Panjabi; Punjabi (pa), Polish (pl), Pushto; Pashto (ps), Portuguese (pt), Romanian; Moldavian; Moldovan (ro), Russian (ru), Sindhi (sd), Sinhala; Sinhalese (si), Slovak (sk),
Slovenian (sl), Somali (so), Albanian (sq), Serbian (sr), Swati (ss), Sundanese (su), Swedish (sv), Swahili (sw), Tamil (ta), Thai (th), Tagalog (tl), Tswana (tn),
Turkish (tr), Ukrainian (uk), Urdu (ur), Uzbek (uz), Vietnamese (vi), Wolof (wo), Xhosa (xh), Yiddish (yi), Yoruba (yo), Chinese (zh), Zulu (zu)"""

# One label per entry. strip() removes the spaces and the line breaks that
# follow some commas in the literal above; empty fragments (e.g. from a stray
# trailing comma) are skipped so they never show up as blank choices.
lang_list = [label.strip() for label in langs.split(",") if label.strip()]
# The model weights and the tokenizer are loaded from the same pretrained
# checkpoint identifier, so it is named once here.
_CHECKPOINT = "alirezamsh/small100"
tokenizer = SMALL100Tokenizer.from_pretrained(_CHECKPOINT)
model = M2M100ForConditionalGeneration.from_pretrained(_CHECKPOINT)
# Markdown blurb handed to the interface as its `description`. Assembled from
# adjacent string literals; the single "\n" separates the citation line from
# the summary line.
description = (
    "This is a demo for the paper "
    "[*SMaLL-100: Introducing Shallow Multilingual Machine Translation Model "
    "for Low-Resource Languages*](https://arxiv.org/abs/2210.11621) "
    "by Alireza Mohammadshahi, Vassilina Nikoulina, Alexandre Berard, "
    "Caroline Brun, James Henderson, Laurent Besacier\n"
    "In this paper, they propose a compact and shallow massively multilingual "
    "MT model, and achieve competitive result with M2M-100, while being super "
    "smaller and faster. More details are provided "
    "[here](https://huggingface.co/alirezamsh/small100)"
)
def small100_tr(lang, text):
    """Translate *text* into the language selected by the label *lang*.

    Args:
        lang: A display label of the form ``"Name (code)"``, e.g.
            ``"French (fr)"``. Only the code inside the trailing parentheses
            is used.
        text: The source text to translate.

    Returns:
        The decoded translation as a string, or ``""`` when *text* is empty
        or whitespace-only (the model is not invoked in that case).

    Raises:
        ValueError: If no target language was supplied.
    """
    # Nothing to translate: answer immediately instead of running generation
    # on an input that contains no source tokens.
    if not text or not text.strip():
        return ""
    # An unselected language would otherwise fail with an opaque
    # AttributeError on the string parsing below.
    if not lang:
        raise ValueError("A target language must be selected.")

    # Take whatever sits inside the last "(...)" of the label. Splitting on
    # the final "(" keeps labels with several parenthesised parts working,
    # e.g. "Occitan (post 1500) (oc)" -> "oc".
    code = lang.rsplit("(", 1)[-1].rstrip(")").strip()

    # NOTE: this mutates the shared module-level tokenizer; the target
    # language is state on the tokenizer rather than a per-call argument.
    tokenizer.tgt_lang = code
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    # A single input is encoded, so the first decoded sequence is the result.
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
# One pre-filled example row, in input order: [target language label, source text].
examples = [["French (fr)", "Life is like a box of chocolates."]]
output_text = gr.Textbox()

# Components come from the top-level gradio namespace; the `gr.inputs` /
# `gr.outputs` namespaces are deprecated. The source-text input is an explicit
# Textbox labelled "Source Text": a bare string in `inputs` is interpreted as
# a component shortcut name, not as a label.
demo = gr.Interface(
    fn=small100_tr,
    inputs=[
        gr.Dropdown(choices=lang_list, label=" Target Language"),
        gr.Textbox(label="Source Text"),
    ],
    outputs=output_text,
    title="SMaLL100: Translate Between 100 languages much faster",
    description=description,
    examples=examples,
)
demo.launch()