# Based on the example code at https://huggingface.co/facebook/m2m100_1.2B
import gradio as gr
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
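
# Load the pre-trained M2M100 1.2B translation model and its tokenizer.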
model = M2M100ForConditionalGeneration.from_pretrained(
    "facebook/m2m100_1.2B")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
this_description = '''
Translation demo using the facebook/m2m100_1.2B pre-trained model.
Languages selectable in this demo:
Burmese (my)
Chinese (zh)
English (en)
Hindi (hi)
Japanese (ja)
Sinhala; Sinhalese (si)
Thai (th)
Vietnamese (vi)
...
'''
# From facebook/m2m100_1.2B
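# Maps language names to the language codes accepted by the M2M100 tokenizer.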
lang_codes = {
"Afrikaans": "af",
"Amharic": "am",
"Arabic": "ar",
"Asturian": "ast",
"Azerbaijani": "az",
"Bashkir": "ba",
"Belarusian": "be",
"Bulgarian": "bg",
"Bengali": "bn",
"Breton": "br",
"Bosnian": "bs",
"Catalan; Valencian": "ca",
"Cebuano": "ceb",
"Czech": "cs",
"Welsh": "cy",
"Danish": "da",
"German": "de",
"Greeek": "el",
"English": "en",
"Spanish": "es",
"Estonian": "et",
"Persian": "fa",
"Fulah": "ff",
"Finnish": "fi",
"French": "fr",
"Western Frisian": "fy",
"Irish": "ga",
"Gaelic; Scottish Gaelic": "gd",
"Galician": "gl",
"Gujarati": "gu",
"Hausa": "ha",
"Hebrew": "he",
"Hindi": "hi",
"Croatian": "hr",
"Haitian; Haitian Creole": "ht",
"Hungarian": "hu",
"Armenian": "hy",
"Indonesian": "id",
"Igbo": "ig",
"Iloko": "ilo",
"Icelandic": "is",
"Italian": "it",
"Japanese": "ja",
"Javanese": "jv",
"Georgian": "ka",
"Kazakh": "kk",
"Central Khmer": "km",
"Kannada": "kn",
"Korean": "ko",
"Luxembourgish; Letzeburgesch": "lb",
"Ganda": "lg",
"Lingala": "ln",
"Lao": "lo",
"Lithuanian": "lt",
"Latvian": "lv",
"Malagasy": "mg",
"Macedonian": "mk",
"Malayalam": "ml",
"Mongolian": "mn",
"Marathi": "mr",
"Malay": "ms",
"Burmese": "my",
"Nepali": "ne",
"Dutch; Flemish": "nl",
"Norwegian": "no",
"Northern Sotho": "ns",
"Occitan": "oc",
"Oriya": "or",
"Panjabi; Punjabi": "pa",
"Polish": "pl",
"Pushto": "ps",
"Portuguese": "pt",
"Romanian; Moldavian; Moldovan": "ro",
"Russian": "ru",
"Sindhi": "sd",
"Sinhala; Sinhalese": "si",
"Slovak": "sk",
"Slovenian": "sl",
"Somali": "so",
"Albanian": "sq",
"Serbian": "sr",
"Swati": "ss",
"Sundanese": "su",
"Swedish": "sv",
"Swahili": "sw",
"Tamil": "ta",
"Thai": "th",
"Tagalog": "tl",
"Tswana": "tn",
"Turkish": "tr",
"Ukrainian": "uk",
"Urdu": "ur",
"Uzbek": "uz",
"Vietnamese": "vi",
"Wolof": "wo",
"Xhosa": "xh",
"Yiddish": "yi",
"Yoruba": "yo",
"Chinese": "zh",
"Zulu": "zu"
}
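
# Translate text from from_lang to to_lang. The source language is set on the
# tokenizer, and the target language is selected by forcing its language token
# as the first generated token (forced_bos_token_id), following the M2M100
# example code.
# Hypothetical usage (output depends on the model):
#   m2m_translate("Xin chào", "Vietnamese", "English")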
def m2m_translate(input_text, from_lang, to_lang):
    tokenizer.src_lang = lang_codes[from_lang]
    encoded_input = tokenizer(input_text, return_tensors="pt")
    generated_tokens = model.generate(
        **encoded_input,
        max_new_tokens=200,
        forced_bos_token_id=tokenizer.get_lang_id(lang_codes[to_lang])
    )
    # Decode the generated ids back into text and return the first (only) result.
    res = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return res[0]
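
# Gradio UI: a text box for the input plus two radio selectors for the source
# and target languages. The radio choices must match keys in lang_codes above.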
iface = gr.Interface(
    fn=m2m_translate,
    title="M2M100 Text Translation",
    description=this_description,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter text", label="Text input"),
        gr.Radio(
            choices=[
                'Burmese',
                'Chinese',
                'English',
                'Hindi',
                'Japanese',
                'Sinhala; Sinhalese',
                'Thai',
                'Vietnamese'
            ],
            value='Vietnamese',
            label='From language'
        ),
        gr.Radio(
            choices=[
                'Burmese',
                'Chinese',
                'English',
                'Hindi',
                'Japanese',
                'Sinhala; Sinhalese',
                'Thai',
                'Vietnamese'
            ],
            value='English',
            label='To language'
        ),
    ],
    outputs="text")
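
# Launch the Gradio web app.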
iface.launch()