File size: 10,463 Bytes
797b28f
 
 
 
 
 
 
 
 
 
 
 
 
e2d7945
797b28f
 
ca36703
 
 
7414d79
797b28f
05e17e2
bd34b3e
7414d79
bd34b3e
797b28f
bd34b3e
 
 
 
 
 
 
 
 
 
 
797b28f
 
 
 
e2d8f81
797b28f
 
7414d79
797b28f
7414d79
797b28f
7414d79
797b28f
b0e8478
797b28f
 
 
9d18675
3f35fcb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d18675
797b28f
 
b7a6eed
7414d79
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import gradio as gr
from transformers import AutoModelForSeq2SeqLM
from transformers import AlbertTokenizer


# The tokenizer and the seq2seq model are loaded from the same pretrained identifier.
_CHECKPOINT = "prajdabre/CreoleM2M"

tokenizer = AlbertTokenizer.from_pretrained(
    _CHECKPOINT,
    do_lower_case=False,
    use_fast=False,
    keep_accents=True,
)

# Switch to evaluation mode; this script only runs generation.
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
model = model.eval()

# Vocabulary ids of the special tokens that are handed to `model.generate`.
bos_id, eos_id, pad_id = (
    tokenizer._convert_token_to_id_with_added_voc(special_token)
    for special_token in ("<s>", "</s>", "<pad>")
)

# Display name -> language code. The display names populate the UI dropdowns
# and the codes are spliced into the "<2xxx>" language tags built in
# `generate`. Insertion order is the order shown to the user.
CREOLE = {
    "Hawaiian Pidgin": "hwc",
    "Saint Lucian Creole": "acf",
    "Belizean Creole": "bzj",
    "Chavacano Creole": "cbk",
    "Seychellois Creole": "crs",
    "Sranan Tongo": "srn",
    "Aukan": "djk",
    "Gullah": "gul",
    "San Andrés–Providencia Creole": "icr",
    "Jamaican Creole": "jam",
    "Mauritian Creole": "mfe",
    "Papiamento": "pap",
    "Pijin": "pis",
    "Tok Pisin": "tpi",
    "Torres Strait Creole": "tcs",
    "Australian Kriol": "rop",
    "Sango": "sag",
    "Saramaccan": "srm",
    "Bislama": "bis",
    "Nigerian Pidgin": "pcm",
    "Sierra Leonean Creole": "kri",
    "Haitian Creole": "hat",
    "Kupang Malay": "mkn",
    "Tetun Dili": "tdt",
    "Malay Baba": "mbf",
    "Kituba": "ktu",
    "English": "eng",
}


def _translate(text, src_code, tgt_code):
    """Run one tokenize -> generate -> decode pass and return the decoded text.

    Args:
        text: Source sentence. It is used as given (no stripping here).
        src_code: Language code appended to the input as a "<2xxx>" tag.
        tgt_code: Language code whose "<2xxx>" token starts the decoder.

    Returns:
        The decoded model output with special tokens removed.
    """
    input_ids = tokenizer(
        f"{text} </s> <2{src_code}>",
        add_special_tokens=False,
        return_tensors="pt",
        padding=True,
    ).input_ids
    output_ids = model.generate(
        input_ids,
        use_cache=True,
        num_beams=1,
        # Cap the output at twice the tokenized input length.
        max_length=int(2 * len(input_ids[0])),
        min_length=1,
        early_stopping=True,
        pad_token_id=pad_id,
        bos_token_id=bos_id,
        eos_token_id=eos_id,
        decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc(
            f"<2{tgt_code}>"),
    )
    return tokenizer.decode(
        output_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )


def generate(input, slang, tlang):
    """Translate `input` from language `slang` into language `tlang`.

    A single pass is used when either side is English or when source and
    target are the same language. Any other pair is translated in two passes,
    first into English and then from English into the target.

    Args:
        input: Text in the source language; surrounding whitespace is
            stripped. (The name shadows the builtin but is kept so existing
            callers are unaffected.)
        slang: Display name of the source language (a key of `CREOLE`).
        tlang: Display name of the target language (a key of `CREOLE`).

    Returns:
        The translated text as a string.

    Raises:
        KeyError: If `slang` or `tlang` is not a key of `CREOLE`.
    """
    src_code = CREOLE[slang]
    tgt_code = CREOLE[tlang]
    text = input.strip()

    # Single-pass cases: English on either side, or identical languages.
    if src_code == tgt_code or "eng" in (src_code, tgt_code):
        return _translate(text, src_code, tgt_code)

    # Creole -> different Creole: pivot through English.
    english = _translate(text, src_code, "eng")
    return _translate(english, "eng", tgt_code)


# Display names offered in both dropdowns, in CREOLE's insertion order.
languages = list(CREOLE)

# Input widgets, in the positional order expected by `generate`'s
# (input, slang, tlang) parameters once they are listed in the interface.
src_language_drop_down = gr.inputs.Dropdown(
    languages,
    type="value",
    default="Hawaiian Pidgin",
    label="Select source language",
)
tgt_language_drop_down = gr.inputs.Dropdown(
    languages,
    type="value",
    default="English",
    label="Select target language",
)
text = gr.inputs.Textbox(
    lines=1,
    placeholder="Enter text here...",
    default="",
    label="Enter text in the source language",
)

# Output widget (the name's spelling is kept because other code refers to it).
text_ouptut = gr.outputs.Textbox(
    type="text",
    label="View translation in the target language",
)

# Comma-separated list of every supported language name.
supported_lang = ", ".join(languages)

# Sample rows passed to gr.Interface as `examples`. Each row is
# [text, source language name, target language name]; the language names are
# keys of CREOLE.
examples = [
['Mé lè sé sòlda-a mawéy pou yo té bat li , Pòl di ofisyé-a ki doubout la-a , “ Ès lwa-a di ou sa bat on jan Ronm si ou pòkò menm fè lodyans pou sa ? ”', "Saint Lucian Creole", "English"],
['Be taem oli fasemgud hem blong wipim hem , Pol i talem long kapten blong olgeta , we i stap stanap long ples ya se , “ ! E ! Mi mi sitisen blong Rom ya . Yufala i no jajem mi yet . ! Olsem wanem yufala i wantem wipim mi ? ”', "Bislama", "English"],
['Wail di soalja dehn mi-di tai op Paal fi beet ahn , Paal aks wan a di aafisa dehn weh mi-di stan op kloas tu ahn , “ Tel mi , ih leegal fi beet wahn Roaman sitizn bifoa unu chrai ahn da koat ? ”', "Belizean Creole", "English"],
['Mientras ta amarra sila con Pablo para latiga , ya habla le con el capitan quien talla parao , “ Tiene ba uste el derecho para latiga con un ciudadano Romano que nuay pa pasa investigacion de algun crimen ? ”', "Chavacano Creole", "English"],
['Kan zot tin anmar li pour li ganny fwete , Pol ti dir avek sa zofisye ki ti la , “ Eski ou annan drwa fwet en sitwayen Romen ki pan ganny zize ? ”', "Seychellois Creole", "English"],
['Den tei en poti fu leli buba . Ma a piki a ofisii di mu meke den du dati taki : “ U tei mi enke foluku fu Loma Foto fu wipi ondoosuku ! Ma kownu anda weiti taki : Na lanti fu kuutu mu ondoosuku foluku fu Loma Foto . A sowtu wipi ya a ganda mindii noiti mu pasa . ’ ”', "Aukan", "English"],
['Bot wen dey tie Paul op an scretch um out fa beat um , Paul taak ta de offisa wa beena stanop dey . Paul aks um say , “ De law ain tell oona dat oona kin beat a Roman citizen wen nobody ain eben jedge um , needa find out dat e done sompin bad , ainty dough ? ”', "Gullah", "English"],
['Wen dey wen stretch him out fo whip him real hard , Paul wen tell da captain dat stay dea , “ Dis okay in da rules fo da Rome peopo ? fo you fo whip one guy dat get da same rights jalike da Rome peopo ? even one guy dat neva do notting wrong ? ”', "Hawaiian Pidgin", "English"],
['Wail di suoldya dehn wende tai op Paul fi biit im , Paul aks wan a di aafisa weh wende stan op gens im , “ Tel mi , sah , ih liigal fi biit wan Roman sitizn bifuor unu trai im dah kuot ? ”', "San Andrés–Providencia Creole", "English"],
['Afta dem tai im op an chrech im out fi biit im , Paal aks di ed fi onjrid suoja we did tan op de , “ Di Laa gi yu no rait fi biit mi , wan man we kom fram Ruom , wen yu no iivn kyari mi go a kuot an se mi gilti fi notn ? ”', "Jamaican Creole", "English"],
['Bɔt wɛn dɛn want bit am , dɛn tay am ; na in Pɔl aks di soja man dɛn edman we bin tinap de se , “ Di lɔ tɛl una se una kin bit pɔsin we na Roman wɛn una nɔ jɔj am yet ? ”', "Sierra Leonean Creole", "English"],
['Me letan zot fini atas li pou kapav fwet li , Pol dir ofisie ki ti la , “ Zot ena drwa fwet enn sitwayin Romin san mem ki zot ziz li ? ”', "Mauritian Creole", "English"],
['Ma ora nan a rèk su kurpa pa suté ku zuip , Pablo a bisa e ofisial di ehérsito pará einan : “ Boso tin mag di suta un hende ku ta siudadano romano sin ku e ta kondená ? ”', "Papiamento", "English"],
['Wen dem don put am for groun mak dem start to flog am , Pol kon ask di soja wey stand near am , " E dey rite mak una flog pesin wey bi Roman citizin , wen dem neva joj en kase ? "', "Nigerian Pidgin", "English"],
['Bat taem olketa taengem hand bilong hem long post for whipim hem , Paul sei olsem long bigman bilong army wea standap long there : “ Hao , hem stret for iufala whipim wanfala man bilong Rome wea iufala no kotem hem yet ? ”', "Pijin", "English"],
['en wen deibin taiyimap Pol blanga beldim im , imbin tok langa det boswan solja hubin jandap deya wansaid langa im . Imbin tok , “ Yumob nomo lau beldim mi , dumaji mi garram det rait seimwei laik ol yumob Roman pipul , en ai nomo bin abum kotkeis yet . ”', "Australian Kriol", "English"],
['Me tongana ala leke lo ti tene a pika lo na zaza , Paul atene na turugu ti kota kamba so ayeke luti na ndo so : “ Ndia amû lege na ALA ti pika na zaza mbeni koli so ayeke Romain na so a dë ngbanga na li ti lo pëpe ? ”', "Sango", "English"],
['Hën de tjëën go seeka tai fu de hupi . Hën Paulosu hakisi di kabiteni u sodati taanputaanpu dë taa : “ Unfa di wëti dë ? Un sa hupi wan goon mii u Loomë ufö un kuutu soni fëën ö ? ”', "Saramaccan", "English"],
['Ma di den poti Paulus didon langalanga fu wipi en , dan a taigi a legre-ofsiri di ben e tanapu drape : „ A fiti taki unu e wipi wan Romesma sondro fu krutu en ? ”', "Sranan Tongo", "English"],
['Bat wen dempla i bin mekpas Pol so dempla ken ploke em , Pol i bin spik po da sekan amiopisa uda bin stanap klostu wea em . Pol i bin spik , ‘ Ei yu ! Yu lau po ploke man uda gad rait wase man prom Rom , bipo yupla teke em po kot a ? ’', "Torres Strait Creole", "English"],
['Tasol taim ol i apim 2-pela han bilong en na pasim bilong wipim em , Pol i tokim ofisa bilong ami i sanap klostu olsem : “ I stret yupela i wipim wanpela man Rom taim em i no bin sanap yet long kot ? ”', "Tok Pisin", "English"],
['Men , lè yo fin mare Pòl pou yo bat li , li di ofisye ki te la a : “ Èske NOU gen dwa bat yon sitwayen women ki pa kondane ? ”', "Haitian Creole", "English"],
['Waktu dong ika sang Paulus ko mau firuk sang dia , ju dia bale tanya sang itu tantara , bilang , “ Iko pamarenta Roma pung atoran , mana yang batúl ? Kalo satu orang ada pung hak warga Roma , ais dia dapa parkara , bosong musti bekin karmana sang dia ? Bosong papoko lebe dolo sang dia , ko , bosong pareksa lebe dolo ? ”', "Kupang Malay", "English"],
['Maibé kuandu sira kesi tiha nia atu baku nia ho xikote , Paulo dehan ba kapitaun tropa nian neʼebé hamriik besik : “ Tuir lei , imi bele baku ema Roma ida maski seidauk tesi lia ba nia ka lae ? ”', "Tetun Dili", "English"],
['Bila dia-orang sudah ikatkan dia dngan tali kulit , Paulus kata sama itu hulubalang yang berdiri dkat situ , " Kalau satu anak Rom blum kna hukum , ada-kah patut angkau ssahkan dia ? "', "Malay Baba", "English"],
['Bu bau imene kukanga yandi sambu na kubula yandi fimbe , Paulu tubaka kwa nkuluntu ya telemaka kuna : Nsiku pesa nzila kubula muntu ya Loma fimbu ya imene kufunduswa ve ?', "Kituba", "English"],
['But when they had stretched him out for the whipping , Paul said to the army officer standing there : “ Is it lawful for you to scourge a Roman who has not been condemned ? ”', "English", "Hawaiian Pidgin"]
]

# Build the description shown under the title, then wire `generate` to the
# widgets and start the app with request queueing enabled.
_description = (
    "A system to translate to, from and between Creoles (and English). "
    "Currently the model supports " + supported_lang
)

iface = gr.Interface(
    fn=generate,
    inputs=[text, src_language_drop_down, tgt_language_drop_down],
    outputs=text_ouptut,
    title="CreoleM2M System",
    description=_description,
    examples=examples,
)
iface.launch(enable_queue=True)