RedDev committed
Commit
dab21ad
1 Parent(s): 39a93cd

Update app.py

Files changed (1)
  1. app.py +115 -1
app.py CHANGED
@@ -1,3 +1,117 @@
  import gradio as gr
 
- gr.load("models/RedDev/nllb-deu-tok-v1").launch()
+ from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
+ import torch
+
+ def fix_tokenizer(tokenizer, new_lang='tok_Latn'):
+     """ Add a new language token to the tokenizer vocabulary (this should be done each time after its initialization) """
+     old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
+     tokenizer.lang_code_to_id[new_lang] = old_len - 1
+     tokenizer.id_to_lang_code[old_len - 1] = new_lang
+     # always move "mask" to the last position
+     tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
+
+     tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
+     tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
+     if new_lang not in tokenizer._additional_special_tokens:
+         tokenizer._additional_special_tokens.append(new_lang)
+     # clear the added token encoder; otherwise a new token may end up there by mistake
+     tokenizer.added_tokens_encoder = {}
+     tokenizer.added_tokens_decoder = {}
+
+ model = AutoModelForSeq2SeqLM.from_pretrained("RedDev/nllb-deu-tok-v1")
+ tokenizer = NllbTokenizer.from_pretrained("RedDev/nllb-deu-tok-v1")
+ fix_tokenizer(tokenizer)
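+ # rough sanity check (assumes the slow, sentencepiece-based NllbTokenizer,
+ # whose internals fix_tokenizer mutates): the new code should round-trip
+ # assert tokenizer.convert_ids_to_tokens(tokenizer.lang_code_to_id['tok_Latn']) == 'tok_Latn'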
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ model.to(device)  # inputs are moved to this device in translate(), so the model must live there too
+ LANG_CODES = {
+     "Deutsch": "deu_Latn",
+     "toki pona": "tok_Latn"
+ }
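+ # NLLB-200 style language codes: ISO 639-3 plus a script tag (e.g. deu_Latn);
+ # tok_Latn is the custom toki pona code registered by fix_tokenizer above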
+
+ def translate(text, src_lang, tgt_lang, candidates: int):
+     """
+     Translate the text from the source language to the target language.
+     """
+     src = LANG_CODES.get(src_lang)
+     tgt = LANG_CODES.get(tgt_lang)
+
+     tokenizer.src_lang = src
+     tokenizer.tgt_lang = tgt
+
+     ins = tokenizer(text, return_tensors='pt').to(device)
+
+     gen_args = {
+         'return_dict_in_generate': True,
+         'output_scores': True,
+         'output_hidden_states': True,
+         'length_penalty': 0.0,  # don't encourage longer or shorter output
+         'num_return_sequences': candidates,
+         'num_beams': candidates,
+         'forced_bos_token_id': tokenizer.lang_code_to_id[tgt]
+     }
+
+     outs = model.generate(**{**ins, **gen_args})
+     output = tokenizer.batch_decode(outs.sequences, skip_special_tokens=True)
+
+     return '\n'.join(output)
+
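+ # hypothetical usage (requires the model weights to be available locally):
+ #   translate("mi li toki e toki pona", "toki pona", "toki pona", 3)
+ # returns up to 3 beam-search candidates, one per line
+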
+ with gr.Blocks() as app:
+     markdown = """
+ # A German / toki pona Neural Machine Translation App!
+
+ ### toki a! 💬
+
+ This is a German to toki pona / toki pona to German neural machine translation app.
+
+ Input the text to translate, a source language, a target language, and the desired number of return sequences!
+
+ ### Grammar Regularization
+ An interesting quirk of training a many-to-many translation model is that pseudo-grammar correction
+ can be achieved by translating *from* **language A** *to* **language A**.
+
+ Remember, this can ***approximate*** grammaticality, but it isn't always the best.
+
+ For example, "mi li toki e toki pona" (Source Language: toki pona & Target Language: toki pona) will result in:
+ - ['mi toki e toki pona.', 'mi toki pona.', 'mi toki e toki pona']
+ - (Thus, the ungrammatical "li" is dropped)
+
+ ### Model and Data
+ This app uses a fine-tuned version of Meta AI's NLLB multilingual translation model.
+
+ By leveraging the pretrained weights of the massively multilingual NLLB model,
+ we can jumpstart our transfer learning to accomplish machine translation for toki pona!
+
+ The model was fine-tuned on the German/toki pona bitexts found at [https://tatoeba.org/](https://tatoeba.org/)
+
+ ### This app is a work in progress, and obviously not all translations will be perfect.
+ In addition to parameter count and the hyperparameters used while training,
+ the *quality of data* found on Tatoeba directly influences the performance of projects like this!
+
+ If you wish to contribute, please add high-quality and diverse translations to Tatoeba!
+ """
+
+     with gr.Row():
+         gr.Markdown(markdown)
+         with gr.Column():
+             input_text = gr.components.Textbox(label="Input Text", value="Waschbären sind faszinierende Tiere, aber ich mag Opossums lieber.")
+             source_lang = gr.components.Dropdown(label="Source Language", value="Deutsch", choices=list(LANG_CODES.keys()))
+             target_lang = gr.components.Dropdown(label="Target Language", value="toki pona", choices=list(LANG_CODES.keys()))
+             return_seqs = gr.Slider(label="Number of return sequences", value=3, minimum=1, maximum=12, step=1)
+
+     inputs = [input_text, source_lang, target_lang, return_seqs]
+     outputs = gr.Textbox()
+
+     translate_btn = gr.Button("Translate! | o ante toki!")
+     translate_btn.click(translate, inputs=inputs, outputs=outputs)
+
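+     # clicking an example row below prefills the inputs; press Translate to run it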
+     gr.Examples(
+         [
+             ["Hallo! Wie geht es dir?", "Deutsch", "toki pona", 3],
+             ["toki a! ilo pi ante toki ni li pona!", "toki pona", "Deutsch", 3],
+             ["mi li toki e toki pona", "toki pona", "toki pona", 3],
+         ],
+         inputs=inputs
+     )
+
+ app.launch()