Spaces:

RedDev
/

RedDev-nllb-deu-tok-v1

Runtime error

App Files Files Community

RedDev committed on Nov 24, 2023

Commit

dab21ad

•

1 Parent(s): 39a93cd

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -1

app.py CHANGED Viewed

@@ -1,3 +1,117 @@
 import gradio as gr
-gr.load("models/RedDev/nllb-deu-tok-v1").launch()

 import gradio as gr
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, NllbTokenizer, pipeline
+def fix_tokenizer(tokenizer, new_lang='tok_Latn'):
+    """ Add a new language token to the tokenizer vocabulary (this should be done each time after its initialization) """
+    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
+    tokenizer.lang_code_to_id[new_lang] = old_len-1
+    tokenizer.id_to_lang_code[old_len-1] = new_lang
+    # always move "mask" to the last position
+    tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
+    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
+    tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
+    if new_lang not in tokenizer._additional_special_tokens:
+        tokenizer._additional_special_tokens.append(new_lang)
+    # clear the added token encoder; otherwise a new token may end up there by mistake
+    tokenizer.added_tokens_encoder = {}
+    tokenizer.added_tokens_decoder = {}
+model = AutoModelForSeq2SeqLM.from_pretrained("RedDev/nllb-deu-tok-v1")
+tokenizer = NllbTokenizer.from_pretrained("RedDev/nllb-deu-tok-v1")
+fix_tokenizer(tokenizer)
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+LANG_CODES = {
+    "Deutsch":"deu_Latn",
+    "toki pona":"tok_Latn"
+}
+def translate(text, src_lang, tgt_lang, candidates:int):
+    """
+    Translate the text from source lang to target lang
+    """
+    src = LANG_CODES.get(src_lang)
+    tgt = LANG_CODES.get(tgt_lang)
+    tokenizer.src_lang = src
+    tokenizer.tgt_lang = tgt
+    ins = tokenizer(text, return_tensors='pt').to(device)
+    gen_args = {
+            'return_dict_in_generate': True,
+            'output_scores': True,
+            'output_hidden_states': True,
+            'length_penalty': 0.0,  # don't encourage longer or shorter output,
+            'num_return_sequences': candidates,
+            'num_beams':candidates,
+            'forced_bos_token_id': tokenizer.lang_code_to_id[tgt]
+        }
+    outs = model.generate(**{**ins, **gen_args})
+    output = tokenizer.batch_decode(outs.sequences, skip_special_tokens=True)
+    return '\n'.join(output)
+with gr.Blocks() as app:
+    markdown="""
+    # An English / toki pona Neural Machine Translation App!
+    ### toki a! 💬
+    This is an english to toki pona / toki pona to english neural machine translation app.
+    Input your text to translate, a source language and target language, and desired number of return sequences!
+    ### Grammar Regularization
+    An interesting quirk of training a many-to-many translation model is that pseudo-grammar correction
+    can be achieved by translating *from* **language A** *to* **language A**
+    Remember, this can ***approximate*** grammaticality, but it isn't always the best.
+    For example, "mi li toki e toki pona" (Source Language: toki pona & Target Language: toki pona) will result in:
+    - ['mi toki e toki pona.', 'mi toki pona.', 'mi toki e toki pona']
+    - (Thus, the ungrammatical "li" is dropped)
+    ### Model and Data
+    This app utilizes a fine-tuned version of Facebook/Meta AI's M2M100 418M param model.
+    By leveraging the pretrained weights of the massively multilingual M2M100 model,
+    we can jumpstart our transfer learning to accomplish machine translation for toki pona!
+    The model was fine-tuned on the English/toki pona bitexts found at [https://tatoeba.org/](https://tatoeba.org/)
+    ### This app is a work in progress and obviously not all translations will be perfect.
+    In addition to parameter quantity and the hyper-parameters used while training,
+    the *quality of data* found on Tatoeba directly influences the perfomance of projects like this!
+    If you wish to contribute, please add high quality and diverse translations to Tatoeba!
+    """
+    with gr.Row():
+        gr.Markdown(markdown)
+        with gr.Column():
+            input_text = gr.components.Textbox(label="Input Text", value="Raccoons are fascinating creatures, but I prefer opossums.")
+            source_lang = gr.components.Dropdown(label="Source Language", value="Deutsch", choices=list(LANG_CODES.keys()))
+            target_lang = gr.components.Dropdown(label="Target Language", value="toki pona", choices=list(LANG_CODES.keys()))
+            return_seqs = gr.Slider(label="Number of return sequences", value=3, minimum=1, maximum=12, step=1)
+            inputs=[input_text, source_lang, target_lang, return_seqs]
+            outputs = gr.Textbox()
+            translate_btn = gr.Button("Translate! | o ante toki!")
+            translate_btn.click(translate, inputs=inputs, outputs=outputs)
+            gr.Examples(
+                [
+                    ["Hello! How are you?", "English", "toki pona", 3],
+                    ["toki a! ilo pi ante toki ni li pona!", "toki pona", "English",  3],
+                    ["mi li toki e toki pona", "toki pona", "toki pona", 3],
+                ],
+                inputs=inputs
+            )
+app.launch()