File size: 4,686 Bytes
d510d4c
 
 
 
 
 
 
22ca3fa
 
 
d510d4c
799254d
d510d4c
 
 
 
799254d
 
d510d4c
799254d
 
22ca3fa
d510d4c
 
22ca3fa
 
 
 
 
 
 
 
d510d4c
 
 
22ca3fa
 
ed6c2e2
 
 
 
 
22ca3fa
ed6c2e2
 
22ca3fa
 
 
 
 
 
 
 
ed6c2e2
 
 
 
22ca3fa
ed6c2e2
d510d4c
 
22ca3fa
d510d4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22ca3fa
 
 
 
799254d
 
 
22ca3fa
 
d510d4c
 
 
 
 
 
 
 
22ca3fa
d510d4c
 
22ca3fa
d510d4c
 
22ca3fa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

# Module-level setup: downloads both checkpoints from the Hugging Face Hub on
# first run, then moves the model to GPU when one is available.
# Fine-tuned M2M100-418M checkpoint for English <-> toki pona translation.
model = AutoModelForSeq2SeqLM.from_pretrained("Jayyydyyy/m2m100_418m_tokipona")
# Tokenizer comes from the *base* M2M100 model, not the fine-tune — the
# fine-tune reuses the original vocabulary.
tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)
# NOTE(review): "tl" is the ISO 639-1 code for Tagalog; presumably the
# fine-tune repurposed that language slot for toki pona — confirm against the
# model card before relying on it.
LANG_CODES = {"English": "en", "toki pona": "tl"}


def translate(text):
    """
    Round-trip translate the input text: English -> toki pona -> English.

    Parameters
    ----------
    text : str
        English source text.

    Returns
    -------
    str
        The English text obtained by translating to toki pona and back,
        with multiple decoded sequences joined by newlines.

    NOTE(review): the intermediate toki pona translation is discarded and the
    function returns the English round-trip. The commented-out src/tgt
    parameters in the original suggest this may have been intended to return
    the toki pona output instead — confirm the desired direction.
    """
    intermediate = _generate(text, src="en", tgt="tl")
    return _generate(intermediate, src="tl", tgt="en")


def _generate(text, src, tgt):
    """Run one greedy translation pass from *src* to *tgt* language code."""
    tokenizer.src_lang = src
    tokenizer.tgt_lang = tgt
    ins = tokenizer(text, return_tensors="pt").to(device)

    gen_args = {
        "return_dict_in_generate": True,  # we access outs.sequences below
        "length_penalty": 0.0,  # don't encourage longer or shorter output
        "num_return_sequences": 1,
        "num_beams": 1,
        # Force the first generated token to be the target-language tag.
        "forced_bos_token_id": tokenizer.lang_code_to_id[tgt],
    }

    outs = model.generate(**{**ins, **gen_args})
    decoded = tokenizer.batch_decode(outs.sequences, skip_special_tokens=True)
    return "\n".join(decoded)


# Gradio UI: a markdown description next to a single-textbox translate form.
with gr.Blocks() as app:
    markdown = """
    # An English / toki pona Neural Machine Translation App!
    
    ### toki a! 💬

    This is an english to toki pona / toki pona to english neural machine translation app.

    Input your text to translate! 

    ### Grammar Regularization
    An interesting quirk of training a many-to-many translation model is that pseudo-grammar correction 
    can be achieved by translating *from* **language A** *to* **language A**
    
    Remember, this can ***approximate*** grammaticality, but it isn't always the best.
    
    For example, "mi li toki e toki pona" (Source Language: toki pona & Target Language: toki pona) will result in: 
    - ['mi toki e toki pona.', 'mi toki pona.', 'mi toki e toki pona']
    - (Thus, the ungrammatical "li" is dropped)

    ### Model and Data
    This app utilizes a fine-tuned version of Facebook/Meta AI's M2M100 418M param model. 
    
    By leveraging the pretrained weights of the massively multilingual M2M100 model, 
    we can jumpstart our transfer learning to accomplish machine translation for toki pona!
    
    The model was fine-tuned on the English/toki pona bitexts found at [https://tatoeba.org/](https://tatoeba.org/)
    
    ### This app is a work in progress and obviously not all translations will be perfect. 
    In addition to parameter quantity and the hyper-parameters used while training, 
    the *quality of data* found on Tatoeba directly influences the performance of projects like this! 

    If you wish to contribute, please add high quality and diverse translations to Tatoeba!
    """

    with gr.Row():
        gr.Markdown(markdown)
        with gr.Column():
            input_text = gr.components.Textbox(
                label="Input Text",
                value="Raccoons are fascinating creatures, but I prefer opossums.",
            )
            # TODO: restore source/target-language dropdowns and a
            # num-return-sequences slider once translate() accepts them again.

            inputs = [input_text]
            outputs = gr.Textbox()

            translate_btn = gr.Button("Translate! | o ante toki!")
            translate_btn.click(translate, inputs=inputs, outputs=outputs)

            # Each example row must match the number of input components
            # (exactly one textbox), so only the text field is provided.
            gr.Examples(
                [
                    ["Hello! How are you?"],
                    ["toki a! ilo pi ante toki ni li pona!"],
                    ["mi li toki e toki pona"],
                ],
                inputs=inputs,
            )

app.launch()