File size: 5,771 Bytes
ce1c4dd
 
 
 
 
 
 
6318241
ce1c4dd
 
 
 
 
8210be8
ce1c4dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8210be8
 
ce1c4dd
 
 
 
 
 
 
 
 
2e7cfe8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340a52c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e7cfe8
 
 
 
 
 
 
 
 
 
 
ce1c4dd
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

# Choose the device first so the model can be placed on it as soon as it is
# loaded. translate() moves the tokenized inputs to this same device, so the
# model has to live there too or generate() fails with a device mismatch.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# this model was loaded from https://hf.co/models
model = AutoModelForSeq2SeqLM.from_pretrained("Jayyydyyy/m2m100_418m_tokipona").to(device)
# The tokenizer comes from the base M2M100 checkpoint rather than the
# fine-tuned repository.
tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")

# Maps the language names shown in the UI to the tokenizer's language codes.
# NOTE(review): "tl" is the Tagalog code in stock M2M100; presumably the
# fine-tuned checkpoint repurposes that slot for toki pona -- confirm against
# the model card before changing it.
LANG_CODES = {
    "English":"en",
    "toki pona":"tl"
}

def translate(text, src_lang, tgt_lang, candidates:int):
    """
    Translate the text from source lang to target lang

    Args:
        text: The text to translate.
        src_lang: Display name of the source language; must be a key of
            LANG_CODES.
        tgt_lang: Display name of the target language; must be a key of
            LANG_CODES.
        candidates: Number of translations to return. The same value is used
            as the beam width.

    Returns:
        The decoded candidate translations as a list of strings.

    Raises:
        gr.Error: If src_lang or tgt_lang is not a key of LANG_CODES (for
            example when nothing was selected in the UI).
    """

    # Fail early with a readable message instead of letting a None language
    # code surface as an opaque KeyError from the tokenizer.
    for role, name in (("source", src_lang), ("target", tgt_lang)):
        if name not in LANG_CODES:
            raise gr.Error(
                f"Please choose a valid {role} language "
                f"(one of: {', '.join(LANG_CODES)})."
            )

    src = LANG_CODES[src_lang]
    tgt = LANG_CODES[tgt_lang]

    # UI widgets may hand the count over as a float; generate() needs an int.
    num_candidates = int(candidates)

    tokenizer.src_lang = src
    tokenizer.tgt_lang = tgt

    ins = tokenizer(text, return_tensors='pt').to(device)

    # Scores and hidden states are not requested: only the generated
    # sequences are read below, so collecting them would just cost memory.
    gen_args = {
        'return_dict_in_generate': True,
        'length_penalty': 0.0,  # don't encourage longer or shorter output
        'num_return_sequences': num_candidates,
        'num_beams': num_candidates,
        # Force the first generated token to be the target-language token.
        'forced_bos_token_id': tokenizer.lang_code_to_id[tgt],
    }

    outs = model.generate(**ins, **gen_args)
    output = tokenizer.batch_decode(outs.sequences, skip_special_tokens=True)

    return output

# app = gr.Interface(
#     fn=translate,
#     inputs=[
#         gr.components.Textbox(label="Text"),
#         gr.components.Dropdown(label="Source Language", choices=list(LANG_CODES.keys())),
#         gr.components.Dropdown(label="Target Language", choices=list(LANG_CODES.keys())),
#         gr.Slider(label="Number of return sequences", value=3, minimum=1, maximum=12, step=1)
#     ],
#     outputs=["text"],
#     examples=[
#         ["Welcome to my translation app.", "English", "toki pona", 3],
#         ["Its not always perfect, but its pretty okay!", "English", "toki pona", 3],
#         ["ilo pi ante toki ni li pona a!", "toki pona", "English", 3],
#         ["kijetesantakalu li pona", "toki pona", "English", 3],
#         ["mi li toki e toki pona", "toki pona", "toki pona", 3]
#     ],
#     cache_examples=False,
#     article="""
#     # A simple English / toki pona Neural Machine Translation App!
    
#     ### toki a! 💬

#     This is a simple english to toki pona / toki pona to english neural machine translation app.

#     Input your text to translate, a source language and target language, and desired number of return sequences! 

#     ### Grammaticality / Regularization
#     English -> English and/or toki pona -> toki pona will result in some form of regularization. 
    
#     This can approximate grammaticality, but it isn't always the best.
    
#     For example, "mi li toki e toki pona" [src: toki pona, tgt: toki pona] will result in ['mi toki e toki pona.', 'mi toki pona.', 'mi toki e toki pona']
#     (Thus, the ungrammatical "li" is dropped)

#     ### Model and Data
#     This app utilizes a fine-tuned version of Facebook/Meta AI's M2M100 418M param model. 
    
#     By leveraging the pretrained weights of the massively multilingual M2M100 model, 
#     we can jumpstart our transfer learning to accomplish machine translation for toki pona!
    
#     The model was fine-tuned on the English/toki pona bitexts found at https://tatoeba.org/
    
#     ### This app is a work in progress and obviously not all translations will be perfect. 
#     In addition to parameter quantity and the hyper-parameters used while training, 
#     the *quality of data* found on Tatoeba directly influences the perfomance of projects like this! 

#     If you wish to contribute, please simply add high quality and diverse translations to Tatoeba!
#     """,
#     title="English / toki pona Translation"
# )

# Build the UI with Blocks: a description, the four inputs expected by
# translate(), a trigger button, and a textbox that shows the result.
with gr.Blocks() as app:
    gr.Markdown("""
    # A simple English / toki pona Neural Machine Translation App!
    
    ### toki a! 💬

    This is a simple english to toki pona / toki pona to english neural machine translation app.

    Input your text to translate, a source language and target language, and desired number of return sequences! 

    ### Grammaticality / Regularization
    English -> English and/or toki pona -> toki pona will result in some form of regularization. 
    
    This can approximate grammaticality, but it isn't always the best.
    
    For example, "mi li toki e toki pona" [src: toki pona, tgt: toki pona] will result in ['mi toki e toki pona.', 'mi toki pona.', 'mi toki e toki pona']
    (Thus, the ungrammatical "li" is dropped)

    ### Model and Data
    This app utilizes a fine-tuned version of Facebook/Meta AI's M2M100 418M param model. 
    
    By leveraging the pretrained weights of the massively multilingual M2M100 model, 
    we can jumpstart our transfer learning to accomplish machine translation for toki pona!
    
    The model was fine-tuned on the English/toki pona bitexts found at https://tatoeba.org/
    
    ### This app is a work in progress and obviously not all translations will be perfect. 
    In addition to parameter quantity and the hyper-parameters used while training, 
    the *quality of data* found on Tatoeba directly influences the perfomance of projects like this! 

    If you wish to contribute, please simply add high quality and diverse translations to Tatoeba!
    """
    )

    language_choices = list(LANG_CODES)

    # Order matters: these map positionally onto translate(text, src_lang,
    # tgt_lang, candidates). The dropdowns get defaults so that clicking the
    # button before choosing a language does not send None to translate().
    inputs = [
        gr.components.Textbox(label="Text"),
        gr.components.Dropdown(label="Source Language", choices=language_choices, value="English"),
        gr.components.Dropdown(label="Target Language", choices=language_choices, value="toki pona"),
        gr.Slider(label="Number of return sequences", value=3, minimum=1, maximum=12, step=1),
    ]

    # Event listeners inside Blocks need component instances as outputs; the
    # "text" string shortcut is only understood by gr.Interface.
    translations = gr.components.Textbox(label="Translations")

    translate_btn = gr.Button("Translate! | o ante toki!")
    translate_btn.click(translate, inputs=inputs, outputs=[translations])

app.launch()