File size: 2,368 Bytes
fbd8f89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0db4526
fbd8f89
 
 
3d23b84
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Huggingface Models: https://huggingface.co/models
Transformer dependency: https://pypi.org/project/transformers/
"""

# Environment variables are set here, ahead of the library imports further
# down — presumably so those libraries see them when they are first loaded.
import os
# NOTE(review): an empty CURL_CA_BUNDLE looks intended to bypass TLS
# certificate verification for HTTP downloads — confirm this is really needed,
# since it weakens transport security.
os.environ['CURL_CA_BUNDLE'] = ''
# os.environ['CUDA_VISIBLE_DEVICES'] = ''
# Presumably read by TensorFlow to allocate GPU memory on demand — TODO confirm.
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

import malaya
import ctranslate2
from transformers import AutoTokenizer
import gradio as gr
import logging
import os

# Configure the root logger to emit INFO-level records and above.
logging.basicConfig(level=logging.INFO)

# Label shown in the UI dropdown -> language code used internally.
TO_LANG = dict(Malay='ms', English='en')

# Dropdown choices, in declaration order.
TO_LANG_KEYS = [*TO_LANG]

# Module-level slot for the translation model; starts out unset.
model = None

# Language code -> language name as spelled inside the Malay prompt prefix.
map_lang = dict(
    [
        ('en', 'Inggeris'),
        ('ms', 'Melayu'),
        ('pasar ms', 'pasar Melayu'),
        ('manglish', 'Manglish'),
    ]
)

# Hugging Face checkpoint used for both the CTranslate2 conversion and the
# tokenizer, kept in one place so the two cannot drift apart.
PRETRAINED_MODEL = 'mesolitica/translation-t5-small-standard-bahasa-cased-v2'
# Output directory for the converted, int8-quantized CTranslate2 model.
CT2_MODEL_DIR = 't5-small-ct2'

converter = ctranslate2.converters.TransformersConverter(PRETRAINED_MODEL)
if os.path.exists(CT2_MODEL_DIR):
    # A previous run already produced the output directory; reuse it instead
    # of relying on convert() failing and the failure being swallowed.
    logging.info('Reusing existing converted model in %s', CT2_MODEL_DIR)
else:
    try:
        converter.convert(CT2_MODEL_DIR, quantization='int8')
    except Exception:
        # Still best effort (start-up continues), but the failure is now
        # logged with its traceback, and KeyboardInterrupt / SystemExit are
        # no longer swallowed as they were with `except BaseException`.
        logging.exception(
            'Converting %s to CTranslate2 format failed', PRETRAINED_MODEL
        )

# Slow (non-"fast") tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL, use_fast=False)


def translate(text, to_lang):
    """Translate ``text`` into the language selected by ``to_lang``.

    Args:
        text: Source text to translate.
        to_lang: A key of ``TO_LANG`` (``'Malay'`` or ``'English'``).

    Returns:
        The decoded first hypothesis produced for ``text``, as a string.

    Raises:
        KeyError: If ``to_lang`` is not a key of ``TO_LANG``.
    """
    global model
    to_lang = TO_LANG[to_lang]
    if model is None:
        # Load the converted model once and keep it in the module-level slot.
        # Previously the translator was bound to a local name, so `model`
        # stayed None and the model was reloaded on every request.
        model = ctranslate2.Translator('t5-small-ct2')

    # The target language is passed to the model as a Malay instruction prefix.
    prefix = f'terjemah ke {map_lang[to_lang]}: {text}'
    input_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prefix))
    outputs = model.translate_batch(
        [input_tokens],
        max_input_length=6144,
        max_decoding_length=6144,
        disable_unk=True,
    )

    # Ids 0, 1 and 2 are removed before decoding — presumably the tokenizer's
    # special tokens; verify against the checkpoint's vocabulary.
    skipped_ids = {0, 1, 2}
    token_ids = [
        [
            token_id
            for token_id in tokenizer.convert_tokens_to_ids(out.hypotheses[0])
            if token_id not in skipped_ids
        ]
        for out in outputs
    ]
    results = tokenizer.batch_decode(
        token_ids,
        spaces_between_special_tokens=False,
    )
    # Exactly one input was submitted, so only the first result is relevant.
    return results[0]

# Web UI: a text box plus a target-language dropdown wired to translate().
demo = gr.Interface(
    fn=translate,
    inputs=[
        gr.components.Textbox(label='Input Text'),
        gr.components.Dropdown(label='Output language', choices=TO_LANG_KEYS, value='Malay'),
    ],
    outputs=[
        gr.components.Textbox(label='Output Text')
    ],
    cache_examples=False,
    title='bentobytes AI Translator',
    # Word order fixed: the original read "This translation is model able to".
    description='This translation model is able to translate malay, english, manglish to a target language. It is also able to maintain the text structure as it is and only translate necessary texts, eg, programming code.'
)

# Bind to all network interfaces rather than only localhost.
demo.launch(server_name='0.0.0.0')