zuhair96 committed on
Commit
fbd8f89
1 Parent(s): 4eae223

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +88 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Huggingface Models: https://huggingface.co/models
3
+ Transformer dependency: https://pypi.org/project/transformers/
4
+ """
5
+
6
+ import os
7
+ os.environ['CURL_CA_BUNDLE'] = ''
8
+ # os.environ['CUDA_VISIBLE_DEVICES'] = ''
9
+ os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
10
+
11
+ import malaya
12
+ import ctranslate2
13
+ from transformers import AutoTokenizer
14
+ import gradio as gr
15
+ import logging
16
+ import os
17
+
18
+ logging.basicConfig(level=logging.INFO)
19
+
20
# Dropdown label shown in the UI -> language code used as a key of map_lang.
TO_LANG = dict(Malay='ms', English='en')

# Dropdown choices, in the insertion order of TO_LANG.
TO_LANG_KEYS = [*TO_LANG]

# Module-level slot inspected by translate() before it loads a translator.
model = None

# Language code -> language name spliced into the 'terjemah ke ...' prompt.
map_lang = dict([
    ('en', 'Inggeris'),
    ('ms', 'Melayu'),
    ('pasar ms', 'pasar Melayu'),
    ('manglish', 'Manglish'),
])
34
+
35
# One-time conversion of the Hugging Face checkpoint into a CTranslate2 model
# directory ('t5-small-ct2') with int8 quantization.
converter = ctranslate2.converters.TransformersConverter(
    'mesolitica/translation-t5-small-standard-bahasa-cased-v2'
)
if os.path.exists('t5-small-ct2'):
    # A previous run already produced the directory; converting again without
    # force=True would only raise, so skip it explicitly instead of relying on
    # a swallowed exception.
    logging.info('CTranslate2 model directory %r already exists; skipping conversion', 't5-small-ct2')
else:
    try:
        converter.convert('t5-small-ct2', quantization='int8')
    except Exception:
        # Keep start-up best-effort (the UI still launches), but record the
        # failure instead of hiding it. KeyboardInterrupt / SystemExit are no
        # longer intercepted.
        logging.exception('Converting the checkpoint to CTranslate2 format failed')

# Slow (non-fast) tokenizer for the same checkpoint; used by translate() to
# turn text into tokens and decode the generated tokens back to text.
tokenizer = AutoTokenizer.from_pretrained(
    'mesolitica/translation-t5-small-standard-bahasa-cased-v2',
    use_fast=False,
)
47
+
48
+
49
def translate(text, to_lang):
    """Translate *text* into the language selected in the UI.

    Args:
        text: Source text typed by the user.
        to_lang: A key of ``TO_LANG`` (the dropdown label, e.g. ``'Malay'``).

    Returns:
        The decoded best hypothesis for the single input sentence.

    Raises:
        KeyError: If ``to_lang`` is not a key of ``TO_LANG``.
    """
    global model
    if model is None:
        # Load the converted CTranslate2 model once and keep it in the
        # module-level slot; previously the slot was never filled, so the
        # model was reloaded from disk on every request.
        model = ctranslate2.Translator('t5-small-ct2')

    target_name = map_lang[TO_LANG[to_lang]]
    prompt = f'terjemah ke {target_name}: {text}'
    input_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))
    outputs = model.translate_batch(
        [input_tokens],
        max_input_length=6144,
        max_decoding_length=6144,
        disable_unk=True,
    )

    # Ids 0, 1 and 2 are dropped before decoding.
    # NOTE(review): presumably pad / eos / unk for this T5 vocabulary - confirm.
    skipped_ids = {0, 1, 2}
    kept_ids = [
        [
            token_id
            for token_id in tokenizer.convert_tokens_to_ids(out.hypotheses[0])
            if token_id not in skipped_ids
        ]
        for out in outputs
    ]
    decoded = tokenizer.batch_decode(
        kept_ids,
        spaces_between_special_tokens=False,
    )
    # Only one sentence was submitted, so only one result comes back.
    return decoded[0]
73
+
74
# Widgets passed positionally to translate(text, to_lang).
_input_widgets = [
    gr.components.Textbox(label='Input Text'),
    gr.components.Dropdown(
        label='Output language',
        choices=TO_LANG_KEYS,
        value='Malay',
    ),
]

# Single text box that shows the string returned by translate().
_output_widgets = [gr.components.Textbox(label='Output Text')]

# Gradio front end wrapping translate().
demo = gr.Interface(
    title='Malaysian NMT',
    description='This translation is model able to translate malay, english, manglish to a target language. It is also able to maintain the text structure as it is and only translate necessary texts, eg, programming code.',
    fn=translate,
    inputs=_input_widgets,
    outputs=_output_widgets,
    cache_examples=False,
)

# Bind on all interfaces and request a public share link.
demo.launch(share=True, server_name='0.0.0.0')
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ malaya @ git+https://github.com/huseinzol05/malaya@5.1
2
+ torch
3
+ malaya-boilerplate
4
+ ctranslate2
5
+ gradio