PaulNdrei committed on
Commit
719cc71
•
0 Parent(s):

Initial commit

Browse files
Files changed (4) hide show
  1. .gitignore +5 -0
  2. README.md +11 -0
  3. app.py +220 -0
  4. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ venv
2
+ models
3
+ **/__pycache__/
4
+ hf_cache
5
+ .env
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Translator
3
+ emoji: 🌎
4
+ colorFrom: red
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 4.8.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
app.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import os
3
+ from dotenv import load_dotenv
4
+ import gradio as gr
5
+ from AinaTheme import AinaGradioTheme
6
+ import sentencepiece as spm
7
+ import ctranslate2
8
+ from huggingface_hub import snapshot_download
9
+ import nltk
10
+ from nltk import sent_tokenize
11
+
12
# Fetch the NLTK "punkt" resource before any sentence splitting happens.
nltk.download('punkt')

# Load variables from a .env file (if any) so os.environ lookups below see them.
load_dotenv()

# Directory where model snapshots are materialised.
MODELS_PATH = "./models"
# Directory handed to huggingface_hub as its download cache.
HF_CACHE_DIR = "./hf_cache"
# Maximum accepted input length in characters; overridable via the environment.
MAX_INPUT_CHARACTERS = int(os.getenv("MAX_INPUT_CHARACTERS", 500))
19
+
20
def download_model(repo_id, revision="main"):
    """Download a snapshot of a Hugging Face Hub repository.

    Args:
        repo_id (str): Identifier of the repository on the Hub.
        revision (str): Revision to download. Defaults to "main".
    Returns:
        The value returned by ``snapshot_download`` for a download placed
        under ``MODELS_PATH/<repo_id>`` and cached in ``HF_CACHE_DIR``.
    """
    destination = os.path.join(MODELS_PATH, repo_id)
    return snapshot_download(
        repo_id=repo_id,
        revision=revision,
        local_dir=destination,
        cache_dir=HF_CACHE_DIR,
    )
22
+
23
# Language that every supported pair translates from or into.
PIVOT_LANGUAGE = "Catalan"

# One row per non-pivot language:
#   (language, repo translating Catalan -> language, repo translating language -> Catalan)
# Row order is the insertion order of ``directions`` and therefore the order
# in which languages are listed to the user.
LANGUAGE_REPOS = (
    ("Spanish", "projecte-aina/mt-aina-ca-es", "PlanTL-GOB-ES/mt-plantl-es-ca"),
    ("English", "projecte-aina/mt-aina-ca-en", "projecte-aina/mt-aina-en-ca"),
    ("French", "projecte-aina/mt-aina-ca-fr", "projecte-aina/mt-aina-fr-ca"),
    ("German", "projecte-aina/mt-aina-ca-de", "projecte-aina/mt-aina-de-ca"),
    ("Italian", "projecte-aina/mt-aina-ca-it", "projecte-aina/mt-aina-it-ca"),
    ("Portuguese", "projecte-aina/mt-aina-ca-pt", "projecte-aina/mt-aina-pt-ca"),
)


def _model_entry(repo_id):
    """Download ``repo_id`` and build its ``directions`` entry.

    The entry maps "model" to ``(spm_model_path, model_dir)``. Both paths are
    derived directly from the directory returned by ``download_model``; that
    directory is not joined onto ``MODELS_PATH`` a second time, so the result
    is correct whether the returned path is absolute or relative.
    """
    model_dir = download_model(repo_id, revision="main")
    return {"model": (os.path.join(model_dir, "spm.model"), model_dir)}


# directions[source]["target"][target]["model"] -> (spm_model_path, model_dir)
directions = {PIVOT_LANGUAGE: {"target": {}}}
for _language, _from_pivot_repo, _to_pivot_repo in LANGUAGE_REPOS:
    directions[PIVOT_LANGUAGE]["target"][_language] = _model_entry(_from_pivot_repo)
    directions[_language] = {"target": {PIVOT_LANGUAGE: _model_entry(_to_pivot_repo)}}

# The first source language registered above is preselected in the UI.
DEFAULT_SOURCE_LANGUAGE = next(iter(directions))
86
+
87
def get_target_languages(source_language):
    """List the languages ``source_language`` can be translated into.

    Returns an empty list when the source language is not present in
    ``directions`` or has no "target" mapping.
    """
    language_entry = directions.get(source_language, {})
    targets = language_entry.get("target", {})
    return list(targets)
89
+
90
+
91
def get_target_languege_model(source_language, target_language):
    """Look up the model registered for a translation direction.

    Returns:
        The value stored under "model" in ``directions`` for the given
        source/target pair, or None when any level of the lookup is missing.
    """
    targets = directions.get(source_language, {}).get("target", {})
    pair_entry = targets.get(target_language, {})
    return pair_entry.get("model")
93
+
94
+
95
def translate(source, lang_pair):
    """Translate a text sentence by sentence with a CTranslate2 model.

    Args:
        source (str): Text to translate. It is split into sentences with
            ``sent_tokenize`` and the sentences are translated as one batch.
        lang_pair (tuple): ``(spm_model_path, ctranslate2_model_dir)``, i.e.
            the value stored under "model" in ``directions``.
    Returns:
        str: The translated sentences joined with single spaces, or an empty
        string when ``source`` contains no sentences.
    """
    source_sentences = sent_tokenize(source)
    if not source_sentences:
        # Nothing to translate: avoid loading both models for an empty batch.
        return ""

    spm_model_path = lang_pair[0]
    translator_dir = lang_pair[1]
    sp_model = spm.SentencePieceProcessor(spm_model_path)
    translator = ctranslate2.Translator(translator_dir)

    source_tokenized = sp_model.encode(source_sentences, out_type=str)
    results = translator.translate_batch(source_tokenized)
    # Keep only the first hypothesis returned for each sentence.
    best_tokens = [result[0]["tokens"] for result in results]
    translations_detokenized = sp_model.decode(best_tokens)

    translation = " ".join(translations_detokenized)
    # NOTE(review): ' ⁇' looks like SentencePiece's rendering of an unknown
    # piece; it is mapped to ':' here, presumably because the vocabularies
    # cannot produce a colon. Confirm against the models.
    translation = translation.replace(' ⁇', ':')

    # The models are instantiated on every call; collect right away rather
    # than waiting for the next automatic garbage-collection pass.
    gc.collect()

    return translation
119
+
120
+
121
def translate_input(input, source_language, target_language):
    """Validate a translation request and run it.

    This function backs the "Submit" button and is also exposed as the
    ``translate`` API endpoint, so it cannot rely on the checks the UI
    performs in the browser.

    Args:
        input (str): Text to translate. The parameter name shadows the
            builtin but is kept because it is part of the signature.
        source_language (str): Language name used as key in ``directions``.
        target_language (str): Target language name for ``source_language``.
    Returns:
        The translated text, or None when the request is rejected (a Gradio
        warning is emitted in that case).
    """
    text = (input or "").strip()
    if text == "":
        gr.Warning('Not possible to translate an empty input.')
        return None

    # Same condition the UI uses to disable the submit button.
    if len(text) > MAX_INPUT_CHARACTERS:
        gr.Warning(f'Max length {MAX_INPUT_CHARACTERS} characters.')
        return None

    target_language_model = get_target_languege_model(source_language, target_language)
    if target_language_model is None:
        # Unknown pair: without this guard translate() would fail on None.
        gr.Warning(f'Translation from {source_language} to {target_language} is not supported.')
        return None

    return translate(input, target_language_model)
131
+
132
+
133
def clear():
    """Return a pair of None values, one per component reset by "Clear"."""
    emptied = None
    return (emptied, emptied)
135
+
136
def change_interactive(text):
    """Compute the interactive state of the two buttons for the given text.

    Returns two ``gr.update`` values: the first is always interactive, the
    second is interactive only while the stripped text does not exceed
    ``MAX_INPUT_CHARACTERS``.
    """
    within_limit = not (len(text.strip()) > MAX_INPUT_CHARACTERS)
    first_button = gr.update(interactive = True)
    second_button = gr.update(interactive = within_limit)
    return first_button, second_button
140
+
141
def update_target_languages_dropdown(source_language):
    """Build the update for the target-language selector.

    The selector receives the targets available for ``source_language``,
    with the first one preselected, and is made interactive.
    """
    available_targets = get_target_languages(source_language)
    preselected = available_targets[0]
    return gr.update(choices=available_targets, value=preselected, interactive=True)
145
+
146
+
147
# User interface: source column (language, text, character counter) next to
# the target column (language, translation, action buttons).
with gr.Blocks(**AinaGradioTheme().get_kwargs()) as app:
    with gr.Row(variant="panel"):
        with gr.Column(scale=2):
            # Hidden component whose only purpose is to hand the character
            # limit to the client-side counter callback below.
            placeholder_max_token = gr.Textbox(
                visible=False,
                interactive=False,
                value= MAX_INPUT_CHARACTERS
            )
            source_language = gr.Dropdown(label="Source Language", choices=list(directions.keys()), value=DEFAULT_SOURCE_LANGUAGE)
            # Named ``input_box`` so the module does not shadow the builtin ``input``.
            input_box = gr.Textbox(placeholder="Enter a text here to translate.", max_lines=100, lines=12, show_label=False, interactive=True)
            with gr.Row(variant="panel", equal_height=True):
                gr.HTML("""<span id="countertext" style="display: flex; justify-content: start; color:#ef4444; font-weight: bold;"></span>""")
                gr.HTML(f"""<span id="counter" style="display: flex; justify-content: end;"> <span id="inputlenght">0</span>&nbsp;/&nbsp;{MAX_INPUT_CHARACTERS}</span>""")

        with gr.Column(scale=2):
            target_outputs = get_target_languages(DEFAULT_SOURCE_LANGUAGE)

            target_language = gr.Radio(choices=target_outputs, label="Target Language", value=target_outputs[0])
            output = gr.Textbox(max_lines=100, lines=12, show_label=False, interactive=False, show_copy_button=True)

            # NOTE(review): the placement of this button row inside the
            # target column is reconstructed; confirm against the intended layout.
            with gr.Row(variant="panel"):
                clear_btn = gr.Button(
                    "Clear",
                )
                submit_btn = gr.Button(
                    "Submit",
                    variant="primary",
                )

    # Refresh the target choices whenever the source language changes.
    source_language.change(fn=update_target_languages_dropdown, inputs=[source_language], outputs=target_language)

    # Server-side: enable or disable the buttons according to the text length.
    input_box.change(
        fn=change_interactive,
        inputs=[input_box],
        outputs=[clear_btn, submit_btn],
        api_name=False
    )

    # Client-side: show the "max length" message once the limit is exceeded.
    input_box.change(
        fn=None,
        inputs=[input_box],
        js=f"""(i) => document.getElementById('countertext').textContent = i.length > {MAX_INPUT_CHARACTERS} && 'Max length {MAX_INPUT_CHARACTERS} characters. ' || '' """,
        api_name=False
    )

    # Client-side: update the live character counter and colour it when over the limit.
    input_box.change(
        fn=None,
        inputs=[input_box, placeholder_max_token],
        js="""(i, m) => {
            document.getElementById('inputlenght').textContent = i.length + ' '
            document.getElementById('inputlenght').style.color = (i.length > m) ? "#ef4444" : "";
        }""",
        api_name=False
    )

    clear_btn.click(
        fn=clear,
        inputs=[],
        outputs=[input_box, output],
        queue=False,
        api_name=False
    )

    # The only event published in the API, under the name "translate".
    submit_btn.click(
        fn=translate_input,
        inputs=[input_box, source_language, target_language],
        outputs=[output],
        api_name="translate",
        concurrency_limit=1,
    )

app.launch(show_api=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ aina-gradio-theme==1.0
2
+ gradio==4.8.0
3
+ ctranslate2==3.23.0
4
+ nltk==3.8.1
5
+ sentencepiece==0.1.99
6
+ python-dotenv==1.0.0