Spaces:
Running
Running
PaulNdrei
committed on
Commit
•
719cc71
0
Parent(s):
Initial commit
Browse files- .gitignore +5 -0
- README.md +11 -0
- app.py +220 -0
- requirements.txt +6 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
venv
|
2 |
+
models
|
3 |
+
**/__pycache__/
|
4 |
+
hf_cache
|
5 |
+
.env
|
README.md
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Translator
|
3 |
+
emoji: π
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: pink
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.8.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: apache-2.0
|
11 |
+
---
|
app.py
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gc
|
2 |
+
import os
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
import gradio as gr
|
5 |
+
from AinaTheme import AinaGradioTheme
|
6 |
+
import sentencepiece as spm
|
7 |
+
import ctranslate2
|
8 |
+
from huggingface_hub import snapshot_download
|
9 |
+
import nltk
|
10 |
+
from nltk import sent_tokenize
|
11 |
+
|
12 |
+
# sent_tokenize() needs NLTK's "punkt" data; fetch it at start-up.
nltk.download('punkt')

# Load variables from a local .env file (if any) into the environment.
load_dotenv()

# Where model snapshots and the Hugging Face download cache are stored.
MODELS_PATH = "./models"
HF_CACHE_DIR = "./hf_cache"

# Maximum accepted input length, overridable through the environment.
MAX_INPUT_CHARACTERS = int(os.getenv("MAX_INPUT_CHARACTERS", 500))
|
19 |
+
|
20 |
+
def download_model(repo_id, revision="main"):
    """Download a Hugging Face Hub repository snapshot under ``MODELS_PATH``.

    Args:
        repo_id (str): Hub repository identifier, e.g. "projecte-aina/mt-aina-ca-es".
        revision (str): Revision of the repository to fetch. Defaults to "main".

    Returns:
        The value returned by ``snapshot_download`` for this request.
    """
    target_dir = os.path.join(MODELS_PATH, repo_id)
    return snapshot_download(
        repo_id=repo_id,
        revision=revision,
        local_dir=target_dir,
        cache_dir=HF_CACHE_DIR,
    )
|
22 |
+
|
23 |
+
# Download every translation model at start-up, one pair of directions
# (Catalan -> X, X -> Catalan) per statement. Calls run in the listed order.
model_dir_ca_es, model_dir_es_ca = (
    download_model("projecte-aina/mt-aina-ca-es", revision="main"),
    download_model("PlanTL-GOB-ES/mt-plantl-es-ca", revision="main"),
)

model_dir_ca_en, model_dir_en_ca = (
    download_model("projecte-aina/mt-aina-ca-en", revision="main"),
    download_model("projecte-aina/mt-aina-en-ca", revision="main"),
)

model_dir_ca_fr, model_dir_fr_ca = (
    download_model("projecte-aina/mt-aina-ca-fr", revision="main"),
    download_model("projecte-aina/mt-aina-fr-ca", revision="main"),
)

model_dir_ca_de, model_dir_de_ca = (
    download_model("projecte-aina/mt-aina-ca-de", revision="main"),
    download_model("projecte-aina/mt-aina-de-ca", revision="main"),
)

model_dir_ca_it, model_dir_it_ca = (
    download_model("projecte-aina/mt-aina-ca-it", revision="main"),
    download_model("projecte-aina/mt-aina-it-ca", revision="main"),
)

model_dir_ca_pt, model_dir_pt_ca = (
    download_model("projecte-aina/mt-aina-ca-pt", revision="main"),
    download_model("projecte-aina/mt-aina-pt-ca", revision="main"),
)
|
40 |
+
|
41 |
+
def _model_entry(model_dir):
    """Build the ``directions`` entry for one downloaded translation model.

    Args:
        model_dir (str): Directory returned by ``download_model`` for the model.

    Returns:
        dict: ``{"model": (spm_path, model_dir)}`` where ``spm_path`` is the
        ``spm.model`` file inside ``model_dir``.
    """
    # download_model() already stores the snapshot under MODELS_PATH, so the
    # returned directory is used as-is rather than being joined onto
    # MODELS_PATH a second time (which is only harmless for absolute paths).
    return {"model": (os.path.join(model_dir, "spm.model"), model_dir)}


# Supported translation directions: source language -> "target" -> target
# language -> model files. Catalan is declared first because the first key
# becomes the default source language below.
directions = {
    "Catalan": {
        "target": {
            "Spanish": _model_entry(model_dir_ca_es),
            "English": _model_entry(model_dir_ca_en),
            "French": _model_entry(model_dir_ca_fr),
            "German": _model_entry(model_dir_ca_de),
            "Italian": _model_entry(model_dir_ca_it),
            "Portuguese": _model_entry(model_dir_ca_pt),
        }
    },
    # Every other source language only translates into Catalan.
    **{
        source: {"target": {"Catalan": _model_entry(model_dir)}}
        for source, model_dir in (
            ("Spanish", model_dir_es_ca),
            ("English", model_dir_en_ca),
            ("French", model_dir_fr_ca),
            ("German", model_dir_de_ca),
            ("Italian", model_dir_it_ca),
            ("Portuguese", model_dir_pt_ca),
        )
    },
}

DEFAULT_SOURCE_LANGUAGE = next(iter(directions))
|
86 |
+
|
87 |
+
def get_target_languages(source_language):
    """List the languages that ``source_language`` can be translated into.

    Args:
        source_language (str): Source language name, a key of ``directions``.

    Returns:
        list: Target language names in the order they are declared in
        ``directions``; an empty list when the source language is unknown.
    """
    source_entry = directions.get(source_language, {})
    targets = source_entry.get("target", {})
    return list(targets)
|
89 |
+
|
90 |
+
|
91 |
+
def get_target_languege_model(source_language, target_language):
    """Look up the model files registered for a translation direction.

    Args:
        source_language (str): Source language name, a key of ``directions``.
        target_language (str): Target language name.

    Returns:
        The value stored under ``"model"`` for the pair, or ``None`` when the
        pair is not present in ``directions``.
    """
    targets = directions.get(source_language, {}).get("target", {})
    pair_entry = targets.get(target_language, {})
    return pair_entry.get("model")
|
93 |
+
|
94 |
+
|
95 |
+
def translate(source, lang_pair):
    """Translate a text with the models of one language pair.

    The text is split into sentences, the sentences are tokenized with
    SentencePiece, translated as a single CTranslate2 batch, detokenized and
    joined back together with single spaces.

    Args:
        source (str): Source text to translate.
        lang_pair (tuple): ``(spm_model_path, ctranslate2_model_dir)``, as
            stored under the ``"model"`` key of ``directions``.

    Returns:
        str: Translation of the source text.
    """
    spm_model_path, ct2_model_dir = lang_pair

    # Both models are loaded for every call and released when the function
    # returns. NOTE(review): presumably this (together with gc.collect()
    # below) keeps memory bounded with many language pairs — confirm.
    sp_model = spm.SentencePieceProcessor(spm_model_path)
    translator = ctranslate2.Translator(ct2_model_dir)

    source_sentences = sent_tokenize(source)
    source_tokenized = sp_model.encode(source_sentences, out_type=str)
    results = translator.translate_batch(source_tokenized)
    # Keep only the best hypothesis of each sentence.
    best_tokens = [result[0]["tokens"] for result in results]
    translations_detokenized = sp_model.decode(best_tokens)
    translation = " ".join(translations_detokenized)
    # SentencePiece renders unknown pieces as " ⁇" (U+2047); map that marker
    # to ":". Written as an escape so the literal survives re-encoding.
    translation = translation.replace(" \u2047", ":")

    gc.collect()

    return translation
|
119 |
+
|
120 |
+
|
121 |
+
def translate_input(input, source_language, target_language):
    """Validate a translation request and run it.

    Args:
        input (str): Text to translate.
        source_language (str): Source language name, a key of ``directions``.
        target_language (str): Target language name.

    Returns:
        str | None: The translation, or ``None`` (after showing a warning)
        when the input is empty.

    Raises:
        gr.Error: If the text is longer than ``MAX_INPUT_CHARACTERS`` or the
            language pair is not supported.
    """
    if input is None or not input.strip():
        gr.Warning('Not possible to translate an empty input.')
        return None

    # The UI only disables the submit button for over-long text, but this
    # function is also registered as the public "translate" API endpoint, so
    # the limit has to be enforced here as well. Same measure as
    # change_interactive(): length of the stripped text.
    if len(input.strip()) > MAX_INPUT_CHARACTERS:
        raise gr.Error(f"Max length {MAX_INPUT_CHARACTERS} characters.")

    target_language_model = get_target_languege_model(source_language, target_language)
    if target_language_model is None:
        # Without this check translate() would fail with an opaque TypeError.
        raise gr.Error(
            f"Translation from {source_language} to {target_language} is not supported."
        )

    return translate(input, target_language_model)
|
131 |
+
|
132 |
+
|
133 |
+
def clear():
    """Return the reset values for the two components wired to the Clear button.

    Returns:
        tuple: ``(None, None)``.
    """
    first_value = second_value = None
    return first_value, second_value
|
135 |
+
|
136 |
+
def change_interactive(text):
    """Compute the interactive state of two components from the input text.

    Args:
        text (str): Current content of the input textbox.

    Returns:
        tuple: Two ``gr.update`` objects. The first always sets
        ``interactive=True``; the second sets ``interactive=True`` only while
        the stripped text is at most ``MAX_INPUT_CHARACTERS`` long.
    """
    within_limit = len(text.strip()) <= MAX_INPUT_CHARACTERS
    return gr.update(interactive=True), gr.update(interactive=within_limit)
|
140 |
+
|
141 |
+
def update_target_languages_dropdown(source_language):
    """Build the update for the target-language selector after a source change.

    Args:
        source_language (str): Newly selected source language.

    Returns:
        A ``gr.update`` whose choices are the target languages available for
        ``source_language`` and whose value is the first of them, or ``None``
        when the source language has no targets.
    """
    output_languages = get_target_languages(source_language)
    # An unknown source yields an empty list; avoid IndexError in that case.
    default_target = output_languages[0] if output_languages else None
    return gr.update(choices=output_languages, value=default_target, interactive=True)
|
145 |
+
|
146 |
+
|
147 |
+
# Gradio UI: layout first, then event wiring, then launch.
# NOTE(review): the nesting of rows/columns below follows the order of the
# context managers — confirm against the rendered layout.
with gr.Blocks(**AinaGradioTheme().get_kwargs()) as app:
    with gr.Row(variant="panel"):
        # Left column: source language, input text and character counter.
        with gr.Column(scale=2):
            # Hidden textbox that hands MAX_INPUT_CHARACTERS to the JS counter.
            placeholder_max_token = gr.Textbox(visible=False, interactive=False, value=MAX_INPUT_CHARACTERS)
            source_language = gr.Dropdown(
                label="Source Language",
                choices=list(directions.keys()),
                value=DEFAULT_SOURCE_LANGUAGE,
            )
            input = gr.Textbox(
                placeholder="Enter a text here to translate.",
                max_lines=100,
                lines=12,
                show_label=False,
                interactive=True,
            )
            with gr.Row(variant="panel", equal_height=True):
                gr.HTML("""<span id="countertext" style="display: flex; justify-content: start; color:#ef4444; font-weight: bold;"></span>""")
                gr.HTML(f"""<span id="counter" style="display: flex; justify-content: end;"> <span id="inputlenght">0</span> / {MAX_INPUT_CHARACTERS}</span>""")

        # Right column: target language and translation output.
        with gr.Column(scale=2):
            target_outputs = get_target_languages(DEFAULT_SOURCE_LANGUAGE)
            target_language = gr.Radio(
                choices=target_outputs,
                label="Target Language",
                value=target_outputs[0],
            )
            output = gr.Textbox(
                max_lines=100,
                lines=12,
                show_label=False,
                interactive=False,
                show_copy_button=True,
            )

    with gr.Row(variant="panel"):
        clear_btn = gr.Button("Clear")
        submit_btn = gr.Button("Submit", variant="primary")

    # Refresh the target choices whenever the source language changes.
    source_language.change(
        fn=update_target_languages_dropdown,
        inputs=[source_language],
        outputs=target_language,
    )

    # Enable/disable the buttons according to the input length.
    input.change(fn=change_interactive, inputs=[input], outputs=[clear_btn, submit_btn], api_name=False)

    # Client-side only: show the "max length" message when over the limit.
    input.change(
        fn=None,
        inputs=[input],
        js=f"""(i) => document.getElementById('countertext').textContent = i.length > {MAX_INPUT_CHARACTERS} && 'Max length {MAX_INPUT_CHARACTERS} characters. ' || '' """,
        api_name=False
    )

    # Client-side only: update the character counter and colour it when over the limit.
    input.change(
        fn=None,
        inputs=[input, placeholder_max_token],
        js="""(i, m) => {
document.getElementById('inputlenght').textContent = i.length + ' '
document.getElementById('inputlenght').style.color = (i.length > m) ? "#ef4444" : "";
}""",
        api_name=False
    )

    clear_btn.click(fn=clear, inputs=[], outputs=[input, output], queue=False, api_name=False)

    # Translation requests are served one at a time (concurrency_limit=1).
    submit_btn.click(
        fn=translate_input,
        inputs=[input, source_language, target_language],
        outputs=[output],
        api_name="translate",
        concurrency_limit=1,
    )

app.launch(show_api=True)
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aina-gradio-theme==1.0
|
2 |
+
gradio==4.8.0
|
3 |
+
ctranslate2==3.23.0
|
4 |
+
nltk==3.8.1
|
5 |
+
sentencepiece==0.1.99
|
6 |
+
python-dotenv==1.0.0
|