feat: add model module
- server/model.py +37 -0
- server/requirements.txt +2 -1
server/model.py
ADDED
@@ -0,0 +1,37 @@
+import ctranslate2
+import sentencepiece as spm
+
+
+modelDir = "./model"
+sp_source_model = "./model/spm.ja.nopretok.model"
+sp_target_model = "./model/spm.en.nopretok.model"
+# inter_threads: how many independent translation jobs can run concurrently
+translator = ctranslate2.Translator(modelDir, device="cpu", intra_threads=4, inter_threads=1)
+
+
+def tokenizeBatch(text):
+    sp = spm.SentencePieceProcessor(sp_source_model)
+    if isinstance(text, list): return sp.encode(text, out_type=str)
+    elif isinstance(text, str):
+        return [sp.encode(text, out_type=str)]
+
+
+def detokenizeBatch(tokens):
+    sp = spm.SentencePieceProcessor(sp_target_model)
+    translation = sp.decode(tokens)
+    return translation
+
+
+def translate(text: str):
+    translated = translator.translate_batch(
+        source=tokenizeBatch(text),
+        num_hypotheses=1,
+        return_alternatives=False,
+        replace_unknowns=False,
+        no_repeat_ngram_size=3,  # block repeated 3-grams in the output
+        disable_unk=True,
+        beam_size=5,
+        sampling_temperature=0,
+    )
+
+    return [detokenizeBatch(result.hypotheses[0]) for result in translated]
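For reference, a minimal sketch of how this module might be exercised from Python, assuming the CTranslate2 model and both SentencePiece models are already present under ./model (the sample sentence and print loop are illustrative, not part of the commit):

    # Hypothetical usage of the new module.
    from model import translate

    # translate() accepts a single string (tokenizeBatch wraps it into a
    # one-element batch) and returns one detokenized hypothesis per input.
    for sentence in translate("これはテストです。"):
        print(sentence)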
server/requirements.txt
CHANGED
@@ -1,4 +1,5 @@
 flask
 waitress
+flask_cors
 CTranslate2
-
+sentencepiece
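The new flask_cors dependency suggests model.py is meant to be served over HTTP, but no server code is part of this commit. A minimal sketch of how the dependencies might be wired together, with all names below (the file, route, and JSON fields) being hypothetical:

    # Hypothetical server wiring; only the dependencies exist in this commit.
    from flask import Flask, jsonify, request
    from flask_cors import CORS
    from waitress import serve

    from model import translate

    app = Flask(__name__)
    CORS(app)  # flask_cors: allow cross-origin requests from a browser client

    @app.route("/translate", methods=["POST"])
    def translate_endpoint():
        text = request.get_json().get("text", "")
        return jsonify({"translation": translate(text)})

    if __name__ == "__main__":
        serve(app, host="0.0.0.0", port=5000)  # waitress as the WSGI server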