Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,14 +1,5 @@
|
|
1 |
-
# OCR Translate v0.2
|
2 |
-
# 创建人:曾逸夫
|
3 |
-
# 创建时间:2022-07-19
|
4 |
-
|
5 |
-
import os
|
6 |
-
|
7 |
-
#os.system("apt-get install xclip")
|
8 |
-
|
9 |
import gradio as gr
|
10 |
import nltk
|
11 |
-
import pyclip
|
12 |
import pytesseract
|
13 |
from nltk.tokenize import sent_tokenize
|
14 |
from transformers import MarianMTModel, MarianTokenizer
|
@@ -24,17 +15,32 @@ img_dir = "./data"
|
|
24 |
# 获取tesseract语言列表
|
25 |
# Run `tesseract --list-langs`, split its output on newlines and drop the
# first and last entries (presumably the header line and the empty string
# after the final newline -- confirm against the tesseract output format).
choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
-
#
|
29 |
-
def
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
-
tokenizer = MarianTokenizer.from_pretrained(model_name) # 分词器
|
35 |
-
model = MarianMTModel.from_pretrained(model_name) # 模型
|
36 |
|
37 |
-
return tokenizer, model
|
38 |
|
39 |
|
40 |
# tesseract语言列表转pytesseract语言
|
@@ -77,32 +83,7 @@ def cp_clear():
|
|
77 |
pyclip.clear()
|
78 |
|
79 |
|
80 |
-
# Translation
def translate(input_text, inputs_transStyle):
    """Translate ``input_text`` paragraph by paragraph with a Marian model.

    The text is split into paragraphs on blank lines (``"\\n\\n"``); each
    paragraph is split into sentences, translated as one batch, and the
    translated sentences are concatenated without a separator. Translated
    paragraphs are joined back together with ``"\\n\\n"``.

    Args:
        input_text: Text to translate. ``None`` or ``""`` short-circuits.
        inputs_transStyle: Language pair of the form ``"<src>-<trg>"``; the
            first two hyphen-separated parts are passed to ``model_choice``.

    Returns:
        The translated text, or a fixed system-prompt message when there is
        nothing to translate.

    Raises:
        IndexError: If ``inputs_transStyle`` contains no hyphen.
    """
    # Reference: https://huggingface.co/docs/transformers/model_doc/marian
    if input_text is None or input_text == "":
        return "System prompt: There is no content to translate!"

    # Select the translation model (split the pair once, not once per part).
    style_parts = inputs_transStyle.split("-")
    trans_src, trans_trg = style_parts[0], style_parts[1]
    tokenizer, model = model_choice(trans_src, trans_trg)

    # Skip empty AND whitespace-only paragraphs: the latter yield no sentences,
    # so the tokenizer would be handed an empty batch.
    paragraphs = [p for p in input_text.split("\n\n") if p.strip()]

    translated_paragraphs = []
    for paragraph in paragraphs:
        sentences = sent_tokenize(paragraph)
        if not sentences:
            continue
        translated_sub = model.generate(
            **tokenizer(sentences, return_tensors="pt", truncation=True, padding=True))
        tgt_text_sub = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_sub]
        translated_paragraphs.append("".join(tgt_text_sub))

    return "\n\n".join(translated_paragraphs)
|
106 |
|
107 |
|
108 |
def main():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import nltk
|
|
|
3 |
import pytesseract
|
4 |
from nltk.tokenize import sent_tokenize
|
5 |
from transformers import MarianMTModel, MarianTokenizer
|
|
|
15 |
# 获取tesseract语言列表
|
16 |
# Run `tesseract --list-langs`, split its output on newlines and drop the
# first and last entries (presumably the header line and the empty string
# after the final newline -- confirm against the tesseract output format).
# NOTE(review): the import hunk of this change removes `import os`; confirm
# `os` is still imported somewhere in the file, otherwise this module-level
# statement raises NameError.
choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
|
17 |
|
18 |
+
# Already-loaded (tokenizer, model) pairs, keyed by (src, trg), so repeated
# translation requests do not reload the model weights every time.
_MODEL_CACHE = {}


# Translation model selection
def model_choice(src, trg):
    """Return the Marian tokenizer and model for translating ``src`` -> ``trg``.

    The checkpoint name is ``Helsinki-NLP/opus-mt-tc-big-{src}-{trg}``. Each
    language pair is loaded once and then served from an in-process cache;
    the cache grows by one entry per distinct pair requested.

    Args:
        src: Source language code used in the checkpoint name.
        trg: Target language code used in the checkpoint name.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    cache_key = (src, trg)
    if cache_key not in _MODEL_CACHE:
        model_name = f"Helsinki-NLP/opus-mt-tc-big-{src}-{trg}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        _MODEL_CACHE[cache_key] = (tokenizer, model)
    return _MODEL_CACHE[cache_key]
|
24 |
|
25 |
+
# Translation
def translate(input_text, trans_style):
    """Translate ``input_text`` with the Marian model selected by ``trans_style``.

    The text is split into sentences with ``sent_tokenize``; sentences are
    translated in small padded batches rather than one ``generate`` call per
    sentence, and the results are joined with single spaces.

    Args:
        input_text: Text to translate. Any falsy value short-circuits.
        trans_style: Language pair of the form ``"<src>-<trg>"``.

    Returns:
        The translated text with surrounding whitespace stripped, or a fixed
        system-prompt message when there is nothing to translate.

    Raises:
        ValueError: If ``trans_style`` is not exactly two hyphen-separated
            parts.
    """
    if not input_text:
        return "System prompt: There is no content to translate!"

    style_parts = trans_style.split("-")
    if len(style_parts) != 2:
        # Same exception type as a failed tuple unpacking, with a clearer message.
        raise ValueError(
            f"trans_style must look like '<src>-<trg>', got {trans_style!r}"
        )
    src, trg = style_parts
    tokenizer, model = model_choice(src, trg)

    sentences = sent_tokenize(input_text)

    # Bounded batches keep padding overhead and memory in check on long inputs.
    batch_size = 16
    translated_sentences = []
    for start in range(0, len(sentences), batch_size):
        batch = tokenizer(
            sentences[start:start + batch_size],
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
        # Passing the whole encoding forwards the attention mask, so padded
        # positions are ignored during generation.
        generated = model.generate(**batch)
        translated_sentences.extend(
            tokenizer.batch_decode(generated, skip_special_tokens=True)
        )

    return " ".join(translated_sentences).strip()
|
42 |
|
|
|
|
|
43 |
|
|
|
44 |
|
45 |
|
46 |
# tesseract语言列表转pytesseract语言
|
|
|
83 |
pyclip.clear()
|
84 |
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
|
|
87 |
|
88 |
|
89 |
def main():
|