Zengyf-CVer committed on
Commit
5a0f532
1 Parent(s): 57627c5

app update

Browse files
Files changed (2) hide show
  1. .gitignore +0 -1
  2. app.py +137 -0
.gitignore CHANGED
@@ -58,6 +58,5 @@
58
  !requirements.txt
59
  !.pre-commit-config.yaml
60
 
61
- app.py
62
  test.py
63
  test*.py
 
58
  !requirements.txt
59
  !.pre-commit-config.yaml
60
 
 
61
  test.py
62
  test*.py
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OCR Translate v0.1
2
+ # 创建人:曾逸夫
3
+ # 创建时间:2022-06-14
4
+ # email: zyfiy1314@163.com
5
+
6
+ import os
7
+
8
+ import gradio as gr
9
+ import nltk
10
+ import pytesseract
11
+ from nltk.tokenize import sent_tokenize
12
+ from transformers import MarianMTModel, MarianTokenizer
13
+
14
# Download the NLTK 'punkt' resource; sent_tokenize (used by translate) needs it.
nltk.download('punkt')

# ----------- Translation -----------
# https://huggingface.co/Helsinki-NLP/opus-mt-en-zh
modchoice = "Helsinki-NLP/opus-mt-en-zh"  # model name

tokenizer = MarianTokenizer.from_pretrained(modchoice)  # tokenizer
model = MarianMTModel.from_pretrained(modchoice)  # translation model

# Markdown rendered at the top of the page.
OCR_TR_DESCRIPTION = '''# OCR Translate v0.1
<div id="content_align">基于Tesseract的OCR翻译系统</div>'''

# Directory containing the example images
img_dir = "./data"

# Languages reported by the local tesseract executable. The first output line
# is a header and is skipped. The pipe is closed deterministically, and blank
# lines are filtered so the result does not depend on a trailing newline.
with os.popen('tesseract --list-langs') as _langs_pipe:
    choices = [
        line for line in _langs_pipe.read().splitlines()[1:] if line.strip()
    ]
31
+
32
+
33
# Convert a list of tesseract language codes into a pytesseract language string
def ocr_lang(lang_list):
    """Join language codes into the ``lang`` string passed to pytesseract.

    Args:
        lang_list: Sequence of tesseract language codes,
            e.g. ``["eng", "chi_sim"]``.

    Returns:
        The codes joined with ``+`` (``"eng+chi_sim"``). A single code is
        returned unchanged and an empty sequence yields ``""``.
    """
    # str.join leaves the caller's list untouched; the earlier implementation
    # inserted "+" separators into lang_list itself before joining.
    return "+".join(lang_list)
45
+
46
+
47
# OCR with tesseract
def ocr_tesseract(img, languages):
    """Extract text from an image using pytesseract.

    Args:
        img: Image handed to ``pytesseract.image_to_string``; ``None`` when
            no image has been provided.
        languages: List of tesseract language codes to recognise.

    Returns:
        The recognised text, or ``""`` when there is no image to process.
    """
    # Nothing to recognise: return empty text instead of letting pytesseract
    # raise on a missing image.
    if img is None:
        return ""

    # With no language selected, pass None so pytesseract omits the language
    # option rather than sending an empty string.
    lang = ocr_lang(languages) if languages else None
    return pytesseract.image_to_string(img, lang=lang)
51
+
52
+
53
# Example selection
def set_example_image(example: list) -> dict:
    """Return the image-component update for a clicked example.

    Args:
        example: The selected sample row; its first element becomes the new
            value of the image component.

    Returns:
        The result of ``gr.Image.update`` carrying that value.
    """
    first_item = example[0]
    return gr.Image.update(value=first_item)
56
+
57
+
58
# Clear
def clear_content():
    """Return ``None``; used as the click handler that empties a component."""
    return
61
+
62
+
63
# Translation
def translate(input_text):
    """Translate text with the module-level Marian model and tokenizer.

    The text is split into sentences, the sentences are translated as one
    padded batch, and the decoded results are concatenated.

    Args:
        input_text: Text to translate; may be ``None`` or empty.

    Returns:
        The translated text, or a system prompt message when there is
        nothing to translate.
    """
    # Reference: https://huggingface.co/docs/transformers/model_doc/marian
    # Whitespace-only text is treated like empty text so that no empty
    # sentence batch is handed to the tokenizer.
    if input_text is None or not input_text.strip():
        return "系统提示:没有可翻译的内容!"

    sentences = sent_tokenize(input_text)
    # truncation=True keeps over-long "sentences" (e.g. OCR output without
    # punctuation) within the model's maximum input length.
    batch = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**batch)
    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

    return "".join(tgt_text)
74
+
75
+
76
def main():
    """Build the two-step OCR / translation Gradio page and launch it.

    Step 01 holds the image input, the language selection, the example
    gallery and the OCR button; step 02 holds the extracted text, the
    translate button and the translated text.
    """
    # NOTE(review): the nesting of the layout containers below was
    # reconstructed from a flattened diff -- confirm against the repository.
    with gr.Blocks(css='style.css') as demo:
        gr.Markdown(OCR_TR_DESCRIPTION)

        # -------------- Step 01: OCR text extraction --------------
        with gr.Box():

            with gr.Row():
                gr.Markdown("### Step 01: 文字提取")

            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        image_input = gr.Image(image_mode="RGB", source="upload", type="pil", label="图片")
                    with gr.Row():
                        lang_input = gr.CheckboxGroup(choices=choices, type="value", value=['eng'], label='语言')

                    with gr.Row():
                        clear_image_button = gr.Button('Clear')
                        ocr_button = gr.Button(value='OCR 提取', variant="primary")

                with gr.Column():
                    # One single-element sample row per file in img_dir,
                    # in sorted order.
                    example_files = sorted(os.listdir(img_dir))
                    example_rows = [[f"{img_dir}/{name}"] for name in example_files]
                    example_gallery = gr.Dataset(components=[image_input], samples=example_rows)

        # -------------- Step 02: translation --------------
        with gr.Box():

            with gr.Row():
                gr.Markdown("### Step 02: 翻译")

            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        extracted_box = gr.Textbox(label="提取内容", lines=20)
                    with gr.Row():
                        clear_text_button = gr.Button('Clear')
                        translate_button = gr.Button(value='翻译', variant="primary")
                with gr.Column():
                    translated_box = gr.Textbox(label="翻译内容", lines=20)

        # ---------------------- OCR events ----------------------
        ocr_button.click(
            fn=ocr_tesseract,
            inputs=[image_input, lang_input],
            outputs=[extracted_box],
        )

        clear_image_button.click(fn=clear_content, inputs=[], outputs=[image_input])

        example_gallery.click(
            fn=set_example_image,
            inputs=[example_gallery],
            outputs=[image_input],
        )

        # ---------------------- Translation events ----------------------
        translate_button.click(fn=translate, inputs=[extracted_box], outputs=[translated_box])
        clear_text_button.click(fn=clear_content, inputs=[], outputs=[extracted_box])

    demo.launch(inbrowser=True)


if __name__ == '__main__':
    main()