Zengyf-CVer committed on
Commit
c6d26b8
1 Parent(s): 3b93977

app update

Browse files
Files changed (8) hide show
  1. .gitignore +64 -0
  2. app.py +180 -0
  3. data/test.png +0 -0
  4. data/test02.png +0 -0
  5. data/test03.png +0 -0
  6. packages.txt +1 -0
  7. requirements.txt +8 -0
  8. style.css +7 -0
.gitignore ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OCR Translate
2
+ # 创建人:曾逸夫
3
+ # 项目地址:https://gitee.com/CV_Lab/ocr-translate
4
+
5
+ # 图片格式
6
+ *.jpg
7
+ *.jpeg
8
+ *.png
9
+ *.svg
10
+ *.gif
11
+
12
+ # 视频格式
13
+ *.mp4
14
+ *.avi
15
+ .ipynb_checkpoints
16
+ /__pycache__
17
+ */__pycache__
18
+
19
+ # 日志格式
20
+ *.log
21
+ *.data
22
+ *.txt
23
+
24
+ # 生成文件
25
+ *.pdf
26
+ *.xlsx
27
+ *.csv
28
+
29
+ # 参数文件
30
+ *.yaml
31
+ *.json
32
+
33
+ # 压缩文件格式
34
+ *.zip
35
+ *.tar
36
+ *.tar.gz
37
+ *.rar
38
+
39
+ # 字体格式
40
+ *.ttc
41
+ *.ttf
42
+ *.otf
43
+ *.pkl
44
+
45
+ # 模型文件
46
+ *.pt
47
+ *.db
48
+
49
+ /flagged
50
+ /run
51
+ /opus-mt-en-zh
52
+
53
+ !requirements.txt
54
+ !cls_name/*
55
+ !model_config/*
56
+ !img_examples/*
57
+ !data/*
58
+
59
+ !requirements.txt
60
+ !packages.txt
61
+ !.pre-commit-config.yaml
62
+
63
+ test.py
64
+ test*.py
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OCR Translate v0.2
2
+ # 创建人:曾逸夫
3
+ # 创建时间:2022-07-19
4
+
5
+ import os
6
+
7
+ os.system("sudo apt-get install xclip")
8
+
9
+ import gradio as gr
10
+ import nltk
11
+ import pyclip
12
+ import pytesseract
13
+ from nltk.tokenize import sent_tokenize
14
+ from transformers import MarianMTModel, MarianTokenizer
15
+
16
+ nltk.download('punkt')
17
+
18
+ OCR_TR_DESCRIPTION = '''# OCR Translate v0.2
19
+ <div id="content_align">OCR translation system based on Tesseract</div>'''
20
+
21
# Image directory
img_dir = "./data"

# Get the tesseract language list.
# The first line of the command output is skipped (treated as a header line);
# every remaining non-empty line is kept as one language entry. The pipe is
# closed as soon as the output has been read.
with os.popen('tesseract --list-langs') as _lang_pipe:
    choices = [lang for lang in _lang_pipe.read().splitlines()[1:] if lang]
26
+
27
+
28
# Already-loaded (tokenizer, model) pairs keyed by (src, trg), so that each
# translation direction is loaded from its checkpoint at most once per process.
_MODEL_CACHE = {}


# Translation model selection
def model_choice(src="en", trg="zh"):
    """Return the MarianMT tokenizer and model for the ``src`` -> ``trg`` direction.

    The checkpoint name is built as ``Helsinki-NLP/opus-mt-{src}-{trg}``, e.g.
    https://huggingface.co/Helsinki-NLP/opus-mt-zh-en
    https://huggingface.co/Helsinki-NLP/opus-mt-en-zh

    Args:
        src: Source language code used in the checkpoint name.
        trg: Target language code used in the checkpoint name.

    Returns:
        A ``(tokenizer, model)`` tuple. The same tuple is returned on repeated
        calls with the same ``(src, trg)`` instead of reloading the checkpoint.
    """
    key = (src, trg)
    if key not in _MODEL_CACHE:
        model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"  # model name

        tokenizer = MarianTokenizer.from_pretrained(model_name)  # tokenizer
        model = MarianMTModel.from_pretrained(model_name)  # model

        _MODEL_CACHE[key] = (tokenizer, model)

    return _MODEL_CACHE[key]
38
+
39
+
40
# Convert a tesseract language list into a pytesseract language string
def ocr_lang(lang_list):
    """Join language codes into a single ``+``-separated string.

    For example ``["chi_sim", "eng"]`` becomes ``"chi_sim+eng"`` and
    ``["eng"]`` becomes ``"eng"``. An empty list yields ``""``.

    Args:
        lang_list: List of language code strings. It is not modified.

    Returns:
        The codes joined with ``+``.
    """
    # str.join builds the result without inserting separators into the
    # caller's list, so the argument is left untouched.
    return "+".join(lang_list)
52
+
53
+
54
# OCR via tesseract
def ocr_tesseract(img, languages):
    """Extract text from ``img`` with pytesseract.

    Args:
        img: Image passed straight to ``pytesseract.image_to_string``.
        languages: List of language codes; converted with ``ocr_lang`` into
            the ``lang`` argument.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(img, lang=ocr_lang(languages))
58
+
59
+
60
# Clear
def clear_content():
    """Return ``None``; used as the callback of the 'Clear' buttons."""
    return None
63
+
64
+
65
# Copy to clipboard
def cp_text(input_text):
    """Copy ``input_text`` to the clipboard with ``pyclip.copy``.

    Any exception is caught; an install hint for xclip and the exception
    itself are printed, and the function returns normally.

    Args:
        input_text: Text handed to ``pyclip.copy``.
    """
    install_hint = "sudo apt-get install xclip"
    try:
        pyclip.copy(input_text)
    except Exception as err:
        # Best effort: report the hint first, then the error, and carry on.
        for message in (install_hint, err):
            print(message)
73
+
74
+
75
# Clear clipboard
def cp_clear():
    """Clear the clipboard with ``pyclip.clear``.

    Failures are handled the same way as in ``cp_text``: the exception is
    caught, an install hint for xclip and the exception are printed, and the
    function returns normally instead of propagating the error to the caller.
    """
    try:
        pyclip.clear()
    except Exception as e:
        print("sudo apt-get install xclip")
        print(e)
78
+
79
+
80
# Translation
def translate(input_text, inputs_transStyle):
    """Translate ``input_text`` paragraph by paragraph with a MarianMT model.

    Reference: https://huggingface.co/docs/transformers/model_doc/marian

    The text is split into paragraphs on blank lines (``"\\n\\n"``). Each
    paragraph is split into sentences with ``sent_tokenize``, the sentences
    are translated as one batch, and the decoded sentences are concatenated.
    Translated paragraphs are joined back with ``"\\n\\n"``.

    Args:
        input_text: Text to translate; may be ``None`` or empty.
        inputs_transStyle: Translation direction as ``"<src>-<trg>"``
            (e.g. ``"zh-en"``); the two parts are passed to ``model_choice``.

    Returns:
        The translated text, or a "System prompt" message when there is
        nothing to translate.
    """
    no_content_msg = "System prompt: There is no content to translate!"
    if not input_text:
        return no_content_msg

    # Drop empty and whitespace-only paragraphs: they carry nothing to
    # translate and should not be sent to the model.
    paragraphs = [part for part in input_text.split("\n\n") if part.strip()]
    if not paragraphs:
        # Checked before the model is loaded so that no load happens for
        # input that has no translatable content.
        return no_content_msg

    # Select the translation model
    style_parts = inputs_transStyle.split("-")
    trans_src, trans_trg = style_parts[0], style_parts[1]
    tokenizer, model = model_choice(trans_src, trans_trg)

    translated_paragraphs = []
    for paragraph in paragraphs:
        batch = tokenizer(sent_tokenize(paragraph), return_tensors="pt", truncation=True, padding=True)
        generated = model.generate(**batch)
        sentences = [tokenizer.decode(t, skip_special_tokens=True) for t in generated]
        translated_paragraphs.append("".join(sentences))

    return "\n\n".join(translated_paragraphs)
106
+
107
+
108
def main():
    """Build the Gradio Blocks UI, wire the button callbacks, and launch the app."""
    # NOTE(review): the indentation of this function was lost in the source
    # this was recovered from; the nesting of the `with` blocks below is a
    # reconstruction — confirm against the original layout.

    with gr.Blocks(css='style.css') as ocr_tr:
        gr.Markdown(OCR_TR_DESCRIPTION)

        # -------------- OCR text extraction --------------
        with gr.Box():

            with gr.Row():
                gr.Markdown("### Step 01: Text extraction")

            with gr.Row():
                # Left column: image input, OCR language selection, actions
                with gr.Column():
                    with gr.Row():
                        inputs_img = gr.Image(image_mode="RGB", source="upload", type="pil", label="image")
                    with gr.Row():
                        inputs_lang = gr.CheckboxGroup(choices=["chi_sim", "eng"],
                                                       type="value",
                                                       value=['eng'],
                                                       label='language')

                    with gr.Row():
                        clear_img_btn = gr.Button('Clear')
                        ocr_btn = gr.Button(value='OCR extraction', variant="primary")

                # Right column: extracted text, translation direction, actions
                with gr.Column():
                    with gr.Row():
                        outputs_text = gr.Textbox(label="Extract content", lines=20)
                    with gr.Row():
                        inputs_transStyle = gr.Radio(choices=["zh-en", "en-zh"],
                                                     type="value",
                                                     value="zh-en",
                                                     label='translation mode')
                    with gr.Row():
                        clear_text_btn = gr.Button('Clear')
                        translate_btn = gr.Button(value='translate', variant="primary")

            with gr.Row():
                # Each example is [image path, OCR language list]
                example_list = [["./data/test.png", ["eng"]], ["./data/test02.png", ["eng"]],
                                ["./data/test03.png", ["chi_sim"]]]
                gr.Examples(example_list, [inputs_img, inputs_lang], outputs_text, ocr_tesseract, cache_examples=False)

        # -------------- Translation --------------
        with gr.Box():

            with gr.Row():
                gr.Markdown("### Step 02: Translation")

            with gr.Row():
                outputs_tr_text = gr.Textbox(label="translate content", lines=20)

            with gr.Row():
                cp_clear_btn = gr.Button(value='clear clipboard')
                cp_btn = gr.Button(value='copy to clipboard', variant="primary")

        # ---------------------- OCR Tesseract ----------------------
        ocr_btn.click(fn=ocr_tesseract, inputs=[inputs_img, inputs_lang], outputs=[
            outputs_text,])
        clear_img_btn.click(fn=clear_content, inputs=[], outputs=[inputs_img])

        # ---------------------- Translation ----------------------
        # The extracted text box is the input of the translation step.
        translate_btn.click(fn=translate, inputs=[outputs_text, inputs_transStyle], outputs=[outputs_tr_text])
        clear_text_btn.click(fn=clear_content, inputs=[], outputs=[outputs_text])

        # ---------------------- Copy to clipboard ----------------------
        cp_btn.click(fn=cp_text, inputs=[outputs_tr_text], outputs=[])
        cp_clear_btn.click(fn=cp_clear, inputs=[], outputs=[])

    ocr_tr.launch(inbrowser=True)


if __name__ == '__main__':
    main()
data/test.png ADDED
data/test02.png ADDED
data/test03.png ADDED
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tesseract-ocr-all
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pytesseract>=0.3.9
2
+ pyclip>=0.6.0
3
+ gradio>=3.0.18
4
+ nltk>=3.7
5
+ sentencepiece>=0.1.96
6
+ transformers>=4.20.0
7
+ sacremoses>=0.0.53
8
+ torch>=1.11.0
style.css ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ h1 {
2
+ text-align: center;
3
+ }
4
+
5
+ #content_align {
6
+ text-align: center;
7
+ }