SoybeanMilk committed on
Commit
6c60394
1 Parent(s): 5d94527

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -63
app.py CHANGED
@@ -1,40 +1,26 @@
1
- # OCR Translate v0.2
2
- # 创建人:曾逸夫
3
- # 创建时间:2022-07-19
4
-
5
  import os
6
-
7
- os.system("sudo apt-get install xclip")
8
-
9
  import gradio as gr
10
  import nltk
11
- import pyclip
12
  import pytesseract
 
13
  from nltk.tokenize import sent_tokenize
14
- from transformers import MarianMTModel, MarianTokenizer
 
 
15
 
 
16
  nltk.download('punkt')
17
 
18
  OCR_TR_DESCRIPTION = '''# OCR Translate v0.2
19
  <div id="content_align">OCR translation system based on Tesseract</div>'''
20
 
21
- # 图片路径
22
- img_dir = "./data"
23
-
24
- # 获取tesseract语言列表
25
  choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
26
 
27
-
28
- # 翻译模型选择
29
- def model_choice(src="en", trg="zh"):
30
- # https://huggingface.co/Helsinki-NLP/opus-mt-zh-en
31
- # https://huggingface.co/Helsinki-NLP/opus-mt-en-zh
32
- model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}" # 模型名称
33
-
34
- tokenizer = MarianTokenizer.from_pretrained(model_name) # 分词器
35
- model = MarianMTModel.from_pretrained(model_name) # 模型
36
-
37
- return tokenizer, model
38
 
39
 
40
  # tesseract语言列表转pytesseract语言
@@ -62,48 +48,30 @@ def clear_content():
62
  return None
63
 
64
 
 
 
65
  # 复制到剪贴板
66
  def cp_text(input_text):
67
- # sudo apt-get install xclip
68
  try:
69
- pyclip.copy(input_text)
70
  except Exception as e:
71
- print("sudo apt-get install xclip")
72
  print(e)
73
 
74
-
75
  # 清除剪贴板
76
  def cp_clear():
77
- pyclip.clear()
78
 
79
-
80
- # 翻译
81
  def translate(input_text, inputs_transStyle):
82
- # 参考:https://huggingface.co/docs/transformers/model_doc/marian
83
  if input_text is None or input_text == "":
84
  return "System prompt: There is no content to translate!"
85
-
86
  # 选择翻译模型
87
- trans_src, trans_trg = inputs_transStyle.split("-")[0], inputs_transStyle.split("-")[1]
88
- tokenizer, model = model_choice(trans_src, trans_trg)
89
-
90
- translate_text = ""
91
- input_text_list = input_text.split("\n\n")
92
-
93
- translate_text_list_tmp = []
94
- for i in range(len(input_text_list)):
95
- if input_text_list[i] != "":
96
- translate_text_list_tmp.append(input_text_list[i])
97
-
98
- for i in range(len(translate_text_list_tmp)):
99
- translated_sub = model.generate(
100
- **tokenizer(sent_tokenize(translate_text_list_tmp[i]), return_tensors="pt", truncation=True, padding=True))
101
- tgt_text_sub = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_sub]
102
- translate_text_sub = "".join(tgt_text_sub)
103
- translate_text = translate_text + "\n\n" + translate_text_sub
104
-
105
- return translate_text[2:]
106
-
107
 
108
  def main():
109
 
@@ -133,19 +101,14 @@ def main():
133
  with gr.Column():
134
  with gr.Row():
135
  outputs_text = gr.Textbox(label="Extract content", lines=20)
136
- with gr.Row():
137
- inputs_transStyle = gr.Radio(choices=["zh-en", "en-zh"],
138
- type="value",
139
- value="zh-en",
140
- label='translation mode')
141
  with gr.Row():
142
  clear_text_btn = gr.Button('Clear')
143
  translate_btn = gr.Button(value='Translate', variant="primary")
144
 
145
  with gr.Row():
146
- example_list = [["./data/test.png", ["eng"]], ["./data/test02.png", ["eng"]],
147
- ["./data/test03.png", ["chi_sim"]]]
148
- gr.Examples(example_list, [inputs_img, inputs_lang], outputs_text, ocr_tesseract, cache_examples=False)
149
 
150
  # -------------- 翻译 --------------
151
  with gr.Box():
@@ -173,8 +136,7 @@ def main():
173
  cp_btn.click(fn=cp_text, inputs=[outputs_tr_text], outputs=[])
174
  cp_clear_btn.click(fn=cp_clear, inputs=[], outputs=[])
175
 
176
- ocr_tr.launch(inbrowser=True)
177
-
178
 
179
  if __name__ == '__main__':
180
  main()
 
 
 
 
 
1
  import os
2
+ import pyperclip
 
 
3
  import gradio as gr
4
  import nltk
 
5
  import pytesseract
6
+ import google.generativeai as genai
7
  from nltk.tokenize import sent_tokenize
8
+ from transformers import *
9
+ import torch
10
+ from tqdm import tqdm # Import tqdm
11
 
12
+ # Download necessary data for nltk
13
  nltk.download('punkt')
14
 
15
  OCR_TR_DESCRIPTION = '''# OCR Translate v0.2
16
  <div id="content_align">OCR translation system based on Tesseract</div>'''
17
 
18
+ # Getting the list of available languages for Tesseract
 
 
 
19
  choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
20
 
21
+ # Replace the api_key value below with your own API key from https://cloud.google.com/docs/authentication/api-keys
22
+ genai.configure(api_key="AIzaSyC_o3nrbe2fVUifhUg0Zh2oVsweSIdlEbg")
23
+ model = genai.GenerativeModel('gemini-pro')
 
 
 
 
 
 
 
 
24
 
25
 
26
  # tesseract语言列表转pytesseract语言
 
48
  return None
49
 
50
 
51
+ import pyperclip
52
+
53
  # 复制到剪贴板
54
  def cp_text(input_text):
 
55
  try:
56
+ pyperclip.copy(input_text)
57
  except Exception as e:
58
+ print("Error occurred while copying to clipboard")
59
  print(e)
60
 
 
61
  # 清除剪贴板
62
  def cp_clear():
63
+ pyperclip.clear()
64
 
65
+ # Add a translation function
 
66
  def translate(input_text, inputs_transStyle):
 
67
  if input_text is None or input_text == "":
68
  return "System prompt: There is no content to translate!"
69
+
70
  # 选择翻译模型
71
+ prompt = f"Please reformat the following article to have clear paragraph breaks and correct punctuation, and then translate it into {inputs_transStyle}. In the translation, do not display the original text, fictional content, or any repeated content. Ensure that the original meaning and context are preserved as much as possible."
72
+ # Reorder for ease of reading and translate into {inputs_transStyle}
73
+ response = model.generate_content([prompt, input_text])
74
+ return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  def main():
77
 
 
101
  with gr.Column():
102
  with gr.Row():
103
  outputs_text = gr.Textbox(label="Extract content", lines=20)
104
+ inputs_transStyle = gr.inputs.Dropdown(choices=["Chinese (Simplified)", "Chinese (Traditional)", "English", "Japanese", "Korean"],
105
+ default="Chinese (Simplified)", label='translation mode')
 
 
 
106
  with gr.Row():
107
  clear_text_btn = gr.Button('Clear')
108
  translate_btn = gr.Button(value='Translate', variant="primary")
109
 
110
  with gr.Row():
111
+ pass
 
 
112
 
113
  # -------------- 翻译 --------------
114
  with gr.Box():
 
136
  cp_btn.click(fn=cp_text, inputs=[outputs_tr_text], outputs=[])
137
  cp_clear_btn.click(fn=cp_clear, inputs=[], outputs=[])
138
 
139
+ ocr_tr.launch(inbrowser=True, share=True)
 
140
 
141
  if __name__ == '__main__':
142
  main()