wkplhc committed on
Commit
28bf845
·
verified ·
1 Parent(s): db73429

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -45
app.py CHANGED
@@ -4,39 +4,61 @@ from bs4 import BeautifulSoup
4
  import re
5
  import os
6
  import tempfile
7
- import pytesseract
8
- from PIL import Image, ImageEnhance, ImageFilter
9
  import numpy as np
10
  from urllib.parse import urlparse
11
  import time
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # 确保中文显示正常
14
  import matplotlib.pyplot as plt
15
  plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
16
 
17
- # 设置Tesseract OCR路径(Hugging Face Spaces上已预安装)
18
- try:
19
- pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
20
- except:
21
- pass # 在Windows上可能需要手动设置
22
-
23
  def extract_gif_urls(html_content):
24
  """从HTML内容中提取符合条件的GIF图片URL"""
25
  soup = BeautifulSoup(html_content, 'html.parser')
26
  img_tags = soup.find_all('img')
27
 
28
  gif_urls = []
29
- # 匹配霹雳布袋戏相关的GIF格式,特别是0101.gif这类序列
30
  # 放宽正则匹配条件,确保能识别到相关GIF
31
- pattern = r'010\d+\.gif$'
32
 
33
  for img in img_tags:
34
  src = img.get('src', '')
35
- # 放宽匹配条件,只要包含数字序列的GIF都考虑
36
- if src and re.search(r'\d+\.gif$', src, re.IGNORECASE):
37
  # 处理相对路径
38
  if not src.startswith(('http://', 'https://')):
39
- # 尝试补全相对路径(针对常见情况)
40
  if src.startswith('/'):
41
  parsed_url = urlparse(html_content.url) if hasattr(html_content, 'url') else None
42
  if parsed_url:
@@ -47,17 +69,16 @@ def extract_gif_urls(html_content):
47
  continue
48
  gif_urls.append(src)
49
 
50
- # 按文件名排序(0101.gif, 0102.gif...)
51
  try:
52
  gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1)))
53
  except:
54
- pass # 排序失败时保持原顺序
55
  return gif_urls
56
 
57
  def download_gif(url, save_path):
58
  """下载GIF图片"""
59
  try:
60
- # 添加请求头,模拟浏览器行为
61
  headers = {
62
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
63
  }
@@ -73,20 +94,21 @@ def download_gif(url, save_path):
73
 
74
  def process_gif_for_ocr(gif_path):
75
  """处理GIF图片以提高OCR识别率"""
76
- # 打开GIF
 
 
77
  try:
78
  gif = Image.open(gif_path)
79
 
80
- # 尝试提取多个帧,避免只取第一帧可能丢失内容
81
  frames = []
82
  try:
83
- for i in range(10): # 最多尝试10帧
84
  gif.seek(i)
85
- frames.append(gif.convert('L')) # 转为灰度图
86
  except EOFError:
87
  pass
88
 
89
- # 如果没有获取到帧,返回None
90
  if not frames:
91
  return None
92
 
@@ -100,7 +122,7 @@ def process_gif_for_ocr(gif_path):
100
  # 轻微锐化
101
  frame = frame.filter(ImageFilter.SHARPEN)
102
 
103
- # 二值化处理,动态调整阈值
104
  threshold = 140
105
  frame = frame.point(lambda p: p > threshold and 255)
106
 
@@ -111,11 +133,10 @@ def process_gif_for_ocr(gif_path):
111
 
112
  def ocr_image(image):
113
  """对处理后的图像进行OCR识别"""
114
- if image is None:
115
- return ""
116
 
117
  try:
118
- # 使用Tesseract进行OCR,指定中文识别,增加更多配置参数
119
  custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
120
  text = pytesseract.image_to_string(image, config=custom_config)
121
 
@@ -124,22 +145,23 @@ def ocr_image(image):
124
  return text
125
  except Exception as e:
126
  print(f"OCR识别失败: {str(e)}")
127
- return "OCR识别失败"
128
 
129
  def extract_text_from_url(url, progress=gr.Progress()):
130
  """从指定URL提取GIF并识别文本"""
 
 
 
 
131
  try:
132
- # 创建临时目录
133
  with tempfile.TemporaryDirectory() as temp_dir:
134
  progress(0, desc="正在获取网页内容...")
135
 
136
- # 获取网页内容,添加请求头模拟浏览器
137
  headers = {
138
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
139
  }
140
  response = requests.get(url, timeout=15, headers=headers)
141
  if response.status_code != 200:
142
- # 确保返回两个值:错误信息和空列表
143
  return f"无法访问网页,状态码:{response.status_code}", []
144
 
145
  # 提取GIF URL
@@ -147,7 +169,6 @@ def extract_text_from_url(url, progress=gr.Progress()):
147
  gif_urls = extract_gif_urls(response.text)
148
 
149
  if not gif_urls:
150
- # 确保返回两个值:提示信息和空列表
151
  return "未找到符合条件的GIF图片", []
152
 
153
  progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片,开始处理...")
@@ -159,11 +180,9 @@ def extract_text_from_url(url, progress=gr.Progress()):
159
  current_progress = 0.3
160
 
161
  for i, gif_url in enumerate(gif_urls):
162
- # 更新进度
163
  current_progress += step
164
  progress(current_progress, desc=f"处理第{i+1}/{len(gif_urls)}个GIF...")
165
 
166
- # 提取文件名
167
  parsed_url = urlparse(gif_url)
168
  filename = os.path.basename(parsed_url.path)
169
 
@@ -173,13 +192,13 @@ def extract_text_from_url(url, progress=gr.Progress()):
173
  all_text.append(f"【{filename}】下载失败")
174
  continue
175
 
176
- # 处理GIF以提高OCR识别率
177
  processed_image = process_gif_for_ocr(gif_path)
178
  if processed_image is None:
179
  all_text.append(f"【{filename}】处理失败")
180
  continue
181
 
182
- # 保存处理后的图像用于展示
183
  processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
184
  processed_image.save(processed_path)
185
  gif_images.append(Image.open(processed_path))
@@ -188,17 +207,13 @@ def extract_text_from_url(url, progress=gr.Progress()):
188
  text = ocr_image(processed_image)
189
  all_text.append(f"【{filename}】\n{text}")
190
 
191
- # 避免请求过于频繁
192
  time.sleep(0.5)
193
 
194
- # 拼接所有文本
195
  result_text = "\n\n".join(all_text)
196
-
197
  progress(1.0, desc="处理完成")
198
  return result_text, gif_images
199
 
200
  except Exception as e:
201
- # 确保返回两个值:错误信息和空列表
202
  return f"处理过程出错:{str(e)}", []
203
 
204
  def create_interface():
@@ -208,13 +223,16 @@ def create_interface():
208
  # 霹雳布袋戏GIF文本提取工具
209
 
210
  这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片,并识别其中的文本内容。
211
-
212
- ## 使用方法:
213
- 1. 输入包含GIF的网页URL(例如:https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM)
214
- 2. 点击"提取文本"按钮
215
- 3. 等待处理完成,查看识别结果
216
  """)
217
 
 
 
 
 
 
 
 
 
218
  with gr.Row():
219
  url_input = gr.Textbox(
220
  label="网页URL",
@@ -234,16 +252,16 @@ def create_interface():
234
  label="处理后的GIF帧",
235
  show_label=True,
236
  elem_id="gallery",
237
- columns=2, # 适配旧版本Gradio的参数名称
238
  height="auto"
239
  )
240
 
241
  with gr.Row():
242
  gr.Markdown("""
243
  ## 注意事项:
 
244
  - 识别 accuracy 取决于GIF图片的清晰度
245
  - 处理可能需要几分钟时间,请耐心等待
246
- - 如遇网络问题,请检查URL是否正确或稍后重试
247
  """)
248
 
249
  # 设置事件
 
4
  import re
5
  import os
6
  import tempfile
7
+ import subprocess
 
8
  import numpy as np
9
  from urllib.parse import urlparse
10
  import time
11
 
12
+ # 尝试安装Tesseract(仅在Hugging Face Spaces环境中有效)
13
def install_tesseract():
    """Ensure the Tesseract OCR binary is available, installing it if needed.

    The function first probes for an existing ``tesseract`` executable. If
    the probe fails, it attempts an apt-based install of the engine and the
    Simplified Chinese language data, followed by the ``pytesseract``
    Python binding.

    Returns:
        bool: True if the probe succeeded or every installation command
        succeeded; False if any installation step failed.
    """
    import sys

    try:
        # Only the exit status matters here, so discard the output instead
        # of buffering it in pipes nobody reads.
        subprocess.run(
            ['tesseract', '--version'],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return True
    except (OSError, subprocess.CalledProcessError):
        # OSError covers both a missing binary (FileNotFoundError) and a
        # binary that cannot be executed (PermissionError).
        print("Tesseract未安装,尝试自动安装...")

    try:
        # Debian/Ubuntu package installation.
        subprocess.run(['apt-get', 'update'], check=True)
        subprocess.run(
            ['apt-get', 'install', '-y', 'tesseract-ocr', 'tesseract-ocr-chi-sim'],
            check=True,
        )
        # Install the binding with the interpreter that is running this
        # module; a bare `pip` on PATH may target a different environment.
        subprocess.run(
            [sys.executable, '-m', 'pip', 'install', 'pytesseract'],
            check=True,
        )
        return True
    except Exception as e:
        # Deliberately broad: this is a best-effort startup step and must
        # never prevent the module from loading.
        print(f"自动安装Tesseract失败: {str(e)}")
        return False
30
+
31
# Probe for (and, if necessary, install) Tesseract once at import time.
tesseract_available = install_tesseract()

# Import the OCR-related libraries only when Tesseract is available.
if tesseract_available:
    import shutil

    import pytesseract
    from PIL import Image, ImageEnhance, ImageFilter

    # Point pytesseract at the binary found on PATH; fall back to the
    # conventional location when the lookup finds nothing.
    pytesseract.pytesseract.tesseract_cmd = (
        shutil.which('tesseract') or r'/usr/bin/tesseract'
    )
43
+
44
  # 确保中文显示正常
45
  import matplotlib.pyplot as plt
46
  plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
47
 
 
 
 
 
 
 
48
  def extract_gif_urls(html_content):
49
  """从HTML内容中提取符合条件的GIF图片URL"""
50
  soup = BeautifulSoup(html_content, 'html.parser')
51
  img_tags = soup.find_all('img')
52
 
53
  gif_urls = []
 
54
  # 放宽正则匹配条件,确保能识别到相关GIF
55
+ pattern = r'\d+\.gif$'
56
 
57
  for img in img_tags:
58
  src = img.get('src', '')
59
+ if src and re.search(pattern, src, re.IGNORECASE):
 
60
  # 处理相对路径
61
  if not src.startswith(('http://', 'https://')):
 
62
  if src.startswith('/'):
63
  parsed_url = urlparse(html_content.url) if hasattr(html_content, 'url') else None
64
  if parsed_url:
 
69
  continue
70
  gif_urls.append(src)
71
 
72
+ # 按文件名排序
73
  try:
74
  gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1)))
75
  except:
76
+ pass
77
  return gif_urls
78
 
79
  def download_gif(url, save_path):
80
  """下载GIF图片"""
81
  try:
 
82
  headers = {
83
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
84
  }
 
94
 
95
  def process_gif_for_ocr(gif_path):
96
  """处理GIF图片以提高OCR识别率"""
97
+ if not tesseract_available:
98
+ return None
99
+
100
  try:
101
  gif = Image.open(gif_path)
102
 
103
+ # 尝试提取多个帧
104
  frames = []
105
  try:
106
+ for i in range(10):
107
  gif.seek(i)
108
+ frames.append(gif.convert('L'))
109
  except EOFError:
110
  pass
111
 
 
112
  if not frames:
113
  return None
114
 
 
122
  # 轻微锐化
123
  frame = frame.filter(ImageFilter.SHARPEN)
124
 
125
+ # 二值化处理
126
  threshold = 140
127
  frame = frame.point(lambda p: p > threshold and 255)
128
 
 
133
 
134
  def ocr_image(image):
135
  """对处理后的图像进行OCR识别"""
136
+ if not tesseract_available or image is None:
137
+ return "Tesseract OCR未安装,无法识别文本"
138
 
139
  try:
 
140
  custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
141
  text = pytesseract.image_to_string(image, config=custom_config)
142
 
 
145
  return text
146
  except Exception as e:
147
  print(f"OCR识别失败: {str(e)}")
148
+ return f"OCR识别失败: {str(e)}"
149
 
150
  def extract_text_from_url(url, progress=gr.Progress()):
151
  """从指定URL提取GIF并识别文本"""
152
+ # 检查Tesseract是否可用
153
+ if not tesseract_available:
154
+ return "Tesseract OCR安装失败,无法进行文本识别。请联系管理员解决此问题。", []
155
+
156
  try:
 
157
  with tempfile.TemporaryDirectory() as temp_dir:
158
  progress(0, desc="正在获取网页内容...")
159
 
 
160
  headers = {
161
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
162
  }
163
  response = requests.get(url, timeout=15, headers=headers)
164
  if response.status_code != 200:
 
165
  return f"无法访问网页,状态码:{response.status_code}", []
166
 
167
  # 提取GIF URL
 
169
  gif_urls = extract_gif_urls(response.text)
170
 
171
  if not gif_urls:
 
172
  return "未找到符合条件的GIF图片", []
173
 
174
  progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片,开始处理...")
 
180
  current_progress = 0.3
181
 
182
  for i, gif_url in enumerate(gif_urls):
 
183
  current_progress += step
184
  progress(current_progress, desc=f"处理第{i+1}/{len(gif_urls)}个GIF...")
185
 
 
186
  parsed_url = urlparse(gif_url)
187
  filename = os.path.basename(parsed_url.path)
188
 
 
192
  all_text.append(f"【{filename}】下载失败")
193
  continue
194
 
195
+ # 处理GIF
196
  processed_image = process_gif_for_ocr(gif_path)
197
  if processed_image is None:
198
  all_text.append(f"【{filename}】处理失败")
199
  continue
200
 
201
+ # 保存处理后的图像
202
  processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
203
  processed_image.save(processed_path)
204
  gif_images.append(Image.open(processed_path))
 
207
  text = ocr_image(processed_image)
208
  all_text.append(f"【{filename}】\n{text}")
209
 
 
210
  time.sleep(0.5)
211
 
 
212
  result_text = "\n\n".join(all_text)
 
213
  progress(1.0, desc="处理完成")
214
  return result_text, gif_images
215
 
216
  except Exception as e:
 
217
  return f"处理过程出错:{str(e)}", []
218
 
219
  def create_interface():
 
223
  # 霹雳布袋戏GIF文本提取工具
224
 
225
  这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片,并识别其中的文本内容。
 
 
 
 
 
226
  """)
227
 
228
+ # 显示Tesseract状态
229
+ if not tesseract_available:
230
+ gr.Markdown("""
231
+ <div style="background-color: #ffebee; padding: 10px; border-radius: 5px; color: #b71c1c;">
232
+ ⚠️ 注意:Tesseract OCR引擎安装失败,可能无法正常识别文本。
233
+ </div>
234
+ """)
235
+
236
  with gr.Row():
237
  url_input = gr.Textbox(
238
  label="网页URL",
 
252
  label="处理后的GIF帧",
253
  show_label=True,
254
  elem_id="gallery",
255
+ columns=2,
256
  height="auto"
257
  )
258
 
259
  with gr.Row():
260
  gr.Markdown("""
261
  ## 注意事项:
262
+ - 首次使用可能需要时间安装OCR组件
263
  - 识别 accuracy 取决于GIF图片的清晰度
264
  - 处理可能需要几分钟时间,请耐心等待
 
265
  """)
266
 
267
  # 设置事件