wkplhc committed on
Commit
5479c69
·
verified ·
1 Parent(s): 77d74e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -26
app.py CHANGED
@@ -27,41 +27,71 @@ def extract_gif_urls(html_content):
27
 
28
  gif_urls = []
29
  # 匹配霹雳布袋戏相关的GIF格式,特别是0101.gif这类序列
 
30
  pattern = r'010\d+\.gif$'
31
 
32
  for img in img_tags:
33
  src = img.get('src', '')
34
- if src and re.search(pattern, src, re.IGNORECASE):
 
35
  # 处理相对路径
36
  if not src.startswith(('http://', 'https://')):
37
- continue # 简单处理,实际可能需要更复杂的URL拼接
 
 
 
 
 
 
 
 
38
  gif_urls.append(src)
39
 
40
  # 按文件名排序(0101.gif, 0102.gif...)
41
- gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1)))
 
 
 
42
  return gif_urls
43
 
44
  def download_gif(url, save_path):
45
  """下载GIF图片"""
46
  try:
47
- response = requests.get(url, stream=True, timeout=10)
 
 
 
 
48
  if response.status_code == 200:
49
  with open(save_path, 'wb') as f:
50
  f.write(response.content)
51
  return True
52
  return False
53
- except:
 
54
  return False
55
 
56
  def process_gif_for_ocr(gif_path):
57
  """处理GIF图片以提高OCR识别率"""
58
  # 打开GIF
59
- gif = Image.open(gif_path)
60
-
61
- # 提取第一帧(通常文本在第一帧)
62
  try:
63
- gif.seek(0)
64
- frame = gif.convert('L') # 转为灰度图
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  # 增强对比度
67
  enhancer = ImageEnhance.Contrast(frame)
@@ -70,12 +100,13 @@ def process_gif_for_ocr(gif_path):
70
  # 轻微锐化
71
  frame = frame.filter(ImageFilter.SHARPEN)
72
 
73
- # 二值化处理
74
- threshold = 150
75
  frame = frame.point(lambda p: p > threshold and 255)
76
 
77
  return frame
78
- except EOFError:
 
79
  return None
80
 
81
  def ocr_image(image):
@@ -83,13 +114,17 @@ def ocr_image(image):
83
  if image is None:
84
  return ""
85
 
86
- # 使用Tesseract进行OCR,指定中文识别
87
- custom_config = r'--oem 3 --psm 6 -l chi_sim+eng'
88
- text = pytesseract.image_to_string(image, config=custom_config)
89
-
90
- # 清理识别结果
91
- text = text.replace('\f', '').replace('\n\n', '\n').strip()
92
- return text
 
 
 
 
93
 
94
  def extract_text_from_url(url, progress=gr.Progress()):
95
  """从指定URL提取GIF并识别文本"""
@@ -98,17 +133,22 @@ def extract_text_from_url(url, progress=gr.Progress()):
98
  with tempfile.TemporaryDirectory() as temp_dir:
99
  progress(0, desc="正在获取网页内容...")
100
 
101
- # 获取网页内容
102
- response = requests.get(url, timeout=15)
 
 
 
103
  if response.status_code != 200:
104
- return f"无法访问网页,状态码:{response.status_code}"
 
105
 
106
  # 提取GIF URL
107
  progress(0.2, desc="正在提取GIF图片链接...")
108
  gif_urls = extract_gif_urls(response.text)
109
 
110
  if not gif_urls:
111
- return "未找到符合条件的GIF图片"
 
112
 
113
  progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片,开始处理...")
114
 
@@ -142,7 +182,7 @@ def extract_text_from_url(url, progress=gr.Progress()):
142
  # 保存处理后的图像用于展示
143
  processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
144
  processed_image.save(processed_path)
145
- gif_images.append(processed_path)
146
 
147
  # 识别文本
148
  text = ocr_image(processed_image)
@@ -155,9 +195,10 @@ def extract_text_from_url(url, progress=gr.Progress()):
155
  result_text = "\n\n".join(all_text)
156
 
157
  progress(1.0, desc="处理完成")
158
- return result_text, [Image.open(img_path) for img_path in gif_images]
159
 
160
  except Exception as e:
 
161
  return f"处理过程出错:{str(e)}", []
162
 
163
  def create_interface():
 
27
 
28
  gif_urls = []
29
  # 匹配霹雳布袋戏相关的GIF格式,特别是0101.gif这类序列
30
+ # 放宽正则匹配条件,确保能识别到相关GIF
31
  pattern = r'010\d+\.gif$'
32
 
33
  for img in img_tags:
34
  src = img.get('src', '')
35
+ # 放宽匹配条件,只要包含数字序列的GIF都考虑
36
+ if src and re.search(r'\d+\.gif$', src, re.IGNORECASE):
37
  # 处理相对路径
38
  if not src.startswith(('http://', 'https://')):
39
+ # 尝试补全相对路径(针对常见情况)
40
+ if src.startswith('/'):
41
+ parsed_url = urlparse(html_content.url) if hasattr(html_content, 'url') else None
42
+ if parsed_url:
43
+ src = f"{parsed_url.scheme}://{parsed_url.netloc}{src}"
44
+ else:
45
+ continue
46
+ else:
47
+ continue
48
  gif_urls.append(src)
49
 
50
  # 按文件名排序(0101.gif, 0102.gif...)
51
+ try:
52
+ gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1)))
53
+ except:
54
+ pass # 排序失败时保持原顺序
55
  return gif_urls
56
 
57
  def download_gif(url, save_path):
58
  """下载GIF图片"""
59
  try:
60
+ # 添加请求头,模拟浏览器行为
61
+ headers = {
62
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
63
+ }
64
+ response = requests.get(url, stream=True, timeout=15, headers=headers)
65
  if response.status_code == 200:
66
  with open(save_path, 'wb') as f:
67
  f.write(response.content)
68
  return True
69
  return False
70
+ except Exception as e:
71
+ print(f"下载GIF失败: {str(e)}")
72
  return False
73
 
74
  def process_gif_for_ocr(gif_path):
75
  """处理GIF图片以提高OCR识别率"""
76
  # 打开GIF
 
 
 
77
  try:
78
+ gif = Image.open(gif_path)
79
+
80
+ # 尝试提取多个帧,避免只取第一帧可能丢失内容
81
+ frames = []
82
+ try:
83
+ for i in range(10): # 最多尝试10帧
84
+ gif.seek(i)
85
+ frames.append(gif.convert('L')) # 转为灰度图
86
+ except EOFError:
87
+ pass
88
+
89
+ # 如果没有获取到帧,返回None
90
+ if not frames:
91
+ return None
92
+
93
+ # 取第一帧进行处理
94
+ frame = frames[0]
95
 
96
  # 增强对比度
97
  enhancer = ImageEnhance.Contrast(frame)
 
100
  # 轻微锐化
101
  frame = frame.filter(ImageFilter.SHARPEN)
102
 
103
+ # 二值化处理,动态调整阈值
104
+ threshold = 140
105
  frame = frame.point(lambda p: p > threshold and 255)
106
 
107
  return frame
108
+ except Exception as e:
109
+ print(f"处理GIF失败: {str(e)}")
110
  return None
111
 
112
  def ocr_image(image):
 
114
  if image is None:
115
  return ""
116
 
117
+ try:
118
+ # 使用Tesseract进行OCR,指定中文识别,增加更多配置参数
119
+ custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
120
+ text = pytesseract.image_to_string(image, config=custom_config)
121
+
122
+ # 清理识别结果
123
+ text = text.replace('\f', '').replace('\n\n', '\n').strip()
124
+ return text
125
+ except Exception as e:
126
+ print(f"OCR识别失败: {str(e)}")
127
+ return "OCR识别失败"
128
 
129
  def extract_text_from_url(url, progress=gr.Progress()):
130
  """从指定URL提取GIF并识别文本"""
 
133
  with tempfile.TemporaryDirectory() as temp_dir:
134
  progress(0, desc="正在获取网页内容...")
135
 
136
+ # 获取网页内容,添加请求头模拟浏览器
137
+ headers = {
138
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
139
+ }
140
+ response = requests.get(url, timeout=15, headers=headers)
141
  if response.status_code != 200:
142
+ # 确保返回两个值:错误信息和空列表
143
+ return f"无法访问网页,状态码:{response.status_code}", []
144
 
145
  # 提取GIF URL
146
  progress(0.2, desc="正在提取GIF图片链接...")
147
  gif_urls = extract_gif_urls(response.text)
148
 
149
  if not gif_urls:
150
+ # 确保返回两个值:提示信息和空列表
151
+ return "未找到符合条件的GIF图片", []
152
 
153
  progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片,开始处理...")
154
 
 
182
  # 保存处理后的图像用于展示
183
  processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
184
  processed_image.save(processed_path)
185
+ gif_images.append(Image.open(processed_path))
186
 
187
  # 识别文本
188
  text = ocr_image(processed_image)
 
195
  result_text = "\n\n".join(all_text)
196
 
197
  progress(1.0, desc="处理完成")
198
+ return result_text, gif_images
199
 
200
  except Exception as e:
201
+ # 确保返回两个值:错误信息和空列表
202
  return f"处理过程出错:{str(e)}", []
203
 
204
  def create_interface():