Update app.py
Browse files
app.py
CHANGED
|
@@ -27,41 +27,71 @@ def extract_gif_urls(html_content):
|
|
| 27 |
|
| 28 |
gif_urls = []
|
| 29 |
# 匹配霹雳布袋戏相关的GIF格式,特别是0101.gif这类序列
|
|
|
|
| 30 |
pattern = r'010\d+\.gif$'
|
| 31 |
|
| 32 |
for img in img_tags:
|
| 33 |
src = img.get('src', '')
|
| 34 |
-
|
|
|
|
| 35 |
# 处理相对路径
|
| 36 |
if not src.startswith(('http://', 'https://')):
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
gif_urls.append(src)
|
| 39 |
|
| 40 |
# 按文件名排序(0101.gif, 0102.gif...)
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
| 42 |
return gif_urls
|
| 43 |
|
| 44 |
def download_gif(url, save_path):
|
| 45 |
"""下载GIF图片"""
|
| 46 |
try:
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
if response.status_code == 200:
|
| 49 |
with open(save_path, 'wb') as f:
|
| 50 |
f.write(response.content)
|
| 51 |
return True
|
| 52 |
return False
|
| 53 |
-
except:
|
|
|
|
| 54 |
return False
|
| 55 |
|
| 56 |
def process_gif_for_ocr(gif_path):
|
| 57 |
"""处理GIF图片以提高OCR识别率"""
|
| 58 |
# 打开GIF
|
| 59 |
-
gif = Image.open(gif_path)
|
| 60 |
-
|
| 61 |
-
# 提取第一帧(通常文本在第一帧)
|
| 62 |
try:
|
| 63 |
-
gif.
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# 增强对比度
|
| 67 |
enhancer = ImageEnhance.Contrast(frame)
|
|
@@ -70,12 +100,13 @@ def process_gif_for_ocr(gif_path):
|
|
| 70 |
# 轻微锐化
|
| 71 |
frame = frame.filter(ImageFilter.SHARPEN)
|
| 72 |
|
| 73 |
-
#
|
| 74 |
-
threshold =
|
| 75 |
frame = frame.point(lambda p: p > threshold and 255)
|
| 76 |
|
| 77 |
return frame
|
| 78 |
-
except
|
|
|
|
| 79 |
return None
|
| 80 |
|
| 81 |
def ocr_image(image):
|
|
@@ -83,13 +114,17 @@ def ocr_image(image):
|
|
| 83 |
if image is None:
|
| 84 |
return ""
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
def extract_text_from_url(url, progress=gr.Progress()):
|
| 95 |
"""从指定URL提取GIF并识别文本"""
|
|
@@ -98,17 +133,22 @@ def extract_text_from_url(url, progress=gr.Progress()):
|
|
| 98 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 99 |
progress(0, desc="正在获取网页内容...")
|
| 100 |
|
| 101 |
-
#
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
| 103 |
if response.status_code != 200:
|
| 104 |
-
|
|
|
|
| 105 |
|
| 106 |
# 提取GIF URL
|
| 107 |
progress(0.2, desc="正在提取GIF图片链接...")
|
| 108 |
gif_urls = extract_gif_urls(response.text)
|
| 109 |
|
| 110 |
if not gif_urls:
|
| 111 |
-
|
|
|
|
| 112 |
|
| 113 |
progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片,开始处理...")
|
| 114 |
|
|
@@ -142,7 +182,7 @@ def extract_text_from_url(url, progress=gr.Progress()):
|
|
| 142 |
# 保存处理后的图像用于展示
|
| 143 |
processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
|
| 144 |
processed_image.save(processed_path)
|
| 145 |
-
gif_images.append(processed_path)
|
| 146 |
|
| 147 |
# 识别文本
|
| 148 |
text = ocr_image(processed_image)
|
|
@@ -155,9 +195,10 @@ def extract_text_from_url(url, progress=gr.Progress()):
|
|
| 155 |
result_text = "\n\n".join(all_text)
|
| 156 |
|
| 157 |
progress(1.0, desc="处理完成")
|
| 158 |
-
return result_text,
|
| 159 |
|
| 160 |
except Exception as e:
|
|
|
|
| 161 |
return f"处理过程出错:{str(e)}", []
|
| 162 |
|
| 163 |
def create_interface():
|
|
|
|
| 27 |
|
| 28 |
gif_urls = []
|
| 29 |
# 匹配霹雳布袋戏相关的GIF格式,特别是0101.gif这类序列
|
| 30 |
+
# 放宽正则匹配条件,确保能识别到相关GIF
|
| 31 |
pattern = r'010\d+\.gif$'
|
| 32 |
|
| 33 |
for img in img_tags:
|
| 34 |
src = img.get('src', '')
|
| 35 |
+
# 放宽匹配条件,只要包含数字序列的GIF都考虑
|
| 36 |
+
if src and re.search(r'\d+\.gif$', src, re.IGNORECASE):
|
| 37 |
# 处理相对路径
|
| 38 |
if not src.startswith(('http://', 'https://')):
|
| 39 |
+
# 尝试补全相对路径(针对常见情况)
|
| 40 |
+
if src.startswith('/'):
|
| 41 |
+
parsed_url = urlparse(html_content.url) if hasattr(html_content, 'url') else None
|
| 42 |
+
if parsed_url:
|
| 43 |
+
src = f"{parsed_url.scheme}://{parsed_url.netloc}{src}"
|
| 44 |
+
else:
|
| 45 |
+
continue
|
| 46 |
+
else:
|
| 47 |
+
continue
|
| 48 |
gif_urls.append(src)
|
| 49 |
|
| 50 |
# 按文件名排序(0101.gif, 0102.gif...)
|
| 51 |
+
try:
|
| 52 |
+
gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1)))
|
| 53 |
+
except:
|
| 54 |
+
pass # 排序失败时保持原顺序
|
| 55 |
return gif_urls
|
| 56 |
|
| 57 |
def download_gif(url: str, save_path: str) -> bool:
    """Download the GIF at ``url`` and write it to ``save_path``.

    The request is sent with a browser-like ``User-Agent`` header and a
    15-second timeout. The body is streamed to disk in chunks, and the
    response is always closed so the pooled connection is released even
    when the server answers with a non-200 status.

    Args:
        url: Absolute URL of the GIF to fetch.
        save_path: Filesystem path the downloaded bytes are written to.

    Returns:
        True if the server answered 200 and the body was written; False on
        any other status code or on any error (the error is printed).
    """
    # Browser-like request header sent with the download.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # With stream=True the connection is only returned to the pool once
        # the body is consumed or the response is closed; the context manager
        # guarantees the close on every path (including the non-200 return).
        with requests.get(url, stream=True, timeout=15, headers=headers) as response:
            if response.status_code != 200:
                return False
            with open(save_path, 'wb') as f:
                # Write chunk by chunk instead of materialising the whole
                # body in memory via response.content.
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            return True
    except Exception as e:
        # Deliberate best-effort boundary: this function reports failure via
        # its return value and never raises.
        print(f"下载GIF失败: {str(e)}")
        return False
|
| 73 |
|
| 74 |
def process_gif_for_ocr(gif_path):
|
| 75 |
"""处理GIF图片以提高OCR识别率"""
|
| 76 |
# 打开GIF
|
|
|
|
|
|
|
|
|
|
| 77 |
try:
|
| 78 |
+
gif = Image.open(gif_path)
|
| 79 |
+
|
| 80 |
+
# 尝试提取多个帧,避免只取第一帧可能丢失内容
|
| 81 |
+
frames = []
|
| 82 |
+
try:
|
| 83 |
+
for i in range(10): # 最多尝试10帧
|
| 84 |
+
gif.seek(i)
|
| 85 |
+
frames.append(gif.convert('L')) # 转为灰度图
|
| 86 |
+
except EOFError:
|
| 87 |
+
pass
|
| 88 |
+
|
| 89 |
+
# 如果没有获取到帧,返回None
|
| 90 |
+
if not frames:
|
| 91 |
+
return None
|
| 92 |
+
|
| 93 |
+
# 取第一帧进行处理
|
| 94 |
+
frame = frames[0]
|
| 95 |
|
| 96 |
# 增强对比度
|
| 97 |
enhancer = ImageEnhance.Contrast(frame)
|
|
|
|
| 100 |
# 轻微锐化
|
| 101 |
frame = frame.filter(ImageFilter.SHARPEN)
|
| 102 |
|
| 103 |
+
# 二值化处理,动态调整阈值
|
| 104 |
+
threshold = 140
|
| 105 |
frame = frame.point(lambda p: p > threshold and 255)
|
| 106 |
|
| 107 |
return frame
|
| 108 |
+
except Exception as e:
|
| 109 |
+
print(f"处理GIF失败: {str(e)}")
|
| 110 |
return None
|
| 111 |
|
| 112 |
def ocr_image(image):
|
|
|
|
| 114 |
if image is None:
|
| 115 |
return ""
|
| 116 |
|
| 117 |
+
try:
|
| 118 |
+
# 使用Tesseract进行OCR,指定中文识别,增加更多配置参数
|
| 119 |
+
custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
|
| 120 |
+
text = pytesseract.image_to_string(image, config=custom_config)
|
| 121 |
+
|
| 122 |
+
# 清理识别结果
|
| 123 |
+
text = text.replace('\f', '').replace('\n\n', '\n').strip()
|
| 124 |
+
return text
|
| 125 |
+
except Exception as e:
|
| 126 |
+
print(f"OCR识别失败: {str(e)}")
|
| 127 |
+
return "OCR识别失败"
|
| 128 |
|
| 129 |
def extract_text_from_url(url, progress=gr.Progress()):
|
| 130 |
"""从指定URL提取GIF并识别文本"""
|
|
|
|
| 133 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 134 |
progress(0, desc="正在获取网页内容...")
|
| 135 |
|
| 136 |
+
# 获取网页内容,添加请求头模拟浏览器
|
| 137 |
+
headers = {
|
| 138 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 139 |
+
}
|
| 140 |
+
response = requests.get(url, timeout=15, headers=headers)
|
| 141 |
if response.status_code != 200:
|
| 142 |
+
# 确保返回两个值:错误信息和空列表
|
| 143 |
+
return f"无法访问网页,状态码:{response.status_code}", []
|
| 144 |
|
| 145 |
# 提取GIF URL
|
| 146 |
progress(0.2, desc="正在提取GIF图片链接...")
|
| 147 |
gif_urls = extract_gif_urls(response.text)
|
| 148 |
|
| 149 |
if not gif_urls:
|
| 150 |
+
# 确保返回两个值:提示信息和空列表
|
| 151 |
+
return "未找到符合条件的GIF图片", []
|
| 152 |
|
| 153 |
progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片,开始处理...")
|
| 154 |
|
|
|
|
| 182 |
# 保存处理后的图像用于展示
|
| 183 |
processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
|
| 184 |
processed_image.save(processed_path)
|
| 185 |
+
gif_images.append(Image.open(processed_path))
|
| 186 |
|
| 187 |
# 识别文本
|
| 188 |
text = ocr_image(processed_image)
|
|
|
|
| 195 |
result_text = "\n\n".join(all_text)
|
| 196 |
|
| 197 |
progress(1.0, desc="处理完成")
|
| 198 |
+
return result_text, gif_images
|
| 199 |
|
| 200 |
except Exception as e:
|
| 201 |
+
# 确保返回两个值:错误信息和空列表
|
| 202 |
return f"处理过程出错:{str(e)}", []
|
| 203 |
|
| 204 |
def create_interface():
|