File size: 9,873 Bytes
5ddadc9 28bf845 5ddadc9 28bf845 5ddadc9 5479c69 28bf845 5ddadc9 28bf845 5ddadc9 5479c69 5ddadc9 28bf845 5479c69 28bf845 5ddadc9 5479c69 5ddadc9 5479c69 5ddadc9 28bf845 5ddadc9 5479c69 28bf845 5479c69 28bf845 5479c69 28bf845 5479c69 5ddadc9 28bf845 5479c69 5ddadc9 5479c69 5ddadc9 28bf845 5ddadc9 5479c69 28bf845 5ddadc9 28bf845 5ddadc9 5479c69 5ddadc9 5479c69 5ddadc9 5479c69 5ddadc9 28bf845 5ddadc9 28bf845 5ddadc9 5479c69 5ddadc9 5479c69 5ddadc9 28bf845 5ddadc9 c86ea4a 28bf845 77d74e9 c86ea4a 5ddadc9 28bf845 5ddadc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 |
import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
import os
import tempfile
import subprocess
import numpy as np
from urllib.parse import urlparse
import time
# Try to install Tesseract (per the original note, only expected to work
# inside a Hugging Face Spaces environment).
def install_tesseract():
    """Make sure the Tesseract binary and the ``pytesseract`` binding exist.

    The function first runs ``tesseract --version``. If that fails, it
    installs the ``tesseract-ocr`` and ``tesseract-ocr-chi-sim`` packages via
    ``apt-get``. It then checks that ``pytesseract`` is importable and, if it
    is not, installs it with pip for the interpreter that is currently
    running.

    Returns:
        bool: ``True`` when the binary check/installation and the binding
        check/installation all succeeded, ``False`` when any installation
        step raised.
    """
    import importlib
    import importlib.util
    import sys

    # Probe for the binary; both "not on PATH" and "exits non-zero" count as
    # missing.
    try:
        subprocess.run(
            ['tesseract', '--version'],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        binary_present = True
    except (FileNotFoundError, subprocess.CalledProcessError):
        binary_present = False

    try:
        if not binary_present:
            print("Tesseract未安装,尝试自动安装...")
            # Debian/Ubuntu only; presumably needs root -- verify for the
            # target environment.
            subprocess.run(['apt-get', 'update'], check=True)
            subprocess.run(
                ['apt-get', 'install', '-y', 'tesseract-ocr', 'tesseract-ocr-chi-sim'],
                check=True,
            )

        # The binary alone is not enough: the module imports pytesseract as
        # soon as this function reports success.
        if importlib.util.find_spec('pytesseract') is None:
            # Use the running interpreter's pip so the package lands in the
            # environment that will actually import it.
            subprocess.run(
                [sys.executable, '-m', 'pip', 'install', 'pytesseract'],
                check=True,
            )
            # Let the import system notice the freshly installed package.
            importlib.invalidate_caches()
    except Exception as e:
        print(f"自动安装Tesseract失败: {str(e)}")
        return False
    return True
# Check for (and if necessary install) Tesseract once, at import time.
tesseract_available = install_tesseract()

# Import the OCR-related libraries only when Tesseract is available.
if tesseract_available:
    import shutil

    import pytesseract
    from PIL import Image, ImageEnhance, ImageFilter

    # Point pytesseract at the binary found on PATH; fall back to the
    # conventional location when the lookup finds nothing.
    pytesseract.pytesseract.tesseract_cmd = (
        shutil.which('tesseract') or r'/usr/bin/tesseract'
    )

# Prefer fonts that can render Chinese text in matplotlib output.
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
def extract_gif_urls(html_content, base_url=None):
    """Extract the URLs of numbered GIF images from ``<img>`` tags.

    An image qualifies when its ``src`` ends in ``<digits>.gif``
    (case-insensitive). The result is ordered by that trailing number;
    images with equal numbers keep their document order.

    Args:
        html_content: HTML markup to parse.
        base_url: Optional URL of the page the markup came from. When given,
            relative ``src`` values are resolved against it. When omitted
            (and ``html_content`` has no ``url`` attribute), relative
            ``src`` values are skipped.

    Returns:
        list[str]: Absolute GIF URLs sorted by their numeric file name.
    """
    from urllib.parse import urljoin

    # One pattern serves both filtering and sorting, so an upper-case
    # ``.GIF`` that passes the filter can never break the sort key.
    numbered_gif = re.compile(r'(\d+)\.gif$', re.IGNORECASE)

    # Backward compatibility: accept a base URL carried on the input object.
    if base_url is None and hasattr(html_content, 'url'):
        base_url = html_content.url

    soup = BeautifulSoup(html_content, 'html.parser')
    numbered = []
    for img in soup.find_all('img'):
        src = (img.get('src', '') or '').strip()
        match = numbered_gif.search(src) if src else None
        if match is None:
            continue
        # Resolve relative paths when a base is known; otherwise skip them.
        if not src.startswith(('http://', 'https://')):
            if not base_url:
                continue
            src = urljoin(base_url, src)
        numbered.append((int(match.group(1)), src))

    # Sort by file number only (stable), not by the URL text.
    numbered.sort(key=lambda pair: pair[0])
    return [gif_url for _, gif_url in numbered]
def download_gif(url, save_path):
    """Download a GIF and write it to ``save_path``.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path; opened in binary write mode.

    Returns:
        bool: ``True`` when the server answered 200 and the body was written,
        ``False`` for any other status code or when an exception occurred
        (the exception is printed, not raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # With stream=True the connection stays checked out until the body is
        # consumed or the response is closed; the context manager closes it
        # on every path, including the non-200 early return.
        with requests.get(url, stream=True, timeout=15, headers=headers) as response:
            if response.status_code != 200:
                return False
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True
    except Exception as e:
        print(f"下载GIF失败: {str(e)}")
        return False
def process_gif_for_ocr(gif_path, threshold=140):
    """Prepare the first frame of a GIF for OCR.

    The frame is converted to grayscale, its contrast is doubled, it is
    sharpened, and finally binarized: pixels brighter than ``threshold``
    become 255 and all others become 0.

    Args:
        gif_path: Path of the GIF file to open.
        threshold: Grayscale cut-off used for binarization.

    Returns:
        The processed image, or ``None`` when Tesseract is unavailable or the
        file could not be processed (the error is printed, not raised).
    """
    if not tesseract_available:
        return None
    try:
        # Only the first frame is used, so only that frame is decoded.
        # convert() yields a separate image object, which lets the source
        # file be closed as soon as the block exits.
        with Image.open(gif_path) as gif:
            frame = gif.convert('L')

        # Boost contrast.
        frame = ImageEnhance.Contrast(frame).enhance(2.0)
        # Light sharpening.
        frame = frame.filter(ImageFilter.SHARPEN)
        # Binarize.
        return frame.point(lambda p: 255 if p > threshold else 0)
    except Exception as e:
        print(f"处理GIF失败: {str(e)}")
        return None
def ocr_image(image):
    """Run Tesseract OCR on a preprocessed image.

    Args:
        image: Image to recognize, or ``None``.

    Returns:
        str: The recognized text with form feeds removed, runs of
        consecutive newlines collapsed to a single newline, and surrounding
        whitespace stripped. A human-readable message is returned instead
        when Tesseract is unavailable, no image was supplied, or recognition
        raised.
    """
    if not tesseract_available:
        return "Tesseract OCR未安装,无法识别文本"
    if image is None:
        # Report the real cause instead of blaming a missing Tesseract.
        return "未提供图像,无法识别文本"
    try:
        custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
        text = pytesseract.image_to_string(image, config=custom_config)
        # Clean up the result; the regex also handles three or more
        # consecutive newlines, which a single replace pass would leave as
        # blank lines.
        text = re.sub(r'\n{2,}', '\n', text.replace('\f', ''))
        return text.strip()
    except Exception as e:
        print(f"OCR识别失败: {str(e)}")
        return f"OCR识别失败: {str(e)}"
def extract_text_from_url(url, progress=gr.Progress()):
    """Fetch a page, download its numbered GIFs and OCR each one.

    Args:
        url: Address of the web page to scan.
        progress: Gradio progress tracker, called with a fraction and a
            description as the work advances.

    Returns:
        tuple[str, list]: The combined text (one ``【filename】`` section per
        GIF, separated by blank lines) and the list of processed images.
        On failure the first element is an error message and the list is
        empty.
    """
    # Bail out early when Tesseract is unavailable.
    if not tesseract_available:
        return "Tesseract OCR安装失败,无法进行文本识别。请联系管理员解决此问题。", []
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            progress(0, desc="正在获取网页内容...")
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, timeout=15, headers=headers)
            if response.status_code != 200:
                return f"无法访问网页,状态码:{response.status_code}", []

            # Extract the GIF URLs.
            progress(0.2, desc="正在提取GIF图片链接...")
            gif_urls = extract_gif_urls(response.text)
            if not gif_urls:
                return "未找到符合条件的GIF图片", []
            total = len(gif_urls)
            progress(0.3, desc=f"找到{total}个GIF图片,开始处理...")

            # Download and process every GIF.
            all_text = []
            gif_images = []
            for i, gif_url in enumerate(gif_urls):
                # The remaining 70% of the bar is split evenly across GIFs.
                progress(0.3 + 0.7 * (i + 1) / total, desc=f"处理第{i+1}/{total}个GIF...")
                filename = os.path.basename(urlparse(gif_url).path)

                # Prefix the on-disk name with the index so two GIFs sharing
                # a basename cannot overwrite each other.
                gif_path = os.path.join(temp_dir, f"{i}_{filename}")
                if not download_gif(gif_url, gif_path):
                    all_text.append(f"【{filename}】下载失败")
                    continue

                processed_image = process_gif_for_ocr(gif_path)
                if processed_image is None:
                    all_text.append(f"【{filename}】处理失败")
                    continue

                # Hand the in-memory image to the gallery directly. Re-opening
                # a copy saved inside temp_dir would return lazily loaded
                # images whose files are deleted when this block exits.
                gif_images.append(processed_image)

                # Recognize the text.
                text = ocr_image(processed_image)
                all_text.append(f"【{filename}】\n{text}")
                time.sleep(0.5)

            result_text = "\n\n".join(all_text)
            progress(1.0, desc="处理完成")
            return result_text, gif_images
    except Exception as e:
        return f"处理过程出错:{str(e)}", []
def create_interface():
    """Assemble the Gradio Blocks UI for the GIF text-extraction tool.

    The layout holds an intro, an optional warning shown when Tesseract is
    unavailable, a URL field, a trigger button, a text box and a gallery for
    the results, and a notes section. Clicking the button calls
    ``extract_text_from_url`` with the URL field as input.

    Returns:
        The constructed ``gr.Blocks`` application (not launched).
    """
    intro_md = (
        "\n"
        "# 霹雳布袋戏GIF文本提取工具\n"
        "这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片,并识别其中的文本内容。\n"
    )
    warning_md = (
        "\n"
        '<div style="background-color: #ffebee; padding: 10px; border-radius: 5px; color: #b71c1c;">\n'
        "⚠️ 注意:Tesseract OCR引擎安装失败,可能无法正常识别文本。\n"
        "</div>\n"
    )
    notes_md = (
        "\n"
        "## 注意事项:\n"
        "- 首次使用可能需要时间安装OCR组件\n"
        "- 识别 accuracy 取决于GIF图片的清晰度\n"
        "- 处理可能需要几分钟时间,请耐心等待\n"
    )

    with gr.Blocks(title="霹雳布袋戏GIF文本提取工具") as app:
        gr.Markdown(intro_md)

        # Surface the Tesseract status to the user.
        if not tesseract_available:
            gr.Markdown(warning_md)

        with gr.Row():
            url_box = gr.Textbox(
                label="网页URL",
                placeholder="请输入包含GIF的网页地址",
                value="https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM",
            )

        with gr.Row():
            run_button = gr.Button("提取文本", variant="primary")

        with gr.Row():
            with gr.Column(scale=1):
                text_output = gr.Textbox(label="识别结果", lines=20)
            with gr.Column(scale=1):
                gallery_output = gr.Gallery(
                    label="处理后的GIF帧",
                    show_label=True,
                    elem_id="gallery",
                    columns=2,
                    height="auto",
                )

        with gr.Row():
            gr.Markdown(notes_md)

        # Wire the button to the extraction routine.
        run_button.click(
            fn=extract_text_from_url,
            inputs=[url_box],
            outputs=[text_output, gallery_output],
        )

    return app
# Build the interface and start serving it when executed as a script.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
|