|
|
import gradio as gr |
|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
import re |
|
|
import os |
|
|
import tempfile |
|
|
import subprocess |
|
|
import numpy as np |
|
|
from urllib.parse import urlparse |
|
|
import time |
|
|
|
|
|
|
|
|
def install_tesseract():
    """Ensure the Tesseract binary and the ``pytesseract`` binding are usable.

    The binary is probed with ``tesseract --version``.  When it is missing,
    ``apt-get`` is used to install ``tesseract-ocr`` together with the
    Simplified Chinese language pack.  Afterwards the ``pytesseract`` Python
    package is installed if it cannot be found, because the module imports it
    as soon as this function reports success.

    Returns:
        bool: True when every check/installation step succeeded, False when
        an installation command failed or could not be started.
    """
    # Function-scope imports keep this block self-contained.
    import importlib
    import importlib.util
    import sys

    try:
        subprocess.run(
            ['tesseract', '--version'],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        binary_ready = True
    except (FileNotFoundError, subprocess.CalledProcessError):
        binary_ready = False

    try:
        if not binary_ready:
            print("Tesseract未安装,尝试自动安装...")
            subprocess.run(['apt-get', 'update'], check=True)
            subprocess.run(
                ['apt-get', 'install', '-y', 'tesseract-ocr', 'tesseract-ocr-chi-sim'],
                check=True,
            )

        if importlib.util.find_spec('pytesseract') is None:
            # Invoke pip through the running interpreter so the package lands
            # in the environment this process actually imports from.
            subprocess.run(
                [sys.executable, '-m', 'pip', 'install', 'pytesseract'],
                check=True,
            )
            # Let the import system notice the freshly installed package.
            importlib.invalidate_caches()

        return True
    except (OSError, subprocess.SubprocessError) as e:
        print(f"自动安装Tesseract失败: {str(e)}")
        return False
|
|
|
|
|
|
|
|
# Module-level OCR setup: decide once whether OCR can be used at all.
import shutil

tesseract_available = install_tesseract()

if tesseract_available:
    try:
        import pytesseract
        from PIL import Image, ImageEnhance, ImageFilter
    except ImportError as e:
        # A missing Python dependency must not take the whole app down; the
        # rest of the module checks this flag and degrades gracefully.
        print(f"OCR依赖导入失败: {str(e)}")
        tesseract_available = False
    else:
        # Point pytesseract at the binary found on PATH, falling back to the
        # usual Debian/Ubuntu location when the lookup finds nothing.
        pytesseract.pytesseract.tesseract_cmd = (
            shutil.which('tesseract') or r'/usr/bin/tesseract'
        )

import matplotlib.pyplot as plt

# Prefer fonts that contain CJK glyphs for any matplotlib output.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
|
|
|
|
|
def extract_gif_urls(html_content, base_url=None):
    """Collect the URLs of numbered GIF images referenced by ``<img>`` tags.

    Only ``src`` values that end in ``<digits>.gif`` (case-insensitive) are
    kept.  The result is ordered by that trailing number; images with the
    same number keep their document order.

    Args:
        html_content: HTML markup of the page.
        base_url: Optional URL of the page, used to resolve relative ``src``
            values.  When omitted, a ``url`` attribute on ``html_content`` is
            used if present.  Without any base, relative sources are skipped.

    Returns:
        list[str]: The matching image URLs, sorted numerically.
    """
    # Local import: only ``urlparse`` is imported at module level.
    from urllib.parse import urljoin

    if not html_content:
        return []

    # One pattern for both filtering and sorting, so a ".GIF" source that
    # passes the filter can never break the numeric sort.
    numbered_gif = re.compile(r'(\d+)\.gif$', re.IGNORECASE)

    if base_url is None:
        base_url = getattr(html_content, 'url', None)

    soup = BeautifulSoup(html_content, 'html.parser')

    found = []
    for img in soup.find_all('img'):
        src = (img.get('src') or '').strip()
        match = numbered_gif.search(src) if src else None
        if match is None:
            continue

        if not src.startswith(('http://', 'https://')):
            if not base_url:
                # Nothing to resolve against; the URL could not be fetched.
                continue
            src = urljoin(base_url, src)

        found.append((int(match.group(1)), src))

    # list.sort is stable, so equal numbers stay in document order.
    found.sort(key=lambda item: item[0])
    return [gif_url for _, gif_url in found]
|
|
|
|
|
def download_gif(url, save_path):
    """Download ``url`` and write the response body to ``save_path``.

    Args:
        url: Address of the GIF to fetch.
        save_path: Destination file path; it is overwritten if it exists.

    Returns:
        bool: True when the server answered 200 and the body was written,
        False on any other status code or on a network/file error.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # The context manager releases the streamed connection even when the
        # body is never read (e.g. on a non-200 reply).
        with requests.get(url, stream=True, timeout=15, headers=headers) as response:
            if response.status_code != 200:
                return False
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True
    except (requests.RequestException, OSError) as e:
        print(f"下载GIF失败: {str(e)}")
        return False
|
|
|
|
|
def process_gif_for_ocr(gif_path, threshold=140):
    """Turn the first frame of a GIF into a high-contrast image for OCR.

    The frame is converted to grayscale, its contrast is doubled, it is
    sharpened and finally binarised.

    Args:
        gif_path: Path of the GIF file to read.
        threshold: Grayscale cut-off for binarisation; pixels brighter than
            this become white (255), all others black (0).

    Returns:
        The processed PIL image, or None when OCR is unavailable or the file
        could not be processed.
    """
    if not tesseract_available:
        return None

    try:
        # Only the first frame is used, so only that frame is decoded.  The
        # ``with`` block closes the file; ``convert`` returns an independent
        # in-memory image that stays valid afterwards.
        with Image.open(gif_path) as gif:
            gif.seek(0)
            frame = gif.convert('L')

        frame = ImageEnhance.Contrast(frame).enhance(2.0)
        frame = frame.filter(ImageFilter.SHARPEN)
        return frame.point(lambda p: 255 if p > threshold else 0)
    except Exception as e:
        # Best-effort boundary: any decoding problem just skips this image.
        print(f"处理GIF失败: {str(e)}")
        return None
|
|
|
|
|
def ocr_image(image):
    """Run Tesseract OCR (Simplified Chinese + English) on ``image``.

    Args:
        image: Pre-processed PIL image, or None.

    Returns:
        str: The recognised text with form feeds removed and blank lines
        collapsed, or a human-readable message when OCR is unavailable, no
        image was supplied, or recognition failed.
    """
    if not tesseract_available:
        return "Tesseract OCR未安装,无法识别文本"
    if image is None:
        # Report the real problem instead of blaming a missing Tesseract.
        return "图像无效,无法识别文本"

    try:
        custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
        text = pytesseract.image_to_string(image, config=custom_config)

        text = text.replace('\f', '')
        # Collapse every run of newlines; a single str.replace pass would
        # leave blank lines behind for runs of three or more.
        return re.sub(r'\n{2,}', '\n', text).strip()
    except Exception as e:
        print(f"OCR识别失败: {str(e)}")
        return f"OCR识别失败: {str(e)}"
|
|
|
|
|
class _HtmlWithUrl(str):
    """HTML text that also remembers the URL it was downloaded from."""

    url = None


def extract_text_from_url(url, progress=gr.Progress()):
    """Fetch a page, download its numbered GIFs and OCR each of them.

    Args:
        url: Address of the web page to scan.
        progress: Gradio progress tracker used to report the current step.

    Returns:
        tuple: ``(text, images)`` where ``text`` is the combined OCR output
        (or an error message) and ``images`` is the list of processed frames
        shown in the gallery (empty on error).
    """
    if not tesseract_available:
        return "Tesseract OCR安装失败,无法进行文本识别。请联系管理员解决此问题。", []

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            progress(0, desc="正在获取网页内容...")

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, timeout=15, headers=headers)
            if response.status_code != 200:
                return f"无法访问网页,状态码:{response.status_code}", []

            progress(0.2, desc="正在提取GIF图片链接...")
            # extract_gif_urls looks for a ``url`` attribute on its argument
            # to resolve relative image paths, so hand it the page text
            # together with the final (post-redirect) page URL.
            page = _HtmlWithUrl(response.text)
            page.url = response.url
            gif_urls = extract_gif_urls(page)

            if not gif_urls:
                return "未找到符合条件的GIF图片", []

            total = len(gif_urls)
            progress(0.3, desc=f"找到{total}个GIF图片,开始处理...")

            all_text = []
            gif_images = []

            for index, gif_url in enumerate(gif_urls, start=1):
                # The remaining 70% of the bar is shared evenly by the GIFs.
                progress(0.3 + 0.7 * index / total, desc=f"处理第{index}/{total}个GIF...")

                filename = os.path.basename(urlparse(gif_url).path) or f"gif_{index}"
                # The index prefix keeps the path unique for repeated names.
                gif_path = os.path.join(temp_dir, f"{index:04d}_{filename}")

                if not download_gif(gif_url, gif_path):
                    all_text.append(f"【{filename}】下载失败")
                    continue

                processed_image = process_gif_for_ocr(gif_path)
                if processed_image is None:
                    all_text.append(f"【{filename}】处理失败")
                    continue

                # Keep the in-memory image itself.  Re-opening a file from
                # the temporary directory would return a lazily loaded image
                # whose backing file is deleted when this function returns.
                gif_images.append(processed_image)

                text = ocr_image(processed_image)
                all_text.append(f"【{filename}】\n{text}")

                # Short pause between requests to go easy on the server.
                time.sleep(0.5)

            result_text = "\n\n".join(all_text)
            progress(1.0, desc="处理完成")
            return result_text, gif_images

    except Exception as e:
        # Top-level UI boundary: show the failure instead of raising.
        return f"处理过程出错:{str(e)}", []
|
|
|
|
|
def create_interface():
    """Build the Gradio Blocks UI and wire the extract button to the OCR run.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(title="霹雳布袋戏GIF文本提取工具") as app:
        # Page header.
        gr.Markdown("""
        # 霹雳布袋戏GIF文本提取工具

        这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片,并识别其中的文本内容。
        """)

        # Warn up front when the OCR engine could not be set up.
        if not tesseract_available:
            gr.Markdown("""
            <div style="background-color: #ffebee; padding: 10px; border-radius: 5px; color: #b71c1c;">
                ⚠️ 注意:Tesseract OCR引擎安装失败,可能无法正常识别文本。
            </div>
            """)

        # Input row: the page to scan.
        with gr.Row():
            page_url_box = gr.Textbox(
                label="网页URL",
                placeholder="请输入包含GIF的网页地址",
                value="https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM"
            )

        with gr.Row():
            run_button = gr.Button("提取文本", variant="primary")

        # Output row: recognised text on the left, processed frames on the right.
        with gr.Row():
            with gr.Column(scale=1):
                text_output = gr.Textbox(label="识别结果", lines=20)

            with gr.Column(scale=1):
                frame_gallery = gr.Gallery(
                    label="处理后的GIF帧",
                    show_label=True,
                    elem_id="gallery",
                    columns=2,
                    height="auto"
                )

        # Usage notes.
        with gr.Row():
            gr.Markdown("""
            ## 注意事项:
            - 首次使用可能需要时间安装OCR组件
            - 识别 accuracy 取决于GIF图片的清晰度
            - 处理可能需要几分钟时间,请耐心等待
            """)

        # Clicking the button runs the whole fetch/OCR pipeline.
        run_button.click(
            fn=extract_text_from_url,
            inputs=[page_url_box],
            outputs=[text_output, frame_gallery]
        )

    return app
|
|
|
|
|
|
|
|
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
|
|
|