import gradio as gr import requests from bs4 import BeautifulSoup import re import os import tempfile import subprocess import numpy as np from urllib.parse import urlparse import time # 尝试安装Tesseract(仅在Hugging Face Spaces环境中有效) def install_tesseract(): try: # 检查Tesseract是否已安装 subprocess.run(['tesseract', '--version'], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) return True except (FileNotFoundError, subprocess.CalledProcessError): print("Tesseract未安装,尝试自动安装...") try: # 在Ubuntu/Debian系统上安装Tesseract subprocess.run(['apt-get', 'update'], check=True) subprocess.run(['apt-get', 'install', '-y', 'tesseract-ocr', 'tesseract-ocr-chi-sim'], check=True) # 安装Python绑定 subprocess.run(['pip', 'install', 'pytesseract'], check=True) return True except Exception as e: print(f"自动安装Tesseract失败: {str(e)}") return False # 检查并安装Tesseract tesseract_available = install_tesseract() # 只有在Tesseract可用时才导入相关库 if tesseract_available: import pytesseract from PIL import Image, ImageEnhance, ImageFilter # 设置Tesseract OCR路径 try: pytesseract.pytesseract.tesseract_cmd = subprocess.check_output(['which', 'tesseract']).decode().strip() except: pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # 确保中文显示正常 import matplotlib.pyplot as plt plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"] def extract_gif_urls(html_content): """从HTML内容中提取符合条件的GIF图片URL""" soup = BeautifulSoup(html_content, 'html.parser') img_tags = soup.find_all('img') gif_urls = [] # 放宽正则匹配条件,确保能识别到相关GIF pattern = r'\d+\.gif$' for img in img_tags: src = img.get('src', '') if src and re.search(pattern, src, re.IGNORECASE): # 处理相对路径 if not src.startswith(('http://', 'https://')): if src.startswith('/'): parsed_url = urlparse(html_content.url) if hasattr(html_content, 'url') else None if parsed_url: src = f"{parsed_url.scheme}://{parsed_url.netloc}{src}" else: continue else: continue gif_urls.append(src) # 按文件名排序 try: gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1))) except: pass return gif_urls def download_gif(url, save_path): """下载GIF图片""" try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } response = requests.get(url, stream=True, timeout=15, headers=headers) if response.status_code == 200: with open(save_path, 'wb') as f: f.write(response.content) return True return False except Exception as e: print(f"下载GIF失败: {str(e)}") return False def process_gif_for_ocr(gif_path): """处理GIF图片以提高OCR识别率""" if not tesseract_available: return None try: gif = Image.open(gif_path) # 尝试提取多个帧 frames = [] try: for i in range(10): gif.seek(i) frames.append(gif.convert('L')) except EOFError: pass if not frames: return None # 取第一帧进行处理 frame = frames[0] # 增强对比度 enhancer = ImageEnhance.Contrast(frame) frame = enhancer.enhance(2.0) # 轻微锐化 frame = frame.filter(ImageFilter.SHARPEN) # 二值化处理 threshold = 140 frame = frame.point(lambda p: p > threshold and 255) return frame except Exception as e: print(f"处理GIF失败: {str(e)}") return None def ocr_image(image): """对处理后的图像进行OCR识别""" if not tesseract_available or image is None: return "Tesseract OCR未安装,无法识别文本" try: custom_config = r'--oem 3 --psm 3 -l chi_sim+eng' text = pytesseract.image_to_string(image, config=custom_config) # 清理识别结果 text = text.replace('\f', '').replace('\n\n', '\n').strip() return text except Exception as e: print(f"OCR识别失败: {str(e)}") return f"OCR识别失败: {str(e)}" def extract_text_from_url(url, progress=gr.Progress()): """从指定URL提取GIF并识别文本""" # 检查Tesseract是否可用 if not tesseract_available: return "Tesseract OCR安装失败,无法进行文本识别。请联系管理员解决此问题。", [] try: with tempfile.TemporaryDirectory() as temp_dir: progress(0, desc="正在获取网页内容...") headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } response = requests.get(url, timeout=15, headers=headers) if response.status_code != 200: return f"无法访问网页,状态码:{response.status_code}", [] # 提取GIF URL progress(0.2, desc="正在提取GIF图片链接...") gif_urls = extract_gif_urls(response.text) if not gif_urls: return "未找到符合条件的GIF图片", [] progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片,开始处理...") # 下载并处理每个GIF all_text = [] gif_images = [] step = 0.7 / len(gif_urls) current_progress = 0.3 for i, gif_url in enumerate(gif_urls): current_progress += step progress(current_progress, desc=f"处理第{i+1}/{len(gif_urls)}个GIF...") parsed_url = urlparse(gif_url) filename = os.path.basename(parsed_url.path) # 下载GIF gif_path = os.path.join(temp_dir, filename) if not download_gif(gif_url, gif_path): all_text.append(f"【{filename}】下载失败") continue # 处理GIF processed_image = process_gif_for_ocr(gif_path) if processed_image is None: all_text.append(f"【{filename}】处理失败") continue # 保存处理后的图像 processed_path = os.path.join(temp_dir, f"processed_{filename}.png") processed_image.save(processed_path) gif_images.append(Image.open(processed_path)) # 识别文本 text = ocr_image(processed_image) all_text.append(f"【{filename}】\n{text}") time.sleep(0.5) result_text = "\n\n".join(all_text) progress(1.0, desc="处理完成") return result_text, gif_images except Exception as e: return f"处理过程出错:{str(e)}", [] def create_interface(): """创建Gradio界面""" with gr.Blocks(title="霹雳布袋戏GIF文本提取工具") as demo: gr.Markdown(""" # 霹雳布袋戏GIF文本提取工具 这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片,并识别其中的文本内容。 """) # 显示Tesseract状态 if not tesseract_available: gr.Markdown("""