# Hugging Face Space: wkplhc/ocr (status when captured: Sleeping)
# File size: 9,873 bytes
# Commit hashes that appeared in the page's per-line revision gutter:
#   5ddadc9, 28bf845, 5479c69, c86ea4a, 77d74e9

import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
import os
import tempfile
import subprocess
import numpy as np
from urllib.parse import urlparse
import time

# Try to install Tesseract (the apt-get based install is only expected to
# work in a Debian/Ubuntu environment such as Hugging Face Spaces).
def install_tesseract():
    """Make sure the Tesseract OCR binary is available, installing it if needed.

    First runs ``tesseract --version``. If that fails because the binary is
    missing or exits non-zero, tries to install ``tesseract-ocr`` and the
    Simplified Chinese language pack with ``apt-get`` and then the
    ``pytesseract`` Python bindings with pip.

    Returns:
        bool: True if Tesseract was already present or every install command
        succeeded; False if any install command failed.
    """
    # Local import: only needed here to locate the running interpreter.
    import sys

    try:
        # Probe for an existing installation; output is captured and discarded.
        subprocess.run(['tesseract', '--version'], check=True,
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True
    except (FileNotFoundError, subprocess.CalledProcessError):
        print("Tesseract未安装，尝试自动安装...")

    # Run pip through the current interpreter so the bindings land in the
    # environment this process imports from; a bare `pip` on PATH may belong
    # to a different Python installation.
    pip_cmd = [sys.executable, '-m', 'pip'] if sys.executable else ['pip']

    try:
        # Install Tesseract on Ubuntu/Debian systems.
        subprocess.run(['apt-get', 'update'], check=True)
        subprocess.run(['apt-get', 'install', '-y', 'tesseract-ocr', 'tesseract-ocr-chi-sim'], check=True)
        # Install the Python bindings.
        subprocess.run(pip_cmd + ['install', 'pytesseract'], check=True)
        return True
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: command missing / not permitted; SubprocessError: non-zero exit.
        print(f"自动安装Tesseract失败: {str(e)}")
        return False

# Check for Tesseract and install it if necessary.
tesseract_available = install_tesseract()

# Import the OCR-related libraries only when Tesseract is available.
if tesseract_available:
    try:
        import pytesseract
        from PIL import Image, ImageEnhance, ImageFilter
    except ImportError as e:
        # The tesseract binary can be present while the Python packages are
        # not. Fall back to the "OCR unavailable" mode that the rest of the
        # module already supports instead of failing at import time.
        print(f"导入OCR依赖失败: {str(e)}")
        tesseract_available = False
    else:
        import shutil

        # Point pytesseract at the tesseract executable found on PATH,
        # defaulting to the usual Debian/Ubuntu location when none is found.
        pytesseract.pytesseract.tesseract_cmd = shutil.which('tesseract') or r'/usr/bin/tesseract'

# Prefer fonts with CJK glyphs so Chinese text renders correctly in matplotlib.
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]

def extract_gif_urls(html_content, base_url=None):
    """Extract the URLs of numbered GIF images from an HTML document.

    An ``<img>`` tag qualifies when its ``src`` ends in ``<digits>.gif``
    (case-insensitive). The result is ordered by that trailing number.

    Args:
        html_content: HTML markup to parse (anything BeautifulSoup accepts).
        base_url: Optional URL of the page the markup came from. When given,
            relative ``src`` values are resolved against it. When omitted, a
            ``url`` attribute on ``html_content`` is used if present;
            otherwise relative links are skipped.

    Returns:
        list[str]: Absolute http/https GIF URLs sorted by their numeric
        file name. Ties keep document order.
    """
    # Local import: the file-level imports only bring in urlparse.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html_content, 'html.parser')

    if base_url is None:
        # Backward compatibility: accept inputs that carry their own URL.
        base_url = getattr(html_content, 'url', None)

    # One pattern for both filtering and sorting, so the two can never
    # disagree about case (e.g. "12.GIF").
    numbered_gif = re.compile(r'(\d+)\.gif$', re.IGNORECASE)

    numbered_urls = []
    for img in soup.find_all('img'):
        src = img.get('src', '')
        match = numbered_gif.search(src) if src else None
        if match is None:
            continue

        if not src.startswith(('http://', 'https://')):
            # Relative (or protocol-relative) link: needs a base to resolve.
            if not base_url:
                continue
            src = urljoin(base_url, src)
            if urlparse(src).scheme not in ('http', 'https'):
                continue

        numbered_urls.append((int(match.group(1)), src))

    # Sort by the number in the file name; list.sort is stable, so equal
    # numbers keep their order of appearance in the document.
    numbered_urls.sort(key=lambda pair: pair[0])
    return [gif_url for _, gif_url in numbered_urls]

def download_gif(url, save_path):
    """Download a GIF image to a local file.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path; it is overwritten if it exists.

    Returns:
        bool: True if the server answered with status 200 and the body was
        written to ``save_path``; False on any other status or on a
        network/file error (the error is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # The context manager closes the streamed response on every path,
        # including the non-200 early return where the body is never read.
        with requests.get(url, stream=True, timeout=15, headers=headers) as response:
            if response.status_code != 200:
                return False
            with open(save_path, 'wb') as f:
                # Write the body in chunks instead of buffering it whole.
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True
    except (requests.RequestException, OSError) as e:
        print(f"下载GIF失败: {str(e)}")
        return False

def process_gif_for_ocr(gif_path, *, threshold=140):
    """Prepare the first frame of a GIF for OCR.

    The frame is converted to grayscale, its contrast is doubled, it is
    sharpened, and finally binarized to pure black/white.

    Args:
        gif_path: Path of the GIF file to read.
        threshold: Grayscale cut-off for binarization; pixels strictly above
            it become 255, all others 0.

    Returns:
        The processed PIL image, or None when Tesseract (and therefore PIL)
        is unavailable or the file cannot be processed (the error is printed).
    """
    if not tesseract_available:
        return None

    try:
        # Only the first frame is used, so only that frame is decoded. The
        # context manager releases the file handle; convert() returns an
        # independent in-memory image that outlives the closed file.
        with Image.open(gif_path) as gif:
            gif.seek(0)
            frame = gif.convert('L')

        # Boost contrast.
        frame = ImageEnhance.Contrast(frame).enhance(2.0)

        # Light sharpening.
        frame = frame.filter(ImageFilter.SHARPEN)

        # Binarize.
        return frame.point(lambda p: 255 if p > threshold else 0)
    except Exception as e:
        # Per-image boundary: report and let the caller skip this GIF.
        print(f"处理GIF失败: {str(e)}")
        return None

def ocr_image(image):
    """Run Tesseract OCR on a pre-processed image.

    Args:
        image: PIL image to recognize, or None.

    Returns:
        str: The recognized text with form feeds removed, runs of newlines
        collapsed to a single newline and surrounding whitespace stripped.
        If Tesseract is unavailable, the image is None, or recognition
        fails, a human-readable message is returned instead.
    """
    if not tesseract_available:
        return "Tesseract OCR未安装，无法识别文本"
    if image is None:
        # Distinct message: a missing image is not an installation problem.
        return "图像为空，无法识别文本"

    try:
        # Default OCR engine, fully automatic page segmentation,
        # Simplified Chinese plus English.
        custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
        text = pytesseract.image_to_string(image, config=custom_config)

        # Clean up the result. A regex is used because a single
        # str.replace('\n\n', '\n') pass leaves blank lines behind whenever
        # three or more newlines occur in a row.
        text = text.replace('\f', '')
        return re.sub(r'\n{2,}', '\n', text).strip()
    except Exception as e:
        print(f"OCR识别失败: {str(e)}")
        return f"OCR识别失败: {str(e)}"

def _absolutize_img_sources(html, page_url):
    """Return ``html`` with every relative ``<img src>`` resolved against ``page_url``."""
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img'):
        src = img.get('src')
        # Absolute http/https links are left untouched.
        if src and not src.startswith(('http://', 'https://')):
            img['src'] = urljoin(page_url, src)
    return str(soup)


def extract_text_from_url(url, progress=gr.Progress()):
    """Fetch a web page, download its numbered GIFs and OCR each of them.

    Args:
        url: Address of the page to scan.
        progress: Gradio progress tracker used to report status.

    Returns:
        tuple[str, list]: The combined per-GIF results (recognized text or a
        failure note for each file) and the list of pre-processed images.
        On any error the first element is an error message and the list is
        empty.
    """
    # Bail out early when Tesseract is unavailable.
    if not tesseract_available:
        return "Tesseract OCR安装失败，无法进行文本识别。请联系管理员解决此问题。", []

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            progress(0, desc="正在获取网页内容...")

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get((url or "").strip(), timeout=15, headers=headers)
            if response.status_code != 200:
                return f"无法访问网页，状态码：{response.status_code}", []

            # Extract the GIF URLs. The extractor is given markup only, so
            # relative image links are resolved here against the final page
            # URL (after redirects) before it sees them.
            progress(0.2, desc="正在提取GIF图片链接...")
            page_html = _absolutize_img_sources(response.text, response.url)
            gif_urls = extract_gif_urls(page_html)

            if not gif_urls:
                return "未找到符合条件的GIF图片", []

            total = len(gif_urls)
            progress(0.3, desc=f"找到{total}个GIF图片，开始处理...")

            # Download and process each GIF.
            all_text = []
            gif_images = []

            for i, gif_url in enumerate(gif_urls):
                # Map item i onto the 0.3..1.0 range directly rather than
                # accumulating a float step.
                progress(0.3 + 0.7 * (i + 1) / total, desc=f"处理第{i+1}/{total}个GIF...")

                filename = os.path.basename(urlparse(gif_url).path)

                # Download the GIF.
                gif_path = os.path.join(temp_dir, filename)
                if not download_gif(gif_url, gif_path):
                    all_text.append(f"【{filename}】下载失败")
                    continue

                # Pre-process the GIF.
                processed_image = process_gif_for_ocr(gif_path)
                if processed_image is None:
                    all_text.append(f"【{filename}】处理失败")
                    continue

                # Keep the in-memory image. Re-opening a copy saved inside
                # temp_dir would hand back a lazily loaded image whose file
                # is deleted when this with-block exits.
                gif_images.append(processed_image)

                # Recognize the text.
                text = ocr_image(processed_image)
                all_text.append(f"【{filename}】\n{text}")

                # Short pause between images; not needed after the last one.
                if i + 1 < total:
                    time.sleep(0.5)

            result_text = "\n\n".join(all_text)
            progress(1.0, desc="处理完成")
            return result_text, gif_images

    except Exception as e:
        # UI boundary: surface any failure as a message instead of raising.
        return f"处理过程出错：{str(e)}", []

def create_interface():
    """Build the Gradio Blocks user interface.

    The layout consists of a title/description, a warning banner that is
    only rendered when ``tesseract_available`` is false, a URL textbox with
    a pre-filled example address, an "extract" button, a result textbox next
    to a gallery of processed frames, and a notes section. Clicking the
    button calls ``extract_text_from_url`` with the textbox value and routes
    its two return values to the result textbox and the gallery.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title="霹雳布袋戏GIF文本提取工具") as demo:
        gr.Markdown("""
        # 霹雳布袋戏GIF文本提取工具
        
        这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片，并识别其中的文本内容。
        """)
        
        # Show the Tesseract status (warning banner when OCR is unavailable)
        if not tesseract_available:
            gr.Markdown("""
            <div style="background-color: #ffebee; padding: 10px; border-radius: 5px; color: #b71c1c;">
            ⚠️ 注意：Tesseract OCR引擎安装失败，可能无法正常识别文本。
            </div>
            """)
        
        with gr.Row():
            url_input = gr.Textbox(
                label="网页URL", 
                placeholder="请输入包含GIF的网页地址",
                value="https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM"
            )
        
        with gr.Row():
            extract_btn = gr.Button("提取文本", variant="primary")
        
        with gr.Row():
            # Left column: recognized text
            with gr.Column(scale=1):
                result_text = gr.Textbox(label="识别结果", lines=20)
            
            # Right column: gallery of the pre-processed frames
            with gr.Column(scale=1):
                processed_images = gr.Gallery(
                    label="处理后的GIF帧", 
                    show_label=True, 
                    elem_id="gallery",
                    columns=2,
                    height="auto"
                )
        
        with gr.Row():
            gr.Markdown("""
            ## 注意事项：
            - 首次使用可能需要时间安装OCR组件
            - 识别 accuracy 取决于GIF图片的清晰度
            - 处理可能需要几分钟时间，请耐心等待
            """)
        
        # Wire up the event: button click -> extraction -> text + gallery
        extract_btn.click(
            fn=extract_text_from_url,
            inputs=[url_input],
            outputs=[result_text, processed_images]
        )
    
    return demo

# Create and launch the interface when this file is run as a script.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()