import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
import os
import tempfile
import subprocess
import numpy as np
from urllib.parse import urlparse
import time

# Best-effort bootstrap of the Tesseract OCR engine. The automatic install path
# shells out to apt-get, so it can only succeed on Debian/Ubuntu-style hosts
# where this process is allowed to install packages (e.g. Hugging Face Spaces).
def install_tesseract():
    """Make sure the Tesseract binary is usable, installing it when missing.

    ``tesseract --version`` is run as a probe. When the probe fails, the
    function tries ``apt-get`` for the engine plus the Simplified Chinese
    language pack, then installs the ``pytesseract`` binding with pip.

    Returns:
        bool: True if the probe succeeds or every install command exits with
        status 0; False if any install step fails.
    """
    # Function-scope import: only needed to find the running interpreter.
    import sys

    try:
        subprocess.run(['tesseract', '--version'], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True
    except (OSError, subprocess.CalledProcessError):
        # OSError covers both "binary not found" and "binary not executable".
        print("Tesseract未安装，尝试自动安装...")

    try:
        subprocess.run(['apt-get', 'update'], check=True)
        subprocess.run(['apt-get', 'install', '-y', 'tesseract-ocr', 'tesseract-ocr-chi-sim'], check=True)
        # Use "<this interpreter> -m pip" so the binding lands in the
        # environment that will import it; a bare `pip` on PATH may belong to
        # a different Python installation.
        subprocess.run([sys.executable, '-m', 'pip', 'install', 'pytesseract'], check=True)
        return True
    except (OSError, subprocess.SubprocessError) as e:
        print(f"自动安装Tesseract失败: {str(e)}")
        return False

# Probe for (and, if necessary, install) Tesseract once at import time.
tesseract_available = install_tesseract()

# The OCR stack is imported only when the engine is usable. The functions
# below check `tesseract_available` before touching pytesseract or PIL.
if tesseract_available:
    try:
        import pytesseract
        from PIL import Image, ImageEnhance, ImageFilter
    except ImportError as e:
        # The binary exists but the Python bindings do not: fall back to the
        # "OCR unavailable" mode instead of crashing the whole module.
        print(f"导入OCR依赖失败: {str(e)}")
        tesseract_available = False
    else:
        # Point pytesseract at the binary found on PATH, with a fixed fallback.
        try:
            tesseract_cmd = subprocess.check_output(['which', 'tesseract']).decode().strip()
        except (OSError, subprocess.CalledProcessError):
            tesseract_cmd = ''
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd or r'/usr/bin/tesseract'

# Prefer fonts with CJK glyphs so Chinese text renders in matplotlib output.
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]

# Matches an image path whose file name ends in "<digits>.gif" (any letter
# case); group 1 is the number used for ordering.
_GIF_NAME_RE = re.compile(r'(\d+)\.gif$', re.IGNORECASE)


def extract_gif_urls(html_content, base_url=None):
    """Return the absolute URLs of numbered GIF images found in an HTML page.

    Only ``<img>`` tags whose ``src`` ends in ``<digits>.gif`` are considered.
    The result is ordered by that trailing number, ascending.

    Args:
        html_content: HTML markup to parse.
        base_url: URL the page was fetched from, used to resolve relative
            ``src`` values. When omitted, a ``url`` attribute on
            ``html_content`` is used if present. Without either, relative
            sources are skipped because they cannot be made absolute.

    Returns:
        list[str]: Absolute http(s) GIF URLs sorted by file number.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html_content, 'html.parser')
    if base_url is None:
        base_url = getattr(html_content, 'url', None)

    numbered = []
    for img in soup.find_all('img'):
        src = (img.get('src') or '').strip()
        match = _GIF_NAME_RE.search(src)
        if not match:
            continue

        if not src.startswith(('http://', 'https://')):
            if not base_url:
                continue
            # urljoin handles root-relative, directory-relative and
            # protocol-relative ("//host/x.gif") references alike.
            src = urljoin(base_url, src)
            if not src.startswith(('http://', 'https://')):
                continue

        numbered.append((int(match.group(1)), src))

    # Sort on the number captured above so the ordering uses the same
    # case-insensitive, end-anchored match as the filter.
    numbered.sort(key=lambda item: item[0])
    return [gif_url for _, gif_url in numbered]

def download_gif(url, save_path):
    """Download ``url`` to ``save_path``.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path; overwritten if it exists.

    Returns:
        bool: True when the server answered 200 and the body was written,
        False on any other status or on any error (which is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # The context manager releases the connection even on early return.
        with requests.get(url, stream=True, timeout=15, headers=headers) as response:
            if response.status_code != 200:
                return False
            # Actually stream the body to disk instead of buffering it all
            # in memory via `response.content`.
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True
    except Exception as e:
        # Deliberately broad: one failed image must not abort the batch.
        print(f"下载GIF失败: {str(e)}")
        return False

def process_gif_for_ocr(gif_path, threshold=140, contrast=2.0):
    """Turn the first frame of a GIF into a black-and-white image for OCR.

    The frame is converted to grayscale, contrast-enhanced, sharpened and
    then binarized.

    Args:
        gif_path: Path of the GIF file to read.
        threshold: Gray level (0-255); pixels brighter than this become
            white (255), all others black (0).
        contrast: Factor passed to the contrast enhancer.

    Returns:
        The processed grayscale image, or None when OCR support is
        unavailable or the file cannot be processed (the error is printed).
    """
    # PIL is only imported when Tesseract is available, so bail out early.
    if not tesseract_available:
        return None

    try:
        # convert() loads the pixel data, so the file can be closed right
        # after. Only the first frame is ever used, so later frames are not
        # decoded at all.
        with Image.open(gif_path) as gif:
            frame = gif.convert('L')

        frame = ImageEnhance.Contrast(frame).enhance(contrast)
        frame = frame.filter(ImageFilter.SHARPEN)
        return frame.point(lambda p: 255 if p > threshold else 0)
    except Exception as e:
        # Deliberately broad: a bad image is reported and skipped by callers.
        print(f"处理GIF失败: {str(e)}")
        return None

def ocr_image(image):
    """Run Tesseract on a preprocessed image and return the cleaned text.

    Args:
        image: Image to recognize, or None.

    Returns:
        str: The recognized text with form feeds removed, runs of newlines
        collapsed to one and surrounding whitespace stripped. When OCR is
        unavailable, the image is None, or recognition fails, a
        human-readable message describing the problem is returned instead.
    """
    if not tesseract_available:
        return "Tesseract OCR未安装，无法识别文本"
    if image is None:
        # Report the real cause rather than claiming Tesseract is missing.
        return "图像无效，无法识别文本"

    try:
        # Default engine, fully automatic page segmentation,
        # Simplified Chinese plus English.
        custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
        text = pytesseract.image_to_string(image, config=custom_config)

        # Drop form feeds and collapse any run of newlines, not just pairs.
        text = text.replace('\f', '')
        return re.sub(r'\n{2,}', '\n', text).strip()
    except Exception as e:
        print(f"OCR识别失败: {str(e)}")
        return f"OCR识别失败: {str(e)}"

class _HtmlWithUrl(str):
    """HTML text that also carries the URL it was fetched from.

    extract_gif_urls() looks for a ``url`` attribute on its argument when it
    has to resolve a relative image path; a plain ``str`` never has one.
    """

    url = None


def extract_text_from_url(url, progress=gr.Progress()):
    """Fetch a page, OCR every numbered GIF on it and collect the results.

    Args:
        url: Address of the page to scan.
        progress: Gradio progress tracker used to report status.

    Returns:
        tuple[str, list]: The combined text, one ``【filename】`` section per
        GIF, and the list of preprocessed frames for the gallery. On failure
        the first element is an error message and the list is empty.
    """
    if not tesseract_available:
        return "Tesseract OCR安装失败，无法进行文本识别。请联系管理员解决此问题。", []

    url = (url or "").strip()
    if not url:
        return "请输入网页URL", []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            progress(0, desc="正在获取网页内容...")
            response = requests.get(url, timeout=15, headers=headers)
            if response.status_code != 200:
                return f"无法访问网页，状态码：{response.status_code}", []

            progress(0.2, desc="正在提取GIF图片链接...")
            # Attach the final (post-redirect) URL so relative <img src>
            # values can be resolved against it.
            page = _HtmlWithUrl(response.text)
            page.url = response.url
            gif_urls = extract_gif_urls(page)

            if not gif_urls:
                return "未找到符合条件的GIF图片", []

            total = len(gif_urls)
            progress(0.3, desc=f"找到{total}个GIF图片，开始处理...")

            all_text = []
            gif_images = []

            for i, gif_url in enumerate(gif_urls, start=1):
                # The GIF loop covers the 0.3-1.0 range; computing the value
                # directly (and clamping) avoids float drift above 1.0.
                progress(min(1.0, 0.3 + 0.7 * i / total), desc=f"处理第{i}/{total}个GIF...")

                filename = os.path.basename(urlparse(gif_url).path)
                gif_path = os.path.join(temp_dir, filename)

                if not download_gif(gif_url, gif_path):
                    all_text.append(f"【{filename}】下载失败")
                    continue

                processed_image = process_gif_for_ocr(gif_path)
                if processed_image is None:
                    all_text.append(f"【{filename}】处理失败")
                    continue

                # Hand the in-memory frame to the gallery. Re-opening a file
                # inside temp_dir would leave a lazily loaded image pointing
                # at a file that is deleted when this `with` block exits.
                gif_images.append(processed_image)

                text = ocr_image(processed_image)
                all_text.append(f"【{filename}】\n{text}")

                # Short pause between images to go easy on the remote server.
                time.sleep(0.5)

            result_text = "\n\n".join(all_text)
            progress(1.0, desc="处理完成")
            return result_text, gif_images

    except Exception as e:
        # UI boundary: surface any failure as text instead of raising.
        return f"处理过程出错：{str(e)}", []

def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching.

    Layout, top to bottom: title and description, an optional warning when
    Tesseract is unavailable, the URL input, the extract button, a two-column
    row (recognized text on the left, processed frames on the right) and
    usage notes. Clicking the button runs extract_text_from_url on the URL
    and fills both outputs.
    """
    app_title = "霹雳布袋戏GIF文本提取工具"
    default_page = "https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM"

    with gr.Blocks(title=app_title) as ui:
        gr.Markdown(value="""
        # 霹雳布袋戏GIF文本提取工具
        
        这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片，并识别其中的文本内容。
        """)

        # Warn up front when the OCR engine could not be set up.
        if not tesseract_available:
            gr.Markdown(value="""
            <div style="background-color: #ffebee; padding: 10px; border-radius: 5px; color: #b71c1c;">
            ⚠️ 注意：Tesseract OCR引擎安装失败，可能无法正常识别文本。
            </div>
            """)

        with gr.Row():
            page_url_box = gr.Textbox(
                value=default_page,
                label="网页URL",
                placeholder="请输入包含GIF的网页地址",
            )

        with gr.Row():
            run_button = gr.Button(value="提取文本", variant="primary")

        with gr.Row():
            with gr.Column(scale=1):
                ocr_output = gr.Textbox(lines=20, label="识别结果")

            with gr.Column(scale=1):
                frame_gallery = gr.Gallery(
                    label="处理后的GIF帧",
                    elem_id="gallery",
                    show_label=True,
                    height="auto",
                    columns=2,
                )

        with gr.Row():
            gr.Markdown(value="""
            ## 注意事项：
            - 首次使用可能需要时间安装OCR组件
            - 识别 accuracy 取决于GIF图片的清晰度
            - 处理可能需要几分钟时间，请耐心等待
            """)

        # Wire the button: URL in, (text, images) out.
        run_button.click(
            extract_text_from_url,
            [page_url_box],
            [ocr_output, frame_gallery],
        )

    return ui

# Build the interface and start the Gradio server, but only when this file is
# executed as a script (not when it is imported).
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()