File size: 9,873 Bytes
5ddadc9
 
 
 
 
 
28bf845
5ddadc9
 
 
 
28bf845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ddadc9
 
 
 
 
 
 
 
 
 
5479c69
28bf845
5ddadc9
 
 
28bf845
5ddadc9
 
5479c69
 
 
 
 
 
 
 
5ddadc9
 
28bf845
5479c69
 
 
28bf845
5ddadc9
 
 
 
 
5479c69
 
 
 
5ddadc9
 
 
 
 
5479c69
 
5ddadc9
 
 
 
28bf845
 
 
5ddadc9
5479c69
 
28bf845
5479c69
 
28bf845
5479c69
28bf845
5479c69
 
 
 
 
 
 
 
5ddadc9
 
 
 
 
 
 
 
28bf845
5479c69
5ddadc9
 
 
5479c69
 
5ddadc9
 
 
 
28bf845
 
5ddadc9
5479c69
 
 
 
 
 
 
 
 
28bf845
5ddadc9
 
 
28bf845
 
 
 
5ddadc9
 
 
 
5479c69
 
 
 
5ddadc9
5479c69
5ddadc9
 
 
 
 
 
5479c69
5ddadc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28bf845
5ddadc9
 
 
 
 
28bf845
5ddadc9
 
5479c69
5ddadc9
 
 
 
 
 
 
 
 
5479c69
5ddadc9
 
 
 
 
 
 
 
 
 
 
 
 
28bf845
 
 
 
 
 
 
 
5ddadc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c86ea4a
28bf845
77d74e9
c86ea4a
5ddadc9
 
 
 
28bf845
5ddadc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
import os
import tempfile
import subprocess
import numpy as np
from urllib.parse import urlparse
import time

# Make sure Tesseract OCR is usable. The automatic install relies on apt-get,
# so it is only expected to work on Debian/Ubuntu-style hosts (the original
# note names Hugging Face Spaces).
def install_tesseract():
    """Ensure the Tesseract binary and the ``pytesseract`` binding are present.

    The ``tesseract`` executable is probed first; if it is missing, it is
    installed (together with the Simplified Chinese language pack) through
    ``apt-get``. Independently of that, ``pytesseract`` is installed with pip
    when it cannot be imported, so that a host which already ships the binary
    but not the Python binding is handled as well.

    Returns:
        bool: True when both the binary and the binding are available after
        this call, False when an installation step failed.
    """
    import importlib
    import sys

    # Probe for the executable; a missing binary or a non-zero exit both mean
    # "not usable".
    try:
        subprocess.run(['tesseract', '--version'], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        binary_present = True
    except (OSError, subprocess.SubprocessError):
        binary_present = False

    try:
        if not binary_present:
            print("Tesseract未安装,尝试自动安装...")
            # Install Tesseract on Ubuntu/Debian systems.
            subprocess.run(['apt-get', 'update'], check=True)
            subprocess.run(['apt-get', 'install', '-y', 'tesseract-ocr', 'tesseract-ocr-chi-sim'], check=True)

        try:
            import pytesseract  # noqa: F401  (availability check only)
        except ImportError:
            # Use the running interpreter's pip so the package lands in the
            # environment this process actually imports from.
            subprocess.run([sys.executable, '-m', 'pip', 'install', 'pytesseract'], check=True)
            # Let the import system notice the freshly installed package.
            importlib.invalidate_caches()
        return True
    except (OSError, subprocess.SubprocessError) as e:
        print(f"自动安装Tesseract失败: {str(e)}")
        return False

# Check for Tesseract (installing it if necessary) once, at import time.
# The resulting flag is consulted by the OCR helpers and the UI below.
tesseract_available = install_tesseract()

# Import the OCR-related libraries only when Tesseract is available.
# NOTE: when the flag is False, `pytesseract`, `Image`, `ImageEnhance` and
# `ImageFilter` are never bound, so code using them must check the flag first.
if tesseract_available:
    import pytesseract
    from PIL import Image, ImageEnhance, ImageFilter
    # Point pytesseract at the Tesseract executable: ask `which` first and
    # fall back to a fixed path if that lookup fails for any reason.
    try:
        pytesseract.pytesseract.tesseract_cmd = subprocess.check_output(['which', 'tesseract']).decode().strip()
    except:
        pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Make sure Chinese text is displayed correctly (matplotlib font preference list).
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]

def extract_gif_urls(html_content, base_url=None):
    """Extract the URLs of numbered GIF images (``<digits>.gif``) from HTML.

    Args:
        html_content: HTML markup to scan.
        base_url: Optional URL of the page the markup was fetched from. It is
            used to resolve relative ``src`` values; pass it whenever the page
            may reference its images with relative paths. When omitted, a
            ``url`` attribute on ``html_content`` is used if one exists. A
            ``<base href>`` element in the markup is honoured in either case.

    Returns:
        list[str]: Absolute ``http(s)`` GIF URLs ordered by the number in the
        file name. Sources that cannot be resolved to an absolute URL (no
        base known) are skipped.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(html_content, 'html.parser')

    # Work out what relative paths should be resolved against.
    if base_url is None:
        base_url = getattr(html_content, 'url', None)
    base_tag = soup.find('base', href=True)
    if base_tag is not None:
        base_url = urljoin(base_url or '', base_tag['href'])

    # One case-insensitive pattern both filters the images and captures the
    # number used for sorting, so a ".GIF" source can no longer break the sort.
    numbered_gif = re.compile(r'(\d+)\.gif$', re.IGNORECASE)
    absolute_prefixes = ('http://', 'https://')

    numbered_urls = []
    for img in soup.find_all('img'):
        src = (img.get('src') or '').strip()
        match = numbered_gif.search(src)
        if match is None:
            continue

        # Resolve relative paths (root-relative and document-relative alike).
        if not src.startswith(absolute_prefixes):
            if not base_url:
                continue
            src = urljoin(base_url, src)
            if not src.startswith(absolute_prefixes):
                continue

        numbered_urls.append((int(match.group(1)), src))

    # Sort by the numeric part of the file name; the sort is stable, so images
    # sharing a number keep their document order.
    numbered_urls.sort(key=lambda item: item[0])
    return [src for _, src in numbered_urls]

def download_gif(url, save_path):
    """Download a GIF image to disk.

    Args:
        url: Absolute URL of the image.
        save_path: File path the response body is written to.

    Returns:
        bool: True when the server answered 200 and the body was written,
        False on any other status code or on a network / file error.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # The context manager releases the connection even on early return;
        # the body is streamed to disk in chunks instead of being buffered
        # whole in memory.
        with requests.get(url, stream=True, timeout=15, headers=headers) as response:
            if response.status_code != 200:
                return False
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=65536):
                    if chunk:
                        f.write(chunk)
        return True
    except (requests.RequestException, OSError) as e:
        print(f"下载GIF失败: {str(e)}")
        return False

def process_gif_for_ocr(gif_path, threshold=140):
    """Pre-process the first frame of a GIF to improve OCR accuracy.

    The first frame is converted to greyscale, its contrast is doubled, it is
    lightly sharpened and finally binarised.

    Args:
        gif_path: Path of the GIF file on disk.
        threshold: Grey level (0-255) above which a pixel becomes white (255);
            every other pixel becomes black (0).

    Returns:
        The processed greyscale PIL image, or None when Tesseract is
        unavailable or the file could not be processed.
    """
    # The PIL names are only imported when Tesseract is available.
    if not tesseract_available:
        return None

    try:
        # Only the first frame is used; converting inside the `with` block
        # yields an independent in-memory image and lets the file be closed.
        with Image.open(gif_path) as gif:
            frame = gif.convert('L')

        # Boost contrast.
        frame = ImageEnhance.Contrast(frame).enhance(2.0)

        # Light sharpening.
        frame = frame.filter(ImageFilter.SHARPEN)

        # Binarise.
        return frame.point(lambda p: 255 if p > threshold else 0)
    except Exception as e:
        # Best effort: a corrupt or unsupported image must not abort the
        # whole run, so report it and let the caller skip this file.
        print(f"处理GIF失败: {str(e)}")
        return None

def ocr_image(image):
    """Run Tesseract OCR (Simplified Chinese + English) on a processed image.

    Args:
        image: PIL image to recognise, or None.

    Returns:
        str: The recognised text with form feeds and blank lines removed, or
        a human-readable message when OCR is unavailable, no image was given,
        or recognition failed.
    """
    if not tesseract_available:
        return "Tesseract OCR未安装,无法识别文本"
    if image is None:
        # A missing image is a different problem from a missing OCR engine;
        # report it as such instead of blaming the installation.
        return "图像无效,无法识别文本"

    try:
        custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
        text = pytesseract.image_to_string(image, config=custom_config)

        # Clean up the result: drop form feeds and collapse every run of
        # consecutive newlines (not just pairs) into a single newline.
        text = text.replace('\f', '')
        return re.sub(r'\n{2,}', '\n', text).strip()
    except Exception as e:
        # Best effort: surface the failure as text so the UI can show it.
        print(f"OCR识别失败: {str(e)}")
        return f"OCR识别失败: {str(e)}"
def extract_text_from_url(url, progress=gr.Progress()):
    """Fetch a web page, OCR every numbered GIF found on it and collect the text.

    Args:
        url: Address of the page to scan.
        progress: Gradio progress tracker used to report status to the UI.

    Returns:
        tuple: ``(text, images)``. ``text`` is the per-image OCR output joined
        by blank lines (or an error message); ``images`` is the list of
        pre-processed frames, empty whenever an error message is returned.
    """
    # Without Tesseract there is nothing useful to do.
    if not tesseract_available:
        return "Tesseract OCR安装失败,无法进行文本识别。请联系管理员解决此问题。", []

    try:
        # The temporary directory only holds the downloaded GIFs; everything
        # returned to the caller lives in memory, so nothing refers to files
        # that disappear when the directory is removed.
        with tempfile.TemporaryDirectory() as temp_dir:
            progress(0, desc="正在获取网页内容...")

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, timeout=15, headers=headers)
            if response.status_code != 200:
                return f"无法访问网页,状态码:{response.status_code}", []

            # Collect the GIF URLs.
            progress(0.2, desc="正在提取GIF图片链接...")
            gif_urls = extract_gif_urls(response.text)

            if not gif_urls:
                return "未找到符合条件的GIF图片", []

            total = len(gif_urls)
            progress(0.3, desc=f"找到{total}个GIF图片,开始处理...")

            # Download and process each GIF; the loop covers the remaining
            # 70% of the progress bar.
            all_text = []
            gif_images = []

            for index, gif_url in enumerate(gif_urls, start=1):
                progress(0.3 + 0.7 * index / total, desc=f"处理第{index}/{total}个GIF...")

                # Fall back to a generated name when the URL path has no
                # file-name component (e.g. the ".gif" sits in the query).
                filename = os.path.basename(urlparse(gif_url).path) or f"image_{index}.gif"

                # Download.
                gif_path = os.path.join(temp_dir, filename)
                if not download_gif(gif_url, gif_path):
                    all_text.append(f"【{filename}】下载失败")
                    continue

                # Pre-process.
                processed_image = process_gif_for_ocr(gif_path)
                if processed_image is None:
                    all_text.append(f"【{filename}】处理失败")
                    continue

                # Keep the in-memory image for the gallery. Re-opening a copy
                # saved inside temp_dir would leave a lazily loaded image
                # whose backing file is deleted on return.
                gif_images.append(processed_image)

                # Recognise the text.
                text = ocr_image(processed_image)
                all_text.append(f"【{filename}】\n{text}")

                # Short pause between images (presumably to go easy on the
                # remote server).
                time.sleep(0.5)

            progress(1.0, desc="处理完成")
            return "\n\n".join(all_text), gif_images

    except Exception as e:
        # UI boundary: report any failure as text instead of crashing the app.
        return f"处理过程出错:{str(e)}", []

def create_interface():
    """Build the Gradio Blocks interface and return it (without launching).

    The layout is, top to bottom: a title/description, an optional warning
    banner shown when Tesseract is unavailable, the URL input, the extract
    button, a two-column row with the recognised text and the gallery of
    processed frames, and a notes section. Clicking the button calls
    ``extract_text_from_url`` with the URL and routes its two return values
    to the text box and the gallery.
    """
    with gr.Blocks(title="霹雳布袋戏GIF文本提取工具") as demo:
        gr.Markdown("""
        # 霹雳布袋戏GIF文本提取工具
        
        这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片,并识别其中的文本内容。
        """)
        
        # Show the Tesseract status: warn when the OCR engine is unavailable
        if not tesseract_available:
            gr.Markdown("""
            <div style="background-color: #ffebee; padding: 10px; border-radius: 5px; color: #b71c1c;">
            ⚠️ 注意:Tesseract OCR引擎安装失败,可能无法正常识别文本。
            </div>
            """)
        
        with gr.Row():
            url_input = gr.Textbox(
                label="网页URL", 
                placeholder="请输入包含GIF的网页地址",
                value="https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM"
            )
        
        with gr.Row():
            extract_btn = gr.Button("提取文本", variant="primary")
        
        with gr.Row():
            with gr.Column(scale=1):
                result_text = gr.Textbox(label="识别结果", lines=20)
            
            with gr.Column(scale=1):
                processed_images = gr.Gallery(
                    label="处理后的GIF帧", 
                    show_label=True, 
                    elem_id="gallery",
                    columns=2,
                    height="auto"
                )
        
        with gr.Row():
            gr.Markdown("""
            ## 注意事项:
            - 首次使用可能需要时间安装OCR组件
            - 识别 accuracy 取决于GIF图片的清晰度
            - 处理可能需要几分钟时间,请耐心等待
            """)
        
        # Wire up the event: the button runs the extraction and fills both outputs
        extract_btn.click(
            fn=extract_text_from_url,
            inputs=[url_input],
            outputs=[result_text, processed_images]
        )
    
    return demo

# Build and launch the interface when this file is run as a script.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()