Spaces:

wkplhc
/

ocr

Sleeping

App Files Files Community

wkplhc committed on Sep 15

Commit

28bf845

verified ·

1 Parent(s): db73429

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -45

app.py CHANGED Viewed

@@ -4,39 +4,61 @@ from bs4 import BeautifulSoup
 import re
 import os
 import tempfile
-import pytesseract
-from PIL import Image, ImageEnhance, ImageFilter
 import numpy as np
 from urllib.parse import urlparse
 import time
 # 确保中文显示正常
 import matplotlib.pyplot as plt
 plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
-# 设置Tesseract OCR路径（Hugging Face Spaces上已预安装）
-try:
-    pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
-except:
-    pass  # 在Windows上可能需要手动设置
 def extract_gif_urls(html_content):
     """从HTML内容中提取符合条件的GIF图片URL"""
     soup = BeautifulSoup(html_content, 'html.parser')
     img_tags = soup.find_all('img')
     gif_urls = []
-    # 匹配霹雳布袋戏相关的GIF格式，特别是0101.gif这类序列
     # 放宽正则匹配条件，确保能识别到相关GIF
-    pattern = r'010\d+\.gif$'
     for img in img_tags:
         src = img.get('src', '')
-        # 放宽匹配条件，只要包含数字序列的GIF都考虑
-        if src and re.search(r'\d+\.gif$', src, re.IGNORECASE):
             # 处理相对路径
             if not src.startswith(('http://', 'https://')):
-                # 尝试补全相对路径（针对常见情况）
                 if src.startswith('/'):
                     parsed_url = urlparse(html_content.url) if hasattr(html_content, 'url') else None
                     if parsed_url:
@@ -47,17 +69,16 @@ def extract_gif_urls(html_content):
                     continue
             gif_urls.append(src)
-    # 按文件名排序（0101.gif, 0102.gif...）
     try:
         gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1)))
     except:
-        pass  # 排序失败时保持原顺序
     return gif_urls
 def download_gif(url, save_path):
     """下载GIF图片"""
     try:
-        # 添加请求头，模拟浏览器行为
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
@@ -73,20 +94,21 @@ def download_gif(url, save_path):
 def process_gif_for_ocr(gif_path):
     """处理GIF图片以提高OCR识别率"""
-    # 打开GIF
     try:
         gif = Image.open(gif_path)
-        # 尝试提取多个帧，避免只取第一帧可能丢失内容
         frames = []
         try:
-            for i in range(10):  # 最多尝试10帧
                 gif.seek(i)
-                frames.append(gif.convert('L'))  # 转为灰度图
         except EOFError:
             pass
-        # 如果没有获取到帧，返回None
         if not frames:
             return None
@@ -100,7 +122,7 @@ def process_gif_for_ocr(gif_path):
         # 轻微锐化
         frame = frame.filter(ImageFilter.SHARPEN)
-        # 二值化处理，动态调整阈值
         threshold = 140
         frame = frame.point(lambda p: p > threshold and 255)
@@ -111,11 +133,10 @@ def process_gif_for_ocr(gif_path):
 def ocr_image(image):
     """对处理后的图像进行OCR识别"""
-    if image is None:
-        return ""
     try:
-        # 使用Tesseract进行OCR，指定中文识别，增加更多配置参数
         custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
         text = pytesseract.image_to_string(image, config=custom_config)
@@ -124,22 +145,23 @@ def ocr_image(image):
         return text
     except Exception as e:
         print(f"OCR识别失败: {str(e)}")
-        return "OCR识别失败"
 def extract_text_from_url(url, progress=gr.Progress()):
     """从指定URL提取GIF并识别文本"""
     try:
-        # 创建临时目录
         with tempfile.TemporaryDirectory() as temp_dir:
             progress(0, desc="正在获取网页内容...")
-            # 获取网页内容，添加请求头模拟浏览器
             headers = {
                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
             }
             response = requests.get(url, timeout=15, headers=headers)
             if response.status_code != 200:
-                # 确保返回两个值：错误信息和空列表
                 return f"无法访问网页，状态码：{response.status_code}", []
             # 提取GIF URL
@@ -147,7 +169,6 @@ def extract_text_from_url(url, progress=gr.Progress()):
             gif_urls = extract_gif_urls(response.text)
             if not gif_urls:
-                # 确保返回两个值：提示信息和空列表
                 return "未找到符合条件的GIF图片", []
             progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片，开始处理...")
@@ -159,11 +180,9 @@ def extract_text_from_url(url, progress=gr.Progress()):
             current_progress = 0.3
             for i, gif_url in enumerate(gif_urls):
-                # 更新进度
                 current_progress += step
                 progress(current_progress, desc=f"处理第{i+1}/{len(gif_urls)}个GIF...")
-                # 提取文件名
                 parsed_url = urlparse(gif_url)
                 filename = os.path.basename(parsed_url.path)
@@ -173,13 +192,13 @@ def extract_text_from_url(url, progress=gr.Progress()):
                     all_text.append(f"【{filename}】下载失败")
                     continue
-                # 处理GIF以提高OCR识别率
                 processed_image = process_gif_for_ocr(gif_path)
                 if processed_image is None:
                     all_text.append(f"【{filename}】处理失败")
                     continue
-                # 保存处理后的图像用于展示
                 processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
                 processed_image.save(processed_path)
                 gif_images.append(Image.open(processed_path))
@@ -188,17 +207,13 @@ def extract_text_from_url(url, progress=gr.Progress()):
                 text = ocr_image(processed_image)
                 all_text.append(f"【{filename}】\n{text}")
-                # 避免请求过于频繁
                 time.sleep(0.5)
-            # 拼接所有文本
             result_text = "\n\n".join(all_text)
             progress(1.0, desc="处理完成")
             return result_text, gif_images
     except Exception as e:
-        # 确保返回两个值：错误信息和空列表
         return f"处理过程出错：{str(e)}", []
 def create_interface():
@@ -208,13 +223,16 @@ def create_interface():
         # 霹雳布袋戏GIF文本提取工具
         这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片，并识别其中的文本内容。
-        ## 使用方法：
-        1. 输入包含GIF的网页URL（例如：https://pilicreateworld.tw-blog.com/PILI/PILI69/01.HTM）
-        2. 点击"提取文本"按钮
-        3. 等待处理完成，查看识别结果
         """)
         with gr.Row():
             url_input = gr.Textbox(
                 label="网页URL",
@@ -234,16 +252,16 @@ def create_interface():
                     label="处理后的GIF帧",
                     show_label=True,
                     elem_id="gallery",
-                    columns=2,  # 适配旧版本Gradio的参数名称
                     height="auto"
                 )
         with gr.Row():
             gr.Markdown("""
             ## 注意事项：
             - 识别 accuracy 取决于GIF图片的清晰度
             - 处理可能需要几分钟时间，请耐心等待
-            - 如遇网络问题，请检查URL是否正确或稍后重试
             """)
         # 设置事件

 import re
 import os
 import tempfile
+import subprocess
 import numpy as np
 from urllib.parse import urlparse
 import time
+# 尝试安装Tesseract（仅在Hugging Face Spaces环境中有效）
+def install_tesseract():
+    try:
+        # 检查Tesseract是否已安装
+        subprocess.run(['tesseract', '--version'], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return True
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        print("Tesseract未安装，尝试自动安装...")
+        try:
+            # 在Ubuntu/Debian系统上安装Tesseract
+            subprocess.run(['apt-get', 'update'], check=True)
+            subprocess.run(['apt-get', 'install', '-y', 'tesseract-ocr', 'tesseract-ocr-chi-sim'], check=True)
+            # 安装Python绑定
+            subprocess.run(['pip', 'install', 'pytesseract'], check=True)
+            return True
+        except Exception as e:
+            print(f"自动安装Tesseract失败: {str(e)}")
+            return False
+# 检查并安装Tesseract
+tesseract_available = install_tesseract()
+# 只有在Tesseract可用时才导入相关库
+if tesseract_available:
+    import pytesseract
+    from PIL import Image, ImageEnhance, ImageFilter
+    # 设置Tesseract OCR路径
+    try:
+        pytesseract.pytesseract.tesseract_cmd = subprocess.check_output(['which', 'tesseract']).decode().strip()
+    except:
+        pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
 # 确保中文显示正常
 import matplotlib.pyplot as plt
 plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
 def extract_gif_urls(html_content):
     """从HTML内容中提取符合条件的GIF图片URL"""
     soup = BeautifulSoup(html_content, 'html.parser')
     img_tags = soup.find_all('img')
     gif_urls = []
     # 放宽正则匹配条件，确保能识别到相关GIF
+    pattern = r'\d+\.gif$'
     for img in img_tags:
         src = img.get('src', '')
+        if src and re.search(pattern, src, re.IGNORECASE):
             # 处理相对路径
             if not src.startswith(('http://', 'https://')):
                 if src.startswith('/'):
                     parsed_url = urlparse(html_content.url) if hasattr(html_content, 'url') else None
                     if parsed_url:
                     continue
             gif_urls.append(src)
+    # 按文件名排序
     try:
         gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1)))
     except:
+        pass
     return gif_urls
 def download_gif(url, save_path):
     """下载GIF图片"""
     try:
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
 def process_gif_for_ocr(gif_path):
     """处理GIF图片以提高OCR识别率"""
+    if not tesseract_available:
+        return None
     try:
         gif = Image.open(gif_path)
+        # 尝试提取多个帧
         frames = []
         try:
+            for i in range(10):
                 gif.seek(i)
+                frames.append(gif.convert('L'))
         except EOFError:
             pass
         if not frames:
             return None
         # 轻微锐化
         frame = frame.filter(ImageFilter.SHARPEN)
+        # 二值化处理
         threshold = 140
         frame = frame.point(lambda p: p > threshold and 255)
 def ocr_image(image):
     """对处理后的图像进行OCR识别"""
+    if not tesseract_available or image is None:
+        return "Tesseract OCR未安装，无法识别文本"
     try:
         custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
         text = pytesseract.image_to_string(image, config=custom_config)
         return text
     except Exception as e:
         print(f"OCR识别失败: {str(e)}")
+        return f"OCR识别失败: {str(e)}"
 def extract_text_from_url(url, progress=gr.Progress()):
     """从指定URL提取GIF并识别文本"""
+    # 检查Tesseract是否可用
+    if not tesseract_available:
+        return "Tesseract OCR安装失败，无法进行文本识别。请联系管理员解决此问题。", []
     try:
         with tempfile.TemporaryDirectory() as temp_dir:
             progress(0, desc="正在获取网页内容...")
             headers = {
                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
             }
             response = requests.get(url, timeout=15, headers=headers)
             if response.status_code != 200:
                 return f"无法访问网页，状态码：{response.status_code}", []
             # 提取GIF URL
             gif_urls = extract_gif_urls(response.text)
             if not gif_urls:
                 return "未找到符合条件的GIF图片", []
             progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片，开始处理...")
             current_progress = 0.3
             for i, gif_url in enumerate(gif_urls):
                 current_progress += step
                 progress(current_progress, desc=f"处理第{i+1}/{len(gif_urls)}个GIF...")
                 parsed_url = urlparse(gif_url)
                 filename = os.path.basename(parsed_url.path)
                     all_text.append(f"【{filename}】下载失败")
                     continue
+                # 处理GIF
                 processed_image = process_gif_for_ocr(gif_path)
                 if processed_image is None:
                     all_text.append(f"【{filename}】处理失败")
                     continue
+                # 保存处理后的图像
                 processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
                 processed_image.save(processed_path)
                 gif_images.append(Image.open(processed_path))
                 text = ocr_image(processed_image)
                 all_text.append(f"【{filename}】\n{text}")
                 time.sleep(0.5)
             result_text = "\n\n".join(all_text)
             progress(1.0, desc="处理完成")
             return result_text, gif_images
     except Exception as e:
         return f"处理过程出错：{str(e)}", []
 def create_interface():
         # 霹雳布袋戏GIF文本提取工具
         这个工具可以从指定的霹雳布袋戏相关网页中提取GIF图片，并识别其中的文本内容。
         """)
+        # 显示Tesseract状态
+        if not tesseract_available:
+            gr.Markdown("""
+            <div style="background-color: #ffebee; padding: 10px; border-radius: 5px; color: #b71c1c;">
+            ⚠️ 注意：Tesseract OCR引擎安装失败，可能无法正常识别文本。
+            </div>
+            """)
         with gr.Row():
             url_input = gr.Textbox(
                 label="网页URL",
                     label="处理后的GIF帧",
                     show_label=True,
                     elem_id="gallery",
+                    columns=2,
                     height="auto"
                 )
         with gr.Row():
             gr.Markdown("""
             ## 注意事项：
+            - 首次使用可能需要时间安装OCR组件
             - 识别 accuracy 取决于GIF图片的清晰度
             - 处理可能需要几分钟时间，请耐心等待
             """)
         # 设置事件