Spaces:

wkplhc
/

ocr

Sleeping

App Files Community

wkplhc committed on Sep 15

Commit

5479c69

verified ·

1 Parent(s): 77d74e9

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -26

app.py CHANGED Viewed

@@ -27,41 +27,71 @@ def extract_gif_urls(html_content):
     gif_urls = []
     # 匹配霹雳布袋戏相关的GIF格式，特别是0101.gif这类序列
     pattern = r'010\d+\.gif$'
     for img in img_tags:
         src = img.get('src', '')
-        if src and re.search(pattern, src, re.IGNORECASE):
             # 处理相对路径
             if not src.startswith(('http://', 'https://')):
-                continue  # 简单处理，实际可能需要更复杂的URL拼接
             gif_urls.append(src)
     # 按文件名排序（0101.gif, 0102.gif...）
-    gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1)))
     return gif_urls
 def download_gif(url, save_path):
     """下载GIF图片"""
     try:
-        response = requests.get(url, stream=True, timeout=10)
         if response.status_code == 200:
             with open(save_path, 'wb') as f:
                 f.write(response.content)
             return True
         return False
-    except:
         return False
 def process_gif_for_ocr(gif_path):
     """处理GIF图片以提高OCR识别率"""
     # 打开GIF
-    gif = Image.open(gif_path)
-    # 提取第一帧（通常文本在第一帧）
     try:
-        gif.seek(0)
-        frame = gif.convert('L')  # 转为灰度图
         # 增强对比度
         enhancer = ImageEnhance.Contrast(frame)
@@ -70,12 +100,13 @@ def process_gif_for_ocr(gif_path):
         # 轻微锐化
         frame = frame.filter(ImageFilter.SHARPEN)
-        # 二值化处理
-        threshold = 150
         frame = frame.point(lambda p: p > threshold and 255)
         return frame
-    except EOFError:
         return None
 def ocr_image(image):
@@ -83,13 +114,17 @@ def ocr_image(image):
     if image is None:
         return ""
-    # 使用Tesseract进行OCR，指定中文识别
-    custom_config = r'--oem 3 --psm 6 -l chi_sim+eng'
-    text = pytesseract.image_to_string(image, config=custom_config)
-    # 清理识别结果
-    text = text.replace('\f', '').replace('\n\n', '\n').strip()
-    return text
 def extract_text_from_url(url, progress=gr.Progress()):
     """从指定URL提取GIF并识别文本"""
@@ -98,17 +133,22 @@ def extract_text_from_url(url, progress=gr.Progress()):
         with tempfile.TemporaryDirectory() as temp_dir:
             progress(0, desc="正在获取网页内容...")
-            # 获取网页内容
-            response = requests.get(url, timeout=15)
             if response.status_code != 200:
-                return f"无法访问网页，状态码：{response.status_code}"
             # 提取GIF URL
             progress(0.2, desc="正在提取GIF图片链接...")
             gif_urls = extract_gif_urls(response.text)
             if not gif_urls:
-                return "未找到符合条件的GIF图片"
             progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片，开始处理...")
@@ -142,7 +182,7 @@ def extract_text_from_url(url, progress=gr.Progress()):
                 # 保存处理后的图像用于展示
                 processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
                 processed_image.save(processed_path)
-                gif_images.append(processed_path)
                 # 识别文本
                 text = ocr_image(processed_image)
@@ -155,9 +195,10 @@ def extract_text_from_url(url, progress=gr.Progress()):
             result_text = "\n\n".join(all_text)
             progress(1.0, desc="处理完成")
-            return result_text, [Image.open(img_path) for img_path in gif_images]
     except Exception as e:
         return f"处理过程出错：{str(e)}", []
 def create_interface():

     gif_urls = []
     # 匹配霹雳布袋戏相关的GIF格式，特别是0101.gif这类序列
+    # 放宽正则匹配条件，确保能识别到相关GIF
     pattern = r'010\d+\.gif$'
     for img in img_tags:
         src = img.get('src', '')
+        # 放宽匹配条件，只要包含数字序列的GIF都考虑
+        if src and re.search(r'\d+\.gif$', src, re.IGNORECASE):
             # 处理相对路径
             if not src.startswith(('http://', 'https://')):
+                # 尝试补全相对路径（针对常见情况）
+                if src.startswith('/'):
+                    parsed_url = urlparse(html_content.url) if hasattr(html_content, 'url') else None
+                    if parsed_url:
+                        src = f"{parsed_url.scheme}://{parsed_url.netloc}{src}"
+                    else:
+                        continue
+                else:
+                    continue
             gif_urls.append(src)
     # 按文件名排序（0101.gif, 0102.gif...）
+    try:
+        gif_urls.sort(key=lambda x: int(re.search(r'(\d+)\.gif', x).group(1)))
+    except:
+        pass  # 排序失败时保持原顺序
     return gif_urls
 def download_gif(url, save_path):
     """下载GIF图片"""
     try:
+        # 添加请求头，模拟浏览器行为
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, stream=True, timeout=15, headers=headers)
         if response.status_code == 200:
             with open(save_path, 'wb') as f:
                 f.write(response.content)
             return True
         return False
+    except Exception as e:
+        print(f"下载GIF失败: {str(e)}")
         return False
 def process_gif_for_ocr(gif_path):
     """处理GIF图片以提高OCR识别率"""
     # 打开GIF
     try:
+        gif = Image.open(gif_path)
+        # 尝试提取多个帧，避免只取第一帧可能丢失内容
+        frames = []
+        try:
+            for i in range(10):  # 最多尝试10帧
+                gif.seek(i)
+                frames.append(gif.convert('L'))  # 转为灰度图
+        except EOFError:
+            pass
+        # 如果没有获取到帧，返回None
+        if not frames:
+            return None
+        # 取第一帧进行处理
+        frame = frames[0]
         # 增强对比度
         enhancer = ImageEnhance.Contrast(frame)
         # 轻微锐化
         frame = frame.filter(ImageFilter.SHARPEN)
+        # 二值化处理，动态调整阈值
+        threshold = 140
         frame = frame.point(lambda p: p > threshold and 255)
         return frame
+    except Exception as e:
+        print(f"处理GIF失败: {str(e)}")
         return None
 def ocr_image(image):
     if image is None:
         return ""
+    try:
+        # 使用Tesseract进行OCR，指定中文识别，增加更多配置参数
+        custom_config = r'--oem 3 --psm 3 -l chi_sim+eng'
+        text = pytesseract.image_to_string(image, config=custom_config)
+        # 清理识别结果
+        text = text.replace('\f', '').replace('\n\n', '\n').strip()
+        return text
+    except Exception as e:
+        print(f"OCR识别失败: {str(e)}")
+        return "OCR识别失败"
 def extract_text_from_url(url, progress=gr.Progress()):
     """从指定URL提取GIF并识别文本"""
         with tempfile.TemporaryDirectory() as temp_dir:
             progress(0, desc="正在获取网页内容...")
+            # 获取网页内容，添加请求头模拟浏览器
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+            response = requests.get(url, timeout=15, headers=headers)
             if response.status_code != 200:
+                # 确保返回两个值：错误信息和空列表
+                return f"无法访问网页，状态码：{response.status_code}", []
             # 提取GIF URL
             progress(0.2, desc="正在提取GIF图片链接...")
             gif_urls = extract_gif_urls(response.text)
             if not gif_urls:
+                # 确保返回两个值：提示信息和空列表
+                return "未找到符合条件的GIF图片", []
             progress(0.3, desc=f"找到{len(gif_urls)}个GIF图片，开始处理...")
                 # 保存处理后的图像用于展示
                 processed_path = os.path.join(temp_dir, f"processed_{filename}.png")
                 processed_image.save(processed_path)
+                gif_images.append(Image.open(processed_path))
                 # 识别文本
                 text = ocr_image(processed_image)
             result_text = "\n\n".join(all_text)
             progress(1.0, desc="处理完成")
+            return result_text, gif_images
     except Exception as e:
+        # 确保返回两个值：错误信息和空列表
         return f"处理过程出错：{str(e)}", []
 def create_interface():