app.py
CHANGED
@@ -11,6 +11,29 @@ import numpy as np
 from paddleocr import PaddleOCR, draw_ocr
 from PIL import Image
 import gradio as gr
+import fasttext
+
+
+# Load the fasttext language-detection model
+# (the model file is downloaded automatically on first run)
+try:
+    # Check whether the model file already exists
+    model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "lid.176.bin")
+    if not os.path.exists(model_path):
+        # Download the model if it is missing
+        import urllib.request
+        print("Downloading the fasttext language-detection model...")
+        urllib.request.urlretrieve(
+            "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin",
+            model_path
+        )
+
+    # Load the model
+    lang_model = fasttext.load_model(model_path)
+    print("fasttext language-detection model loaded successfully")
+except Exception as e:
+    print(f"Warning: failed to load the fasttext model: {e}")
+    lang_model = None
 
 
 LANG_CONFIG = {
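
For reference, a minimal sketch of what the lid.176.bin model returns once loaded as above (assuming the fasttext package is installed and the model file sits in the working directory; the sample string is only illustrative). predict() yields labels in the __label__xx form that the detection code strips later:

import fasttext

model = fasttext.load_model("lid.176.bin")
labels, scores = model.predict("これは日本語のテキストです", k=1)
print(labels, scores)  # e.g. (('__label__ja',), array([...])) -- exact score varies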
@@ -22,7 +45,7 @@ LANG_CONFIG = {
     "japan": {"num_workers": 1},
 }
 
-# …
+# Language name mapping
 LANG_MAP = {
     "ch": "中文",
     "en": "英文",
@@ -32,14 +55,14 @@ LANG_MAP = {
     "japan": "日语",
 }
 
-# …
-LANG_FEATURES = {
-    "…": …,
-    "en": …,
-    "fr": …,
-    "…": …,
-    "…": …,
-    "…": …,
+# Map fasttext language codes to PaddleOCR language codes
+FASTTEXT_TO_PADDLE = {
+    "zh": "ch",      # Chinese
+    "en": "en",      # English
+    "fr": "fr",      # French
+    "de": "german",  # German
+    "ko": "korean",  # Korean
+    "ja": "japan",   # Japanese
 }
 
 CONCURRENCY_LIMIT = 8
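
The mapping only covers the six languages the Space exposes; any other code fasttext reports falls back to English through dict.get, as in the detection function further down. A small illustration (inputs are hypothetical):

print(FASTTEXT_TO_PADDLE.get("ja", "en"))  # -> "japan"
print(FASTTEXT_TO_PADDLE.get("ru", "en"))  # Russian is not mapped -> falls back to "en"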
@@ -95,7 +118,11 @@ class PaddleOCRModelManager(object):
 
 
 def create_model(lang):
-    …
+    # Use extra parameters for the Chinese model to improve recognition accuracy
+    if lang == "ch":
+        return PaddleOCR(lang=lang, use_angle_cls=True, use_gpu=False, rec_char_dict_path='ppocr/utils/ppocr_keys_v1.txt')
+    else:
+        return PaddleOCR(lang=lang, use_angle_cls=True, use_gpu=False)
 
 
 model_managers = {}
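
A rough smoke test for create_model, assuming a local image at sample.png (hypothetical path) and the classic PaddleOCR API used elsewhere in this file, where each recognized line carries its text and a confidence score:

ocr = create_model("en")
result = ocr.ocr("sample.png", cls=True)  # hypothetical local image
# Depending on the PaddleOCR version, result may be wrapped in an extra per-image list.
for line in result:
    text, confidence = line[1]
    print(text, confidence)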
@@ -113,38 +140,34 @@ def close_model_managers():
 atexit.register(close_model_managers)
 
 
-def …(text):
-    """…"""
-    if not text:
+def detect_language_with_fasttext(text):
+    """Detect the language of a piece of text with fasttext."""
+    if not text or not text.strip():
         return "en"
 
-    …
-    …
-    for lang, char_set in LANG_FEATURES.items():
-        if not char_set:  # skip empty character sets
-            continue
-
-        # Count this language's characteristic characters in the text
-        count = sum(1 for char in text if char in char_set)
-        if count > 0:
-            lang_scores[lang] = count / len(text)
-
-    # Special-case Korean (detected via its Unicode range)
-    korean_count = sum(1 for char in text if '\uac00' <= char <= '\ud7a3')
-    if korean_count > 0:
-        lang_scores["korean"] = korean_count / len(text)
-
-    # Default to English if no language features were detected
-    if not lang_scores:
+    if lang_model is None:
+        # Fall back to the default language if the fasttext model failed to load
         return "en"
 
-    …
-    …
+    try:
+        # Pre-process the text: cap its length for efficiency
+        text = text[:1000]
+
+        # Predict the language with fasttext
+        predictions = lang_model.predict(text.replace('\n', ' '))
+        lang_code = predictions[0][0].replace('__label__', '')
+
+        # Map to a language code supported by PaddleOCR
+        return FASTTEXT_TO_PADDLE.get(lang_code, "en")
+    except Exception as e:
+        print(f"Language detection error: {e}")
+        return "en"  # default to English on error
 
 
 def auto_detect_language(image_path):
     """Detect the language using multi-model voting."""
-    …
+    # Try models for different languages
+    languages_to_try = ["en", "ch"]  # English first, then Chinese
     results = {}
 
     for lang in languages_to_try:
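
A quick way to sanity-check detect_language_with_fasttext on its own, independent of any OCR output (sample strings are illustrative; the expected values assume the lid.176.bin model loaded successfully):

print(detect_language_with_fasttext("Bonjour, comment allez-vous aujourd'hui ?"))  # expected: "fr"
print(detect_language_with_fasttext("안녕하세요. 오늘 날씨가 좋네요."))            # expected: "korean"
print(detect_language_with_fasttext(""))                                           # empty text -> "en"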
@@ -155,10 +178,11 @@ def auto_detect_language(image_path):
             # Extract all recognized text
             all_text = " ".join([line[1][0] for line in result])
             if all_text.strip():
-                # …
-                detected = …
+                # Detect the language of the text with fasttext
+                detected = detect_language_with_fasttext(all_text)
                 results[detected] = results.get(detected, 0) + 1
-        except Exception:
+        except Exception as e:
+            print(f"OCR processing error ({lang}): {e}")
             continue
 
     # Default to English if there were no detection results
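
The hunk above only shows the per-model tallying; the rest of auto_detect_language is unchanged and not visible in this diff. As a hedged sketch of what the voting typically resolves to (the actual function may differ):

# results might look like {"ch": 2, "en": 1} after both OCR passes
best_lang = max(results, key=results.get) if results else "en"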