Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files
- app.py +23 -39
- requirements.txt +13 -26
app.py
CHANGED
|
@@ -7,20 +7,14 @@ import json
|
|
| 7 |
import chardet
|
| 8 |
from sklearn.metrics import precision_score, recall_score, f1_score
|
| 9 |
import time
|
| 10 |
-
from nlp_pipeline import process_text
|
| 11 |
-
|
| 12 |
# ======================== 数据库模块 ========================
|
| 13 |
from sqlalchemy import create_engine
|
| 14 |
from sqlalchemy.orm import sessionmaker
|
| 15 |
from contextlib import contextmanager
|
| 16 |
import logging
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
|
| 20 |
-
logging.basicConfig(
|
| 21 |
-
level=logging.INFO,
|
| 22 |
-
format="%(asctime)s - %(levelname)s - %(message)s"
|
| 23 |
-
)
|
| 24 |
|
| 25 |
# 使用SQLAlchemy的连接池来管理数据库连接
|
| 26 |
DATABASE_URL = "mysql+pymysql://user:password@host/dbname" # 请根据实际情况修改连接字符串
|
|
@@ -415,41 +409,31 @@ def process_text(text, model_type="bert"):
|
|
| 415 |
|
| 416 |
|
| 417 |
def process_file(file, model_type="bert"):
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
if len(raw) > 5 * 1024 * 1024:
|
| 422 |
-
return "❌ 文件太大", "", "", ""
|
| 423 |
-
|
| 424 |
-
detect = chardet.detect(raw)
|
| 425 |
-
encoding = detect.get('encoding')
|
| 426 |
-
logger.info(f"chardet 猜测编码:{encoding} (置信度 {detect.get('confidence'):.2f})")
|
| 427 |
-
|
| 428 |
-
text = None
|
| 429 |
-
if encoding:
|
| 430 |
-
try:
|
| 431 |
-
text = raw.decode(encoding)
|
| 432 |
-
except UnicodeDecodeError:
|
| 433 |
-
logger.warning(f"{encoding} 解码失败,尝试其他编码")
|
| 434 |
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
try:
|
| 438 |
-
text = raw.decode(enc)
|
| 439 |
-
logger.info(f"成功使用 {enc} 解码")
|
| 440 |
-
break
|
| 441 |
-
except UnicodeDecodeError:
|
| 442 |
-
continue
|
| 443 |
|
| 444 |
-
|
| 445 |
try:
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
except
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
|
|
|
|
|
|
|
|
|
|
| 453 |
|
| 454 |
|
| 455 |
|
|
|
|
| 7 |
import chardet
|
| 8 |
from sklearn.metrics import precision_score, recall_score, f1_score
|
| 9 |
import time
|
|
|
|
|
|
|
| 10 |
# ======================== 数据库模块 ========================
|
| 11 |
from sqlalchemy import create_engine
|
| 12 |
from sqlalchemy.orm import sessionmaker
|
| 13 |
from contextlib import contextmanager
|
| 14 |
import logging
|
| 15 |
|
| 16 |
+
# 配置日志
|
| 17 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# 使用SQLAlchemy的连接池来管理数据库连接
|
| 20 |
DATABASE_URL = "mysql+pymysql://user:password@host/dbname" # 请根据实际情况修改连接字符串
|
|
|
|
| 409 |
|
| 410 |
|
| 411 |
def process_file(file, model_type="bert"):
    """Read an uploaded file, decode it to text and pass it to ``process_text``.

    Args:
        file: Object exposing a ``name`` attribute that holds the path of the
            file on disk (presumably a Gradio upload object; confirm against
            the caller).
        model_type: Forwarded unchanged to ``process_text``.

    Returns:
        Whatever ``process_text(text, model_type)`` returns on success.
        On failure, a 4-tuple whose first element is a "❌ ..." message and
        whose remaining three elements are empty strings.
    """
    max_bytes = 5 * 1024 * 1024
    # Tried in order when the chardet guess is unusable.
    fallback_encodings = ('gb18030', 'utf-16', 'big5')

    try:
        with open(file.name, 'rb') as f:
            # Read at most one byte past the limit: enough to recognise an
            # oversized file without loading all of it into memory.
            content = f.read(max_bytes + 1)

        if len(content) > max_bytes:
            return "❌ 文件太大", "", "", ""

        # chardet may report no encoding at all; fall back to UTF-8 then.
        detected = chardet.detect(content)['encoding'] or 'utf-8'

        text = None
        for enc in (detected, *fallback_encodings):
            try:
                text = content.decode(enc)
                break
            except (UnicodeDecodeError, LookupError):
                # UnicodeDecodeError: the bytes are not valid in this encoding.
                # LookupError: the codec name is unknown to Python. Either way,
                # move on to the next candidate.
                continue

        if text is None:
            return "❌ 编码解析失败", "", "", ""

        return process_text(text, model_type)
    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of raising.
        return f"❌ 文件处理错误: {str(e)}", "", "", ""
|
| 437 |
|
| 438 |
|
| 439 |
|
requirements.txt
CHANGED
|
@@ -1,28 +1,15 @@
|
|
| 1 |
-
# ========== 深度学习 & 加速 ==========
|
| 2 |
-
torch>=2.1.0,<3.0.0
|
| 3 |
-
transformers==4.39.3
|
| 4 |
-
accelerate>=0.27.0,<1.0.0
|
| 5 |
-
sentencepiece>=0.2.0,<0.3.0
|
| 6 |
-
cpm-kernels>=1.0.11,<2.0.0
|
| 7 |
-
|
| 8 |
-
# ========== 自然语言处理 ==========
|
| 9 |
-
networkx>=3.0,<4.0
|
| 10 |
-
scikit-learn>=1.3.0,<2.0.0
|
| 11 |
-
chardet>=5.2.0,<6.0.0
|
| 12 |
-
protobuf==3.20.3
|
| 13 |
-
|
| 14 |
-
# ========== 数据 & 可视化 ==========
|
| 15 |
-
pandas>=2.1.0,<3.0.0
|
| 16 |
-
pyvis>=0.3.2,<0.4.0
|
| 17 |
-
|
| 18 |
-
# ========== Web 服务 ==========
|
| 19 |
gradio==3.50.2
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
| 28 |
pymysql==1.1.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
gradio==3.50.2
|
| 2 |
+
transformers==4.39.3
|
| 3 |
+
torch>=2.1.0,<3.0.0
|
| 4 |
+
networkx>=3.0
|
| 5 |
+
python-dotenv>=1.0.0
|
| 6 |
+
sentencepiece>=0.2.0
|
| 7 |
+
cpm-kernels>=1.0.11
|
| 8 |
+
accelerate>=0.27.0
|
| 9 |
+
scikit-learn>=1.3.0
|
| 10 |
+
chardet>=5.2.0
|
| 11 |
+
pandas>=2.1.0
|
| 12 |
+
pyvis>=0.3.2
|
| 13 |
pymysql==1.1.0
|
| 14 |
+
protobuf==3.20.3 # 避免与新版transformers冲突
|
| 15 |
+
sqlalchemy>=1.4
|