chen666-666 committed on
Commit
221602b
·
verified ·
1 Parent(s): db5986d

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +23 -39
  2. requirements.txt +13 -26
app.py CHANGED
@@ -7,20 +7,14 @@ import json
7
  import chardet
8
  from sklearn.metrics import precision_score, recall_score, f1_score
9
  import time
10
- from nlp_pipeline import process_text
11
-
12
  # ======================== 数据库模块 ========================
13
  from sqlalchemy import create_engine
14
  from sqlalchemy.orm import sessionmaker
15
  from contextlib import contextmanager
16
  import logging
17
 
18
- # 日志配置
19
- logger = logging.getLogger(__name__)
20
- logging.basicConfig(
21
- level=logging.INFO,
22
- format="%(asctime)s - %(levelname)s - %(message)s"
23
- )
24
 
25
  # 使用SQLAlchemy的连接池来管理数据库连接
26
  DATABASE_URL = "mysql+pymysql://user:password@host/dbname" # 请根据实际情况修改连接字符串
@@ -415,41 +409,31 @@ def process_text(text, model_type="bert"):
415
 
416
 
417
  def process_file(file, model_type="bert"):
418
- # 读取二进制
419
- with open(file.name, 'rb') as f:
420
- raw = f.read()
421
- if len(raw) > 5 * 1024 * 1024:
422
- return "❌ 文件太大", "", "", ""
423
-
424
- detect = chardet.detect(raw)
425
- encoding = detect.get('encoding')
426
- logger.info(f"chardet 猜测编码:{encoding} (置信度 {detect.get('confidence'):.2f})")
427
-
428
- text = None
429
- if encoding:
430
- try:
431
- text = raw.decode(encoding)
432
- except UnicodeDecodeError:
433
- logger.warning(f"{encoding} 解码失败,尝试其他编码")
434
 
435
- if text is None:
436
- for enc in ['utf-8','gb18030','utf-16','big5','shift_jis','iso-8859-1']:
437
- try:
438
- text = raw.decode(enc)
439
- logger.info(f"成功使用 {enc} 解码")
440
- break
441
- except UnicodeDecodeError:
442
- continue
443
 
444
- if text is None:
445
  try:
446
- text = raw.decode('utf-8', errors='replace')
447
- logger.warning("所有严格解码失败,使用 utf-8+replace 模式")
448
- except Exception:
449
- return "❌ 编码解析失败(所有尝试均失败)", "", "", ""
450
-
451
- return process_text(text, model_type)
 
 
 
 
 
 
452
 
 
 
 
453
 
454
 
455
 
 
7
  import chardet
8
  from sklearn.metrics import precision_score, recall_score, f1_score
9
  import time
 
 
10
  # ======================== 数据库模块 ========================
11
  from sqlalchemy import create_engine
12
  from sqlalchemy.orm import sessionmaker
13
  from contextlib import contextmanager
14
  import logging
15
 
16
+ # 配置日志
17
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
 
 
 
18
 
19
  # 使用SQLAlchemy的连接池来管理数据库连接
20
  DATABASE_URL = "mysql+pymysql://user:password@host/dbname" # 请根据实际情况修改连接字符串
 
409
 
410
 
411
  def process_file(file, model_type="bert"):
412
+ try:
413
+ with open(file.name, 'rb') as f:
414
+ content = f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
+ if len(content) > 5 * 1024 * 1024:
417
+ return "❌ 文件太大", "", "", ""
 
 
 
 
 
 
418
 
419
+ # 检测编码
420
  try:
421
+ encoding = chardet.detect(content)['encoding'] or 'utf-8'
422
+ text = content.decode(encoding)
423
+ except UnicodeDecodeError:
424
+ # 尝试常见中文编码
425
+ for enc in ['gb18030', 'utf-16', 'big5'] :
426
+ try:
427
+ text = content.decode(enc)
428
+ break
429
+ except:
430
+ continue
431
+ else:
432
+ return "❌ 编码解析失败", "", "", ""
433
 
434
+ return process_text(text, model_type)
435
+ except Exception as e:
436
+ return f"❌ 文件处理错误: {str(e)}", "", "", ""
437
 
438
 
439
 
requirements.txt CHANGED
@@ -1,28 +1,15 @@
1
- # ========== 深度学习 & 加速 ==========
2
- torch>=2.1.0,<3.0.0
3
- transformers==4.39.3
4
- accelerate>=0.27.0,<1.0.0
5
- sentencepiece>=0.2.0,<0.3.0
6
- cpm-kernels>=1.0.11,<2.0.0
7
-
8
- # ========== 自然语言处理 ==========
9
- networkx>=3.0,<4.0
10
- scikit-learn>=1.3.0,<2.0.0
11
- chardet>=5.2.0,<6.0.0
12
- protobuf==3.20.3
13
-
14
- # ========== 数据 & 可视化 ==========
15
- pandas>=2.1.0,<3.0.0
16
- pyvis>=0.3.2,<0.4.0
17
-
18
- # ========== Web 服务 ==========
19
  gradio==3.50.2
20
- uvicorn[standard]>=0.22.0,<1.0.0 # Gradio 部署时可选
21
- gunicorn>=20.1.0,<21.0.0 # 生产环境 WSGI
22
-
23
- # ========== 配置管理 ==========
24
- python-dotenv>=1.0.0,<2.0.0
25
-
26
- # ========== 数据库 ==========
27
- sqlalchemy>=1.4,<2.0.0
 
 
 
28
  pymysql==1.1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  gradio==3.50.2
2
+ transformers==4.39.3
3
+ torch>=2.1.0,<3.0.0
4
+ networkx>=3.0
5
+ python-dotenv>=1.0.0
6
+ sentencepiece>=0.2.0
7
+ cpm-kernels>=1.0.11
8
+ accelerate>=0.27.0
9
+ scikit-learn>=1.3.0
10
+ chardet>=5.2.0
11
+ pandas>=2.1.0
12
+ pyvis>=0.3.2
13
  pymysql==1.1.0
14
+ protobuf==3.20.3 # 避免与新版transformers冲突
15
+ sqlalchemy>=1.4