Spaces:
Sleeping
Sleeping
fix: pkuseg 模型预下载
Browse files
- app.py +38 -0
- src/mfa_runner.py +6 -3
app.py
CHANGED
|
@@ -175,6 +175,13 @@ def setup_mfa_linux():
|
|
| 175 |
|
| 176 |
# 3. 安装中文/日语分词依赖(无论新装还是已有环境都需要检查)
|
| 177 |
pip_path = mfa_env / "bin" / "pip"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
if pip_path.exists():
|
| 179 |
# 检查是否已安装分词依赖
|
| 180 |
pkuseg_path = mfa_env / "lib" / "python3.11" / "site-packages" / "spacy_pkuseg"
|
|
@@ -188,6 +195,37 @@ def setup_mfa_linux():
|
|
| 188 |
logger.info("分词依赖安装完成")
|
| 189 |
else:
|
| 190 |
logger.info("分词依赖已存在")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
# 4. 确保 MFA 环境的 bin 目录在 PATH 中
|
| 193 |
if mfa_bin_dir.exists() and str(mfa_bin_dir) not in os.environ.get("PATH", ""):
|
|
|
|
| 175 |
|
| 176 |
# 3. 安装中文/日语分词依赖(无论新装还是已有环境都需要检查)
|
| 177 |
pip_path = mfa_env / "bin" / "pip"
|
| 178 |
+
python_path = mfa_env / "bin" / "python"
|
| 179 |
+
|
| 180 |
+
# 设置 pkuseg 模型目录到持久化路径(避免每次重启重新下载)
|
| 181 |
+
pkuseg_home = PERSISTENT_MODELS_DIR / "pkuseg" if PERSISTENT_MODELS_DIR.parent.exists() else Path("/root/.pkuseg")
|
| 182 |
+
pkuseg_home.mkdir(parents=True, exist_ok=True)
|
| 183 |
+
os.environ["PKUSEG_HOME"] = str(pkuseg_home)
|
| 184 |
+
|
| 185 |
if pip_path.exists():
|
| 186 |
# 检查是否已安装分词依赖
|
| 187 |
pkuseg_path = mfa_env / "lib" / "python3.11" / "site-packages" / "spacy_pkuseg"
|
|
|
|
| 195 |
logger.info("分词依赖安装完成")
|
| 196 |
else:
|
| 197 |
logger.info("分词依赖已存在")
|
| 198 |
+
|
| 199 |
+
# 预下载 pkuseg 模型(避免运行时从 GitHub 下载超时)
|
| 200 |
+
pkuseg_model_path = pkuseg_home / "spacy_ontonotes"
|
| 201 |
+
if not pkuseg_model_path.exists() and python_path.exists():
|
| 202 |
+
logger.info(f"预下载 pkuseg 中文分词模型到 {pkuseg_home}...")
|
| 203 |
+
env = os.environ.copy()
|
| 204 |
+
env["PKUSEG_HOME"] = str(pkuseg_home)
|
| 205 |
+
|
| 206 |
+
# 重试下载(GitHub 访问不稳定)
|
| 207 |
+
max_retries = 3
|
| 208 |
+
for attempt in range(1, max_retries + 1):
|
| 209 |
+
try:
|
| 210 |
+
logger.info(f"下载尝试 {attempt}/{max_retries}...")
|
| 211 |
+
result = subprocess.run([
|
| 212 |
+
str(python_path), "-c",
|
| 213 |
+
"import spacy_pkuseg; spacy_pkuseg.pkuseg(postag=True)"
|
| 214 |
+
], env=env, capture_output=True, text=True, timeout=600)
|
| 215 |
+
if result.returncode == 0:
|
| 216 |
+
logger.info("pkuseg 模型下载完成")
|
| 217 |
+
break
|
| 218 |
+
else:
|
| 219 |
+
logger.warning(f"尝试 {attempt} 失败: {result.stderr[-200:] if result.stderr else ''}")
|
| 220 |
+
except subprocess.TimeoutExpired:
|
| 221 |
+
logger.warning(f"尝试 {attempt} 超时")
|
| 222 |
+
except Exception as e:
|
| 223 |
+
logger.warning(f"尝试 {attempt} 异常: {e}")
|
| 224 |
+
|
| 225 |
+
if attempt == max_retries:
|
| 226 |
+
logger.warning("pkuseg 模型下载失败,MFA 中文对齐可能不可用")
|
| 227 |
+
else:
|
| 228 |
+
logger.info(f"pkuseg 模型已存在: {pkuseg_model_path}")
|
| 229 |
|
| 230 |
# 4. 确保 MFA 环境的 bin 目录在 PATH 中
|
| 231 |
if mfa_bin_dir.exists() and str(mfa_bin_dir) not in os.environ.get("PATH", ""):
|
src/mfa_runner.py
CHANGED
|
@@ -93,9 +93,12 @@ def _build_mfa_env() -> dict:
|
|
| 93 |
]
|
| 94 |
env["PATH"] = ";".join(mfa_paths) + ";" + env.get("PATH", "")
|
| 95 |
else:
|
| 96 |
-
# Linux:
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
return env
|
| 101 |
|
|
|
|
| 93 |
]
|
| 94 |
env["PATH"] = ";".join(mfa_paths) + ";" + env.get("PATH", "")
|
| 95 |
else:
|
| 96 |
+
# Linux: 设置 pkuseg 模型目录(云端使用持久化路径)
|
| 97 |
+
persistent_models = Path("/home/studio_service/models")
|
| 98 |
+
if persistent_models.exists():
|
| 99 |
+
pkuseg_home = persistent_models / "pkuseg"
|
| 100 |
+
pkuseg_home.mkdir(parents=True, exist_ok=True)
|
| 101 |
+
env["PKUSEG_HOME"] = str(pkuseg_home)
|
| 102 |
|
| 103 |
return env
|
| 104 |
|