TNOT committed on
Commit
8c2dad0
·
1 Parent(s): 6a97e38

fix: pkuseg 模型预下载

Browse files
Files changed (2) hide show
  1. app.py +38 -0
  2. src/mfa_runner.py +6 -3
app.py CHANGED
@@ -175,6 +175,13 @@ def setup_mfa_linux():
175
 
176
  # 3. 安装中文/日语分词依赖(无论新装还是已有环境都需要检查)
177
  pip_path = mfa_env / "bin" / "pip"
 
 
 
 
 
 
 
178
  if pip_path.exists():
179
  # 检查是否已安装分词依赖
180
  pkuseg_path = mfa_env / "lib" / "python3.11" / "site-packages" / "spacy_pkuseg"
@@ -188,6 +195,37 @@ def setup_mfa_linux():
188
  logger.info("分词依赖安装完成")
189
  else:
190
  logger.info("分词依赖已存在")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  # 4. 确保 MFA 环境的 bin 目录在 PATH 中
193
  if mfa_bin_dir.exists() and str(mfa_bin_dir) not in os.environ.get("PATH", ""):
 
175
 
176
  # 3. 安装中文/日语分词依赖(无论新装还是已有环境都需要检查)
177
  pip_path = mfa_env / "bin" / "pip"
178
+ python_path = mfa_env / "bin" / "python"
179
+
180
+ # 设置 pkuseg 模型目录到持久化路径(避免每次重启重新下载)
181
+ pkuseg_home = PERSISTENT_MODELS_DIR / "pkuseg" if PERSISTENT_MODELS_DIR.parent.exists() else Path("/root/.pkuseg")
182
+ pkuseg_home.mkdir(parents=True, exist_ok=True)
183
+ os.environ["PKUSEG_HOME"] = str(pkuseg_home)
184
+
185
  if pip_path.exists():
186
  # 检查是否已安装分词依赖
187
  pkuseg_path = mfa_env / "lib" / "python3.11" / "site-packages" / "spacy_pkuseg"
 
195
  logger.info("分词依赖安装完成")
196
  else:
197
  logger.info("分词依赖已存在")
198
+
199
+ # 预下载 pkuseg 模型(避免运行时从 GitHub 下载超时)
200
+ pkuseg_model_path = pkuseg_home / "spacy_ontonotes"
201
+ if not pkuseg_model_path.exists() and python_path.exists():
202
+ logger.info(f"预下载 pkuseg 中文分词模型到 {pkuseg_home}...")
203
+ env = os.environ.copy()
204
+ env["PKUSEG_HOME"] = str(pkuseg_home)
205
+
206
+ # 重试下载(GitHub 访问不稳定)
207
+ max_retries = 3
208
+ for attempt in range(1, max_retries + 1):
209
+ try:
210
+ logger.info(f"下载尝试 {attempt}/{max_retries}...")
211
+ result = subprocess.run([
212
+ str(python_path), "-c",
213
+ "import spacy_pkuseg; spacy_pkuseg.pkuseg(postag=True)"
214
+ ], env=env, capture_output=True, text=True, timeout=600)
215
+ if result.returncode == 0:
216
+ logger.info("pkuseg 模型下载完成")
217
+ break
218
+ else:
219
+ logger.warning(f"尝试 {attempt} 失败: {result.stderr[-200:] if result.stderr else ''}")
220
+ except subprocess.TimeoutExpired:
221
+ logger.warning(f"尝试 {attempt} 超时")
222
+ except Exception as e:
223
+ logger.warning(f"尝试 {attempt} 异常: {e}")
224
+
225
+ if attempt == max_retries:
226
+ logger.warning("pkuseg 模型下载失败,MFA 中文对齐可能不可用")
227
+ else:
228
+ logger.info(f"pkuseg 模型已存在: {pkuseg_model_path}")
229
 
230
  # 4. 确保 MFA 环境的 bin 目录在 PATH 中
231
  if mfa_bin_dir.exists() and str(mfa_bin_dir) not in os.environ.get("PATH", ""):
src/mfa_runner.py CHANGED
@@ -93,9 +93,12 @@ def _build_mfa_env() -> dict:
93
  ]
94
  env["PATH"] = ";".join(mfa_paths) + ";" + env.get("PATH", "")
95
  else:
96
- # Linux: 确保 conda 环境变量正确
97
- # 通常不需要额外设置,但保留扩展点
98
- pass
 
 
 
99
 
100
  return env
101
 
 
93
  ]
94
  env["PATH"] = ";".join(mfa_paths) + ";" + env.get("PATH", "")
95
  else:
96
+ # Linux: 设置 pkuseg 模型目录(云端使用持久化路径)
97
+ persistent_models = Path("/home/studio_service/models")
98
+ if persistent_models.exists():
99
+ pkuseg_home = persistent_models / "pkuseg"
100
+ pkuseg_home.mkdir(parents=True, exist_ok=True)
101
+ env["PKUSEG_HOME"] = str(pkuseg_home)
102
 
103
  return env
104