Spaces:
Sleeping
Sleeping
improve: context management
Browse files- modules/info_extractor.py +27 -9
modules/info_extractor.py
CHANGED
|
@@ -3,6 +3,7 @@ import re
|
|
| 3 |
from utils.logger import log
|
| 4 |
import jieba
|
| 5 |
from typing import List, Tuple
|
|
|
|
| 6 |
|
| 7 |
class InfoExtractor:
|
| 8 |
def __init__(self):
|
|
@@ -285,17 +286,23 @@ class InfoExtractor:
|
|
| 285 |
'weekend': 2, 'week': 7, 'month': 30, 'vacation': 7, 'holiday': 3
|
| 286 |
}
|
| 287 |
|
| 288 |
-
def extract(self, user_message: str) -> dict:
|
| 289 |
|
| 290 |
# 输入验证
|
| 291 |
if not user_message or not isinstance(user_message, str):
|
| 292 |
log.warning("⚠️ 收到无效的用户消息")
|
| 293 |
-
return {}
|
| 294 |
|
| 295 |
if len(user_message.strip()) < 2:
|
| 296 |
log.warning("⚠️ 用户消息过短,跳过信息提取")
|
| 297 |
-
return {}
|
| 298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
log.info(f"🛠️ 使用分词策略提取信息:'{user_message[:50]}...'")
|
| 300 |
|
| 301 |
# 1. 智能分词
|
|
@@ -303,25 +310,36 @@ class InfoExtractor:
|
|
| 303 |
log.info(f"📝 分词结果:{tokens}")
|
| 304 |
|
| 305 |
# 2. 基于分词进行信息提取
|
| 306 |
-
|
| 307 |
|
| 308 |
# 提取目的地信息
|
| 309 |
destination_info = self._extract_destination_from_tokens(tokens)
|
| 310 |
if destination_info:
|
| 311 |
-
|
| 312 |
|
| 313 |
# 提取时长信息
|
| 314 |
duration_info = self._extract_duration_from_tokens(tokens)
|
| 315 |
if duration_info:
|
| 316 |
-
|
| 317 |
|
| 318 |
# 提取预算信息
|
| 319 |
budget_info = self._extract_budget_from_tokens(tokens)
|
| 320 |
if budget_info:
|
| 321 |
-
|
| 322 |
|
| 323 |
-
log.info(f"📊 分词提取结果: {
|
| 324 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
|
| 326 |
def _tokenize_message(self, text: str) -> list:
|
| 327 |
"""智能分词,支持中英文混合"""
|
|
|
|
| 3 |
from utils.logger import log
|
| 4 |
import jieba
|
| 5 |
from typing import List, Tuple
|
| 6 |
+
import copy
|
| 7 |
|
| 8 |
class InfoExtractor:
|
| 9 |
def __init__(self):
|
|
|
|
| 286 |
'weekend': 2, 'week': 7, 'month': 30, 'vacation': 7, 'holiday': 3
|
| 287 |
}
|
| 288 |
|
| 289 |
+
def extract(self, user_message: str,existing_info: dict = None) -> dict:
|
| 290 |
|
| 291 |
# 输入验证
|
| 292 |
if not user_message or not isinstance(user_message, str):
|
| 293 |
log.warning("⚠️ 收到无效的用户消息")
|
| 294 |
+
return existing_info or {}
|
| 295 |
|
| 296 |
if len(user_message.strip()) < 2:
|
| 297 |
log.warning("⚠️ 用户消息过短,跳过信息提取")
|
| 298 |
+
return existing_info or {}
|
| 299 |
|
| 300 |
+
if existing_info:
|
| 301 |
+
log.info(f"接收到上下文信息,将在此基础上更新: {existing_info}")
|
| 302 |
+
result = copy.deepcopy(existing_info)
|
| 303 |
+
else:
|
| 304 |
+
result = {}
|
| 305 |
+
|
| 306 |
log.info(f"🛠️ 使用分词策略提取信息:'{user_message[:50]}...'")
|
| 307 |
|
| 308 |
# 1. 智能分词
|
|
|
|
| 310 |
log.info(f"📝 分词结果:{tokens}")
|
| 311 |
|
| 312 |
# 2. 基于分词进行信息提取
|
| 313 |
+
newly_extracted_info = {}
|
| 314 |
|
| 315 |
# 提取目的地信息
|
| 316 |
destination_info = self._extract_destination_from_tokens(tokens)
|
| 317 |
if destination_info:
|
| 318 |
+
newly_extracted_info["destination"] = destination_info
|
| 319 |
|
| 320 |
# 提取时长信息
|
| 321 |
duration_info = self._extract_duration_from_tokens(tokens)
|
| 322 |
if duration_info:
|
| 323 |
+
newly_extracted_info["duration"] = duration_info
|
| 324 |
|
| 325 |
# 提取预算信息
|
| 326 |
budget_info = self._extract_budget_from_tokens(tokens)
|
| 327 |
if budget_info:
|
| 328 |
+
newly_extracted_info["budget"] = budget_info
|
| 329 |
|
| 330 |
+
log.info(f"📊 分词提取结果: {newly_extracted_info}")
|
| 331 |
+
return newly_extracted_info
|
| 332 |
+
|
| 333 |
+
def _merge_info(self, new_info: dict, existing_info: dict) -> dict:
|
| 334 |
+
|
| 335 |
+
for key, value in new_info.items():
|
| 336 |
+
# 如果新旧信息中同一个键的值都是字典,则递归深入合并
|
| 337 |
+
if isinstance(value, dict) and key in existing_info and isinstance(existing_info[key], dict):
|
| 338 |
+
self._merge_info(value, existing_info[key])
|
| 339 |
+
else:
|
| 340 |
+
# 否则,直接用新信息覆盖或添加
|
| 341 |
+
existing_info[key] = value
|
| 342 |
+
return existing_info
|
| 343 |
|
| 344 |
def _tokenize_message(self, text: str) -> list:
|
| 345 |
"""智能分词,支持中英文混合"""
|