Spaces:

Eliot0110
/

Travel_Assistant

Sleeping

App Files Files Community

Eliot0110 committed on Aug 5

Commit

240c11f

1 Parent(s): ae5cfb9

improve: re

Browse files

Files changed (1) hide show

modules/info_extractor.py +63 -26

modules/info_extractor.py CHANGED Viewed

@@ -1,43 +1,72 @@
 import json
 from utils.logger import log
 from .ai_model import AIModel
 class InfoExtractor:
     def __init__(self, ai_model):
         self.ai_model = ai_model
         self.prompt_template = self._build_prompt_template()
     def _build_prompt_template(self) -> str:
-        return """你是一个专业的旅游信息提取AI。
-你的任务是仔细阅读用户的请求，并从中提取出关键的旅行信息。
-请严格按照以下嵌套的JSON格式返回
----
-**重要规则**
-1. 如果某个信息在用户请求中没有明确提及，请将对应的值设为 null。
-2. **如果用户的请求只是简单的问候 (例如 "hi", "你好")，或者完全不包含任何目的地、时间、预算等旅行信息，请必须返回一个空的JSON对象，即 `{{}}`。**
----
 {{
   "destination": {{
-    "name": "提取出的目的地名称"
   }},
   "duration": {{
-    "days": "提取出的天数 (必须是整数)"
   }},
   "budget": {{
-    "type": "提取出的预算类型 (从 'economy', 'comfortable', 'luxury' 中选择一个)",
-    "amount": "提取出的具体预算金额 (必须是数字)",
-    "currency": "提取出的货币单位 (例如 'EUR', 'USD', 'CNY')"
   }}
 }}
-用户的输入是：
 ---
 {user_message}
----
 """
     def extract(self, message: str) -> dict:
@@ -50,29 +79,37 @@ class InfoExtractor:
         prompt = self.prompt_template.format(user_message=message)
         # 2. 调用AI模型生成结果
-        # 注意：这里假设你的ai_model有一个 .generate() 方法
         raw_response = self.ai_model.generate(prompt)
         if not raw_response:
             log.error("❌ LLM模型没有返回任何内容。")
             return {}
-        # 3. 解析LLM返回的JSON字符串
         try:
-            # 清理可能的Markdown代码块标记
-            clean_response = raw_response.strip().replace('```json', '').replace('```', '')
-            extracted_data = json.loads(clean_response)
             log.info(f"✅ LLM成功提取并解析JSON: {extracted_data}")
-        except json.JSONDecodeError:
-            log.error(f"❌ 无法解析LLM返回的JSON: '{raw_response}'")
-            # 在这里可以尝试用正则等方式做最后的补救，但暂时从简
             return {}
         # 4. 清理和格式化提取出的数据
-        # 移除值为null的顶级键
         final_info = {
             key: value for key, value in extracted_data.items() if value and any(v is not None for v in value.values())
         }
-        log.info(f"📋 LLM最终提取结果: {list(final_info.keys())}")
         return final_info

 import json
+import re  # 导入正则表达式模块
 from utils.logger import log
 from .ai_model import AIModel
 class InfoExtractor:
     def __init__(self, ai_model):
         self.ai_model = ai_model
         self.prompt_template = self._build_prompt_template()
     def _build_prompt_template(self) -> str:
+        # --- 重点更新：使用更强大、更明确的Prompt ---
+        return """你是一个专门用于从文本中提取结构化旅行信息的AI助理。
+你的唯一任务是分析用户提供的文本，并严格按照指定的JSON格式输出提取的信息。
+**输出要求:**
+1.  **严格的JSON格式**: 输出必须是一个单一、完整、有效的JSON对象。
+2.  **禁止任何额外文本**: 不要在JSON对象前后添加任何解释、注释、Markdown标记或任何其他文字。
+3.  **遵循指定结构**: JSON的键和层级结构必须与下方定义的格式完全一致。
+4.  **处理缺失信息**: 如果用户输入中没有提到某个字段，请将该字段的值设为 null。
+5.  **处理无关输入**: 如果用户输入是简单的问候或与旅行无关，请返回一个空的JSON对象 `{{}}`。
+**JSON输出格式定义:**
+```json
+{{
+  "destination": {{
+    "name": "string or null"
+  }},
+  "duration": {{
+    "days": "integer or null"
+  }},
+  "budget": {{
+    "type": "string ('economy', 'comfortable', 'luxury') or null",
+    "amount": "number or null",
+    "currency": "string or null"
+  }}
+}}
+```
+**示例:**
+用户输入: "我想去巴黎玩一个星期，预算大概是经济型的"
+你的输出:
+```json
 {{
   "destination": {{
+    "name": "巴黎"
   }},
   "duration": {{
+    "days": 7
   }},
   "budget": {{
+    "type": "economy",
+    "amount": null,
+    "currency": null
   }}
 }}
+```
 ---
+现在，请处理以下用户输入。
+**用户输入:**
+```
 {user_message}
+```
+**你的输出:**
 """
     def extract(self, message: str) -> dict:
         prompt = self.prompt_template.format(user_message=message)
         # 2. 调用AI模型生成结果
         raw_response = self.ai_model.generate(prompt)
         if not raw_response:
             log.error("❌ LLM模型没有返回任何内容。")
             return {}
+        # --- 重点更新：使用更稳健的JSON解析逻辑 ---
         try:
+            # 优先使用正则表达式从 ```json ... ``` 代码块中提取
+            match = re.search(r'```json\s*(\{.*?\})\s*```', raw_response, re.DOTALL)
+            if match:
+                json_str = match.group(1)
+            else:
+                # 如果正则没匹配到，就粗暴地寻找第一个'{'和最后一个'}'
+                start_index = raw_response.find('{')
+                end_index = raw_response.rfind('}')
+                if start_index != -1 and end_index != -1 and end_index > start_index:
+                    json_str = raw_response[start_index:end_index + 1]
+                else:
+                    raise json.JSONDecodeError("在LLM的返回中未找到有效的JSON对象。", raw_response, 0)
+            extracted_data = json.loads(json_str)
             log.info(f"✅ LLM成功提取并解析JSON: {extracted_data}")
+        except json.JSONDecodeError as e:
+            log.error(f"❌ 无法解析LLM返回的JSON: '{raw_response}'. 错误: {e}")
             return {}
         # 4. 清理和格式化提取出的数据
         final_info = {
             key: value for key, value in extracted_data.items() if value and any(v is not None for v in value.values())
         }
+        log.info(f"📊 LLM最终提取结果: {list(final_info.keys())}")
         return final_info