Spaces:

Eliot0110
/

Travel_Assistant

Sleeping

App Files Files Community

Eliot0110 committed on Aug 6

Commit

3ecb35b

1 Parent(s): 14f6b72

improve: tokenizer

Browse files

Files changed (1) hide show

modules/info_extractor.py +546 -432

modules/info_extractor.py CHANGED Viewed

@@ -1,11 +1,15 @@
 import json
 import re
 from utils.logger import log
 class InfoExtractor:
     def __init__(self):
-        # 预定义的提取结构，用于验证和规范化
         self.extraction_schema = {
             "destination": {"type": dict, "fields": {"name": str, "country": str}},
             "duration": {"type": dict, "fields": {"days": int, "description": str}},
@@ -267,21 +271,25 @@ class InfoExtractor:
             "翡翠岛": "都柏林", "欧洲之都": "布鲁塞尔", "高迪之城": "巴塞罗那",
         }
-        # 中文数字映射（保持原有）
         self.chinese_numbers = {
             '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
             '两': 2, '半': 0.5, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
             # 特殊时长表达
             '半个月': 15, '一个月': 30, '半年': 180, '一年': 365,
             '半天': 0.5, '一天': 1, '两天': 2, '三天': 3, '四天': 4, '五天': 5, '六天': 6, '七天': 7,
             '八天': 8, '九天': 9, '十天': 10, '半周': 3.5, '一周': 7, '两周': 14,
             # 假期相关
             '小长假': 3, '长假': 7, '十一': 7, '国庆': 7, '春节': 7, '五一': 3, '清明': 3,
-            '端午': 3, '中秋': 3, '元旦': 3, '暑假': 60, '寒假': 30, '周末': 2, '长周末': 3
         }
-    def extract(self, user_message: str) -> dict:
-        """使用纯正则表达式提取结构化信息 - 聚焦欧洲"""
         # 输入验证
         if not user_message or not isinstance(user_message, str):
@@ -292,525 +300,631 @@ class InfoExtractor:
             log.warning("⚠️ 用户消息过短，跳过信息提取")
             return {}
-        log.info("🛠️ 使用正则表达式提取信息（聚焦欧洲）")
         result = {}
-        # 1. 提取目的地信息
-        destination_info = self._extract_european_destination(user_message)
         if destination_info:
             result["destination"] = destination_info
-        # 2. 提取时长信息
-        duration_info = self._extract_duration(user_message)
         if duration_info:
             result["duration"] = duration_info
-        # 3. 提取预算信息
-        budget_info = self._extract_budget(user_message)
         if budget_info:
             result["budget"] = budget_info
-        log.info(f"📊 欧洲城市正则提取结果: {result}")
         return result
-    def _extract_european_destination(self, text: str) -> dict:
-        """提取欧洲目的地信息 - 专门针对欧洲城市"""
-        result = {}
-        # 目的地提取模式（复用之前的完整模式）
-        destination_patterns = [
-            # 基本动词 + 地点
-            r'去(\w+)', r'到(\w+)', r'想去(\w+)', r'前往(\w+)', r'计划去(\w+)', r'打算去(\w+)',
-            r'准备去(\w+)', r'希望去(\w+)', r'考虑去(\w+)', r'决定去(\w+)', r'选择去(\w+)',
-            r'旅行(\w+)', r'游(\w+)', r'玩(\w+)', r'访问(\w+)', r'探索(\w+)', r'体验(\w+)',
-            r'出发去(\w+)', r'飞去(\w+)', r'飞往(\w+)', r'飞到(\w+)', r'坐车去(\w+)', r'开车去(\w+)',
-            # 目的地关键词
-            r'目的地[\s是：:]*(\w+)', r'地方[\s是：:]*(\w+)', r'城市[\s是：:]*(\w+)',
-            r'国家[\s是：:]*(\w+)', r'地区[\s是：:]*(\w+)', r'景点[\s是：:]*(\w+)',
-            # 在某地表达
-            r'在(\w+)旅游', r'在(\w+)游玩', r'在(\w+)度假', r'在(\w+)旅行', r'在(\w+)玩',
-            r'在(\w+)观光', r'在(\w+)游览', r'在(\w+)休假', r'在(\w+)放松', r'在(\w+)散心',
-            # 某地 + 行程/之旅
-            r'(\w+)之旅', r'(\w+)行程', r'(\w+)旅程', r'(\w+)游', r'(\w+)行', r'(\w+)之行',
-            r'(\w+)深度游', r'(\w+)自由行', r'(\w+)跟团游', r'(\w+)自驾游', r'(\w+)蜜月游',
-            # 包含"的"的表达
-            r'(\w+)的旅行', r'(\w+)的行程', r'(\w+)的攻略', r'(\w+)的景点', r'(\w+)的美食',
-            r'(\w+)的文化', r'(\w+)的历史', r'(\w+)的风景', r'(\w+)的特色', r'(\w+)的魅力',
-            # 特殊交通方式表达
-            r'飞(\w+)', r'坐船去(\w+)', r'坐火车去(\w+)', r'自驾去(\w+)', r'徒步去(\w+)',
-            r'骑行去(\w+)', r'背包去(\w+)', r'穷游去(\w+)',
-            # 旅行类型 + 地点
-            r'自由行(\w+)', r'跟团(\w+)', r'自驾(\w+)', r'蜜月(\w+)', r'毕业(\w+)',
-            r'亲子(\w+)', r'家庭(\w+)', r'情侣(\w+)', r'闺蜜(\w+)', r'独自(\w+)',
-            # 度假/休闲表达
-            r'度假去(\w+)', r'休闲去(\w+)', r'放松去(\w+)', r'散心去(\w+)', r'疗养去(\w+)',
-            # 其他变体
-            r'想要去(\w+)', r'渴望去(\w+)', r'梦想去(\w+)', r'向往(\w+)', r'憧憬(\w+)',
-            r'安排去(\w+)', r'规划去(\w+)', r'预定(\w+)', r'订(\w+)的票', r'买(\w+)机票'
         ]
-        # 尝试所有模式
-        for pattern in destination_patterns:
-            matches = re.findall(pattern, text)
-            for match in matches:
-                city_name = match.strip()
-                # 首先检查别名映射
-                if city_name.lower() in self.european_city_aliases:
-                    city_name = self.european_city_aliases[city_name.lower()]
-                elif city_name in self.european_city_aliases:
-                    city_name = self.european_city_aliases[city_name]
-                # 验证是否为欧洲城市
-                if self._is_valid_european_city(city_name):
-                    result["name"] = city_name
-                    # 查找对应国家
-                    if city_name in self.european_cities:
-                        result["country"] = self.european_cities[city_name]
-                    break
-            if result:
-                break
-        # 特殊处理：国家+城市的组合（欧洲专用）
-        european_country_city_patterns = [
-            r'(\w+)的(\w+)', r'(\w+)(\w+)市', r'(\w+)(\w+)府',
-            r'(\w+)(\w+)州', r'(\w+)(\w+)省', r'(\w+)(\w+)岛'
-        ]
-        if not result:
-            for pattern in european_country_city_patterns:
-                matches = re.findall(pattern, text)
-                for country, city in matches:
-                    # 检查是否是已知的欧洲国家-城市组合
-                    if city in self.european_cities and self.european_cities[city] == country:
-                        result["name"] = city
-                        result["country"] = country
                         break
-                    elif self._is_valid_european_country(country) and self._is_valid_european_city(city):
-                        result["name"] = city
-                        result["country"] = country
                         break
-                if result:
-                    break
         return result
-    def _extract_duration(self, text: str) -> dict:
-        """提取时长信息 - 完整保留之前的实现"""
-        result = {}
-        # 天数提取模式 - 大幅扩展（保持原有完整实现）
-        day_patterns = [
-            # 基本数字+天
-            r'(\d+)天', r'(\d+)日', r'(\d+)号', r'(\d+)个天', r'(\d+)个日',
-            # 动词+天数
-            r'玩(\d+)天', r'住(\d+)天', r'呆(\d+)天', r'待(\d+)天', r'停留(\d+)天',
-            r'逗留(\d+)天', r'游(\d+)天', r'旅行(\d+)天', r'度假(\d+)天', r'休假(\d+)天',
-            # 行程相关
-            r'(\d+)天行程', r'(\d+)天旅程', r'(\d+)天旅行', r'(\d+)天游', r'(\d+)天之旅',
-            r'(\d+)天的行程', r'(\d+)天的旅程', r'(\d+)天的旅行', r'(\d+)天的假期',
-            r'行程(\d+)天', r'旅程(\d+)天', r'假期(\d+)天', r'休假(\d+)天',
-            # 时间修饰词
-            r'大概(\d+)天', r'约(\d+)天', r'差不多(\d+)天', r'左右(\d+)天', r'上下(\d+)天',
-            r'最多(\d+)天', r'最少(\d+)天', r'至少(\d+)天', r'不超过(\d+)天', r'超过(\d+)天',
-            r'将近(\d+)天', r'接近(\d+)天', r'快(\d+)天', r'足足(\d+)天', r'整整(\d+)天',
-            # 周相关
-            r'(\d+)周', r'(\d+)个周', r'(\d+)星期', r'(\d+)个星期', r'(\d+)礼拜', r'(\d+)个礼拜',
-            r'玩(\d+)周', r'住(\d+)周', r'呆(\d+)周', r'待(\d+)周', r'旅行(\d+)周',
-            r'(\d+)周的行程', r'(\d+)星期的旅行', r'(\d+)个礼拜的假期',
-            # 月相关
-            r'(\d+)月', r'(\d+)个月', r'(\d+)个月份',
-            r'玩(\d+)个月', r'住(\d+)个月', r'旅行(\d+)个月', r'度假(\d+)个月',
-            r'(\d+)个月的行程', r'(\d+)月的旅行', r'(\d+)个月的假期',
-            # 范围表达
-            r'(\d+)-(\d+)天', r'(\d+)到(\d+)天', r'(\d+)至(\d+)天', r'(\d+)~(\d+)天',
-            r'(\d+)天到(\d+)天', r'从(\d+)天到(\d+)天', r'介于(\d+)到(\d+)天',
-            # 中文数字
-            r'一天', r'二天', r'三天', r'四天', r'五天', r'六天', r'七天', r'八天', r'九天', r'十天',
-            r'两天', r'俩天', r'仨天', r'半天', r'一天半', r'两天半', r'三天半',
-            r'十一天', r'十二天', r'十三天', r'十四天', r'十五天', r'二十天', r'三十天',
-            # 特殊时长表达
-            r'周末', r'长周末', r'小长假', r'长假', r'黄金周', r'假期',
-            r'十一', r'国庆', r'春节', r'过年', r'五一', r'劳动节', r'清明', r'端午', r'中秋', r'元旦',
-            r'暑假', r'寒假', r'年假', r'蜜月', r'度蜜月',
-            r'短途', r'中途', r'长途', r'快闪', r'一日游', r'两日游', r'三日游', r'多日游'
-        ]
-        # 尝试提取时长（完整保留原有逻辑）
-        for pattern in day_patterns:
-            matches = re.findall(pattern, text)
-            for match in matches:
-                days = None
-                if isinstance(match, tuple):
-                    # 范围表达，取平均值
-                    try:
-                        start_days = int(match[0])
-                        end_days = int(match[1])
-                        days = (start_days + end_days) / 2
-                    except:
-                        days = int(match[0]) if match[0].isdigit() else None
-                elif match.isdigit():
-                    days = int(match)
-                    # 处理单位转换
-                    if '周' in pattern or '星期' in pattern or '礼拜' in pattern:
-                        days *= 7
-                    elif '月' in pattern:
-                        days *= 30
-                # 处理中文数字和特殊表达
-                elif match in self.chinese_numbers:
-                    days = self.chinese_numbers[match]
-                # 验证天数合理性
-                if days and 0.5 <= days <= 365:
-                    result["days"] = int(days) if days >= 1 else days
                     # 添加描述信息
                     if days <= 1:
-                        result["description"] = "当日往返"
                     elif days <= 3:
-                        result["description"] = "短途旅行"
                     elif days <= 7:
-                        result["description"] = "一周内旅行"
                     elif days <= 14:
-                        result["description"] = "中长途旅行"
                     elif days <= 30:
-                        result["description"] = "长途旅行"
                     else:
-                        result["description"] = "超长途旅行"
-                    # 保留原始匹配文本作为额外描述
-                    if not isinstance(match, tuple) and not match.isdigit():
-                        result["description"] = match
-                    break
-            if result:
                 break
         return result
-    def _extract_budget(self, text: str) -> dict:
-        """提取预算信息 - 针对欧洲旅行优化"""
         result = {}
-        text_lower = text.lower()
-        # 欧洲旅行常用货币的金额提取模式
-        amount_patterns = [
-            # === 欧元表达 - 优先级最高（欧洲旅行主要货币） ===
-            r'(\d+)欧元', r'(\d+)欧', r'€(\d+)', r'EUR(\d+)', r'eur(\d+)',
-            r'(\d+)euro', r'(\d+)Euro', r'(\d+)EURO',
-            r'(\d+\.?\d*)欧元', r'€(\d+\.?\d*)',
-            r'预算(\d+)欧', r'花费(\d+)欧', r'大概(\d+)欧', r'约(\d+)欧',
-            # === 人民币表达 ===
-            r'(\d+)元', r'(\d+)块', r'(\d+)块钱', r'(\d+)人民币', r'(\d+)rmb', r'(\d+)RMB',
-            r'¥(\d+)', r'￥(\d+)', r'CNY(\d+)', r'cny(\d+)',
-            # === 美元表达 ===
-            r'(\d+)美元', r'(\d+)美刀', r'(\d+)刀', r'\$(\d+)', r'USD(\d+)', r'usd(\d+)',
-            r'(\d+)dollar', r'(\d+)Dollar',
-            # === 英镑表达（英国旅行） ===
-            r'(\d+)英镑', r'(\d+)镑', r'£(\d+)', r'GBP(\d+)', r'gbp(\d+)',
-            r'(\d+)pound', r'(\d+)Pound',
-            # === 瑞士法郎（瑞士旅行） ===
-            r'(\d+)瑞士法郎', r'(\d+)法郎', r'CHF(\d+)', r'chf(\d+)',
-            r'(\d+)瑞郎', r'(\d+)swiss franc',
-            # === 预算相关表达 ===
-            r'预算(\d+)', r'预算是(\d+)', r'预算大概(\d+)', r'预算约(\d+)',
-            r'预算差不多(\d+)', r'预算在(\d+)', r'预算控制在(\d+)',
-            r'预算不超过(\d+)', r'预算最多(\d+)', r'预算最少(\d+)',
-            # === 花费相关表达 ===
-            r'花(\d+)', r'花费(\d+)', r'花销(\d+)', r'开销(\d+)', r'支出(\d+)',
-            r'费用(\d+)', r'成本(\d+)', r'总共(\d+)', r'一共(\d+)', r'总计(\d+)',
-            # === 万元表达 ===
-            r'(\d+)万', r'(\d+)万元', r'(\d+)万块', r'(\d+)万人民币',
-            r'(\d+)万欧', r'(\d+)万欧元', r'(\d+)万美元', r'(\d+)万英镑',
-            r'(\d+\.?\d*)万', r'(\d+\.?\d*)万元',
-            # === 千元表达 ===
-            r'(\d+)千', r'(\d+)千元', r'(\d+)千块', r'(\d+)k', r'(\d+)K',
-            r'(\d+)千欧', r'(\d+)千美元', r'(\d+)千英镑',
-            # === 范围表达 ===
-            r'(\d+)-(\d+)', r'(\d+)到(\d+)', r'(\d+)至(\d+)', r'(\d+)~(\d+)',
-            r'(\d+)左右', r'约(\d+)', r'差不多(\d+)', r'大概(\d+)',
-            # === 每人/每天相关 ===
-            r'每人(\d+)', r'人均(\d+)', r'单人(\d+)', r'每天(\d+)', r'日均(\d+)',
-            # === 中文数字金额 ===
-            r'一万', r'两万', r'三万', r'四万', r'五万', r'六万', r'七万', r'八万', r'九万', r'十万',
-            r'一千', r'两千', r'三千', r'四千', r'五千', r'六千', r'七千', r'八千', r'九千'
-        ]
-        # 中文数字金额映射
-        chinese_money = {
-            '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000,
-            '六万': 60000, '七万': 70000, '八万': 80000, '九万': 90000, '十万': 100000,
-            '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
-            '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000
-        }
-        # 尝试提取金额
-        for pattern in amount_patterns:
-            matches = re.findall(pattern, text)
-            for match in matches:
-                amount = None
-                currency = "RMB"  # 默认货币
-                if isinstance(match, tuple):
-                    # 处理范围或多个捕获组
-                    if len(match) == 2 and all(m.replace('.','').isdigit() for m in match if m):
-                        try:
-                            amount = (float(match[0]) + float(match[1])) / 2
-                        except:
-                            amount = float(match[0]) if match[0].replace('.','').isdigit() else float(match[1])
-                    else:
-                        for m in match:
-                            if m and m.replace('.','').isdigit():
-                                amount = float(m)
-                                break
-                else:
-                    if match in chinese_money:
-                        amount = chinese_money[match]
-                    elif match.replace('.','').isdigit():
-                        amount = float(match)
-                if amount and amount > 0:
-                    # 处理单位转换
-                    if '万' in pattern:
                         amount *= 10000
-                    elif '千' in pattern or 'k' in pattern.lower():
                         amount *= 1000
                     result["amount"] = int(amount)
-                    # 确定货币类型（针对欧洲旅行优化）
-                    if any(keyword in pattern for keyword in ['欧元', '欧', '€', 'eur', 'euro']):
-                        result["currency"] = "EUR"
-                    elif any(keyword in pattern for keyword in ['英镑', '镑', '£', 'gbp', 'pound']):
-                        result["currency"] = "GBP"
-                    elif any(keyword in pattern for keyword in ['瑞士法郎', '法郎', '瑞郎', 'chf', 'swiss franc']):
-                        result["currency"] = "CHF"
-                    elif any(keyword in pattern for keyword in ['美元', '美刀', '刀',  'usd', 'dollar']):
-                        result["currency"] = "USD"
-                    else:
-                        result["currency"] = "RMB"
                     break
-            if result.get("amount"):
-                break
-        # 预算类型识别 - 针对欧洲旅行优化
         budget_type_keywords = {
             'economy': [
-                # 经济相关
                 '经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '青年',
                 '预算有限', '钱不多', '不贵', '划算', '性价比', '背包客',
-                '简单', '基础', '低成本', '节约', '省着花', '紧巴巴',
-                # 欧洲特色经济住宿
-                '青年旅社', '青旅', 'hostel', '民宿', 'airbnb', '客栈',
-                '多人间', '床位', '宿舍', '胶囊', 'capsule',
-                # 欧洲经济交通
-                '大巴', '长途汽车', 'flixbus', '火车', '二等座', '经济舱',
-                '欧洲通票', '青年票', '学生票', '团体票',
-                # 经济餐饮
-                '自己做饭', '超市', '便利店', '快餐', '街头小吃', '外卖',
-                '麦当劳', '汉堡王', 'kebab', 'döner'
             ],
             'comfortable': [
-                # 舒适相关
                 '舒适', '中等', '适中', '一般', '标准', '普通', '正常', '常规',
-                '中档', '中级', '合理', '平均', '中间档次', '不高不低',
-                # 欧洲中档住宿
-                '三星', '四星', '酒店', 'hotel', '标间', '双人间', '大床房',
-                '民宿', 'apartment', '公寓', 'b&b', 'pension',
-                # 欧洲舒适交通
-                '火车', '一等座', '高铁', 'tgv', 'ice', '城际列车',
-                '租车', '自驾', '商务舱', '直飞',
-                # 中档餐饮
-                '餐厅', '当地菜', '特色菜', '中档餐厅', '酒吧', 'bistro'
             ],
             'luxury': [
-                # 奢华相关
-                '豪华', '奢华', '高端', '顶级', '精品', '奢侈', '贵族',
-                '贵一点', '不差钱', '任性', '土豪', '有钱', '不在乎钱',
-                '高消费', '享受', '奢享', '尊贵', '至尊', 'VIP',
-                # 欧洲豪华住宿
-                '五星', '六星', '豪华酒店', 'luxury hotel', '度假村', 'resort',
-                '别墅', 'villa', '城堡', 'castle', '套房', 'suite', '总统套房',
-                '丽思卡尔顿', '四季', '文华东方', '半岛', '香格里拉', '希尔顿',
-                'ritz carlton', 'four seasons', 'mandarin oriental', 'peninsula',
-                # 豪华交通
-                '头等舱', '商务舱', '私人飞机', 'private jet', '豪车', '奔驰', '宝马',
-                '奥迪', '保时捷', '法拉利', '兰博基尼', 'mercedes', 'bmw', 'audi',
-                # 奢华服务
-                '私人导游', '管家服务', 'concierge', '司机', '专车', '包车',
-                '定制旅行', '私人订制', '一对一服务', 'vip通道',
-                # 高端餐饮
-                '米其林', 'michelin', '米其林三星', '米其林餐厅', '高档餐厅',
-                '法式大餐', '意式料理', '分子料理', '酒庄', 'wine tasting'
             ]
         }
-        # 识别预算类型
-        for budget_type, keywords in budget_type_keywords.items():
-            matched_keywords = [kw for kw in keywords if kw in text_lower]
-            if matched_keywords:
-                result["type"] = budget_type
-                result["description"] = matched_keywords[0]
                 break
-        # 如果有金额但没有类型，根据金额和货币推断类型（欧洲标准）
         if result.get("amount") and not result.get("type"):
             amount = result["amount"]
             currency = result.get("currency", "RMB")
             # 根据欧洲旅行成本设置阈值
             if currency == "EUR":
-                if amount < 50:  # 每天50欧以下
                     result["type"] = "economy"
                     result["description"] = "经济预算"
-                elif amount < 150:  # 每天50-150欧
                     result["type"] = "comfortable"
                     result["description"] = "舒适预算"
-                else:  # 每天150欧以上
-                    result["type"] = "luxury"
-                    result["description"] = "豪华预算"
-            elif currency == "GBP":
-                if amount < 40:  # 每天40英镑以下
-                    result["type"] = "economy"
-                    result["description"] = "经济预算"
-                elif amount < 120:  # 每天40-120英镑
-                    result["type"] = "comfortable"
-                    result["description"] = "舒适预算"
-                else:  # 每天120英镑以上
                     result["type"] = "luxury"
                     result["description"] = "豪华预算"
-            elif currency == "CHF":
-                if amount < 60:  # 每天60瑞郎以下
                     result["type"] = "economy"
                     result["description"] = "经济预算"
-                elif amount < 180:  # 每天60-180瑞郎
                     result["type"] = "comfortable"
                     result["description"] = "舒适预算"
-                else:  # 每天180瑞郎以上
                     result["type"] = "luxury"
                     result["description"] = "豪华预算"
             elif currency == "RMB":
-                if amount < 300:  # 每天300元以下
-                    result["type"] = "economy"
-                    result["description"] = "经济预算"
-                elif amount < 800:  # 每天300-800元
-                    result["type"] = "comfortable"
-                    result["description"] = "舒适预算"
-                else:  # 每天800元以上
-                    result["type"] = "luxury"
-                    result["description"] = "豪华预算"
-            elif currency == "USD":
-                if amount < 60:  # 每天60美元以下
                     result["type"] = "economy"
                     result["description"] = "经济预算"
-                elif amount < 150:  # 每天60-150美元
                     result["type"] = "comfortable"
                     result["description"] = "舒适预算"
-                else:  # 每天150美元以上
                     result["type"] = "luxury"
                     result["description"] = "豪华预算"
-        return result
-    def _is_valid_european_city(self, name: str) -> bool:
-        """验证是否为有效的欧洲城市名称"""
-        if not name or len(name) < 1:
-            return False
-        # 排除数字和常见的非地名词汇
-        invalid_words = [
-            # 数字和时间
-            '天', '日', '号', '月', '年', '周', '小时', '分钟', '秒',
-            # 金钱相关
-            '元', '块', '钱', '万', '千', '百', '预算', '费用', '成本', '价格',
-            '美元', '欧元', '英镑', '瑞郎', '法郎',
-            # 旅行相关动词
-            '花', '费', '旅行', '旅游', '行程', '计划', '想', '去', '到', '的',
-            '在', '是', '个', '了', '和', '与', '或', '但', '而', '就', '都',
-            # 其他常见词
-            '人', '我', '你', '他', '她', '们', '这', '那', '什么', '怎么',
-            '好', '很', '非常', '特别', '大', '小', '新', '老'
-        ]
-        if name.isdigit() or name in invalid_words:
-            return False
-        # 检查是否包含数字（地名通常不包含数字）
-        if any(char.isdigit() for char in name):
-            return False
-        # 检查是否在欧洲城市列表中
-        if name in self.european_cities:
-            return True
-        # 检查是否在别名列表中
-        if name in self.european_city_aliases or name.lower() in self.european_city_aliases:
-            return True
-        # 城市名称长度检查
-        if len(name) > 15:
-            return False
-        # 检查是否包含特殊字符
-        if any(char in name for char in '!@#$%^&*()+={}[]|\\:";\'<>?,.`~'):
-            return False
-        return False  # 只接受明确在欧洲城市列表中的城市
-    def _is_valid_european_country(self, name: str) -> bool:
-        """验证是否为有效的欧洲国家名称"""
-        if not name or len(name) < 2:
-            return False
-        # 欧洲国家列表
-        european_countries = {
-            # 西欧
-            '法国', '德国', '英国', '荷兰', '比利时', '卢森堡',
-            # 南欧
-            '意大利', '西班牙', '葡萄牙', '希腊', '马耳他', '塞浦路斯',
-            # 中欧
-            '奥地利', '瑞士', '捷克', '斯洛伐克', '匈牙利', '波兰', '斯洛文尼亚',
-            # 北欧
-            '瑞典', '挪威', '丹麦', '芬兰', '冰岛',
-            # 东欧
-            '俄罗斯', '乌克兰', '白俄罗斯', '立陶宛', '拉脱维亚', '爱沙尼亚', '摩尔多瓦',
-            # 巴尔干半岛
-            '克罗地亚', '塞尔维亚', '波黑', '黑山', '北马其顿', '阿尔巴尼亚',
-            '保加利亚', '罗马尼亚', '土耳其'
         }
-        return name in european_countries
     # 保持向后兼容的验证方法
     def _validate_and_normalize(self, data: dict) -> dict:

 import json
 import re
 from utils.logger import log
+import jieba
+from typing import List, Tuple
 class InfoExtractor:
     def __init__(self):
+        self._init_tockenizer()
+        self._init_keyworkd_mapping()
         self.extraction_schema = {
             "destination": {"type": dict, "fields": {"name": str, "country": str}},
             "duration": {"type": dict, "fields": {"days": int, "description": str}},
             "翡翠岛": "都柏林", "欧洲之都": "布鲁塞尔", "高迪之城": "巴塞罗那",
         }
         self.chinese_numbers = {
             '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
             '两': 2, '半': 0.5, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
+            # 英文数字
+            'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
+            'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15,
             # 特殊时长表达
             '半个月': 15, '一个月': 30, '半年': 180, '一年': 365,
             '半天': 0.5, '一天': 1, '两天': 2, '三天': 3, '四天': 4, '五天': 5, '六天': 6, '七天': 7,
             '八天': 8, '九天': 9, '十天': 10, '半周': 3.5, '一周': 7, '两周': 14,
             # 假期相关
             '小长假': 3, '长假': 7, '十一': 7, '国庆': 7, '春节': 7, '五一': 3, '清明': 3,
+            '端午': 3, '中秋': 3, '元旦': 3, '暑假': 60, '寒假': 30, '周末': 2, '长周末': 3,
+            # 英文假期
+            'weekend': 2, 'week': 7, 'month': 30, 'vacation': 7, 'holiday': 3
         }
+        def extract(self, user_message: str) -> dict:
+        """使用分词策略进行信息提取"""
         # 输入验证
         if not user_message or not isinstance(user_message, str):
             log.warning("⚠️ 用户消息过短，跳过信息提取")
             return {}
+        log.info(f"🛠️ 使用分词策略提取信息：'{user_message[:50]}...'")
+        # 1. 智能分词
+        tokens = self._tokenize_message(user_message)
+        log.info(f"📝 分词结果：{tokens}")
+        # 2. 基于分词进行信息提取
         result = {}
+        # 提取目的地信息
+        destination_info = self._extract_destination_from_tokens(tokens)
         if destination_info:
             result["destination"] = destination_info
+        # 提取时长信息
+        duration_info = self._extract_duration_from_tokens(tokens)
         if duration_info:
             result["duration"] = duration_info
+        # 提取预算信息
+        budget_info = self._extract_budget_from_tokens(tokens)
         if budget_info:
             result["budget"] = budget_info
+        log.info(f"📊 分词提取结果: {result}")
         return result
+    def _tokenize_message(self, text: str) -> list:
+        """智能分词，支持中英文混合"""
+        # 预处理：统一标点符号和空格
+        text = text.replace('，', ',').replace('。', '.').replace('！', '!').replace('？', '?')
+        text = text.replace('（', '(').replace('）', ')').replace('【', '[').replace('】', ']')
+        tokens = []
+        current_token = ""
+        i = 0
+        while i < len(text):
+            char = text[i]
+            # 处理空格和标点符号
+            if char in ' ,，.。!！?？()（）[]【】:：;；':
+                if current_token:
+                    tokens.append(current_token)
+                    current_token = ""
+                if char.strip():  # 保留非空格的标点符号
+                    tokens.append(char)
+                i += 1
+                continue
+            # 处理数字（包括小数和货币符号）
+            if char.isdigit() or char in '¥$€£₩':
+                if current_token and not (current_token[-1].isdigit() or current_token[-1] in '¥$€£₩.'):
+                    tokens.append(current_token)
+                    current_token = char
+                else:
+                    current_token += char
+                # 继续读取数字部分
+                i += 1
+                while i < len(text) and (text[i].isdigit() or text[i] in '.,'):
+                    current_token += text[i]
+                    i += 1
+                # 检查货币单位
+                currency_units = ['元', '块', '钱', '欧', '美元', '英镑', '日元', '韩元', '瑞郎', 'rmb', 'usd', 'eur', 'gbp', 'jpy', 'krw', 'chf']
+                remaining_text = text[i:].lower()
+                for unit in currency_units:
+                    if remaining_text.startswith(unit):
+                        current_token += text[i:i+len(unit)]
+                        i += len(unit)
+                        break
+                tokens.append(current_token)
+                current_token = ""
+                continue
+            # 处理英文单词
+            if char.isalpha() and ord(char) < 128:  # ASCII字符
+                if current_token and not current_token[-1].isalpha():
+                    tokens.append(current_token)
+                    current_token = char
+                else:
+                    current_token += char
+                # 继续读取英文字符
+                i += 1
+                while i < len(text) and text[i].isalpha() and ord(text[i]) < 128:
+                    current_token += text[i]
+                    i += 1
+                tokens.append(current_token)
+                current_token = ""
+                continue
+            # 处理中文字符
+            if self._is_chinese_char(char):
+                if current_token and not self._is_chinese_char(current_token[-1]):
+                    tokens.append(current_token)
+                    current_token = ""
+                # 对于中文，我们需要智能分词
+                # 检查是否是多字符城市名、时间表达等
+                remaining_text = text[i:]
+                # 尝试匹配城市名
+                matched_city = self._match_city_name(remaining_text)
+                if matched_city:
+                    tokens.append(matched_city)
+                    i += len(matched_city)
+                    continue
+                # 尝试匹配时间表达
+                matched_time = self._match_time_expression(remaining_text)
+                if matched_time:
+                    tokens.append(matched_time)
+                    i += len(matched_time)
+                    continue
+                # 尝试匹配预算类型关键词
+                matched_budget_type = self._match_budget_type(remaining_text)
+                if matched_budget_type:
+                    tokens.append(matched_budget_type)
+                    i += len(matched_budget_type)
+                    continue
+                # 尝试匹配常见词汇
+                matched_word = self._match_common_word(remaining_text)
+                if matched_word:
+                    tokens.append(matched_word)
+                    i += len(matched_word)
+                    continue
+                # 单个中文字符
+                tokens.append(char)
+                i += 1
+            else:
+                # 其他字符
+                current_token += char
+                i += 1
+        # 处理最后的token
+        if current_token:
+            tokens.append(current_token)
+        # 后处理：合并一些相关的tokens
+        tokens = self._post_process_tokens(tokens)
+        return [token for token in tokens if token.strip()]  # 过滤空token
+    def _is_chinese_char(self, char: str) -> bool:
+        """判断是否为中文字符"""
+        return '\u4e00' <= char <= '\u9fff'
+    def _match_city_name(self, text: str) -> str:
+        """匹配城市名称"""
+        # 按长度从长到短排序，优先匹配长的城市名
+        all_cities = list(self.european_cities.keys()) + list(self.european_city_aliases.keys())
+        all_cities = sorted(set(all_cities), key=len, reverse=True)
+        for city in all_cities:
+            if text.startswith(city):
+                return city
+        return ""
+    def _match_time_expression(self, text: str) -> str:
+        """匹配时间表达"""
+        time_expressions = [
+            # 多字符时间表达
+            '半个月', '一个月', '两个月', '三个月', '半年', '一年',
+            '小长假', '长周末', '国庆节', '春节假期', '暑假', '寒假',
+            '一天半', '两天半', '三天半', '一周半', '两周',
+            # 英文时间表达
+            'one day', 'two days', 'three days', 'one week', 'two weeks',
+            'long weekend', 'vacation', 'holiday', 'spring break'
+        ]
+        # 按长度排序，优先匹配长表达
+        time_expressions = sorted(time_expressions, key=len, reverse=True)
+        text_lower = text.lower()
+        for expr in time_expressions:
+            if text_lower.startswith(expr.lower()):
+                return expr
+            if text.startswith(expr):
+                return expr
+        return ""
+    def _match_budget_type(self, text: str) -> str:
+        """匹配预算类型关键词"""
+        budget_keywords = [
+            # 经济型
+            '经济实惠', '省钱', '便宜', '实惠', '经济', '穷游', '背包客',
+            '青年旅社', '学生', '预算有限', '性价比',
+            # 舒适型
+            '舒适', '中等', '适中', '标准', '普通', '中档', '合理',
+            # 豪华型
+            '豪华', '奢华', '高端', '顶级', '精品', '五星', '不差钱',
+            '任性', '土豪', 'VIP', '贵族', '皇家'
         ]
+        # 按长度排序
+        budget_keywords = sorted(budget_keywords, key=len, reverse=True)
+        for keyword in budget_keywords:
+            if text.startswith(keyword):
+                return keyword
+        return ""
+    def _match_common_word(self, text: str) -> str:
+        """匹配常见词汇"""
+        common_words = [
+            # 旅行相关动词
+            '想去', '计划去', '打算去', '准备去', '希望去', '考虑去',
+            '前往', '旅行', '旅游', '游玩', '度假', '出发', '飞往',
+            # 时间相关
+            '三天', '四天', '五天', '六天', '七天', '八天', '九天', '十天',
+            '一天', '两天', '几天', '多天', '数天',
+            # 预算相关
+            '预算', '花费', '费用', '成本', '开销', '支出', '消费',
+            '总共', '一共', '大概', '约', '左右', '差不多',
+            # 其他
+            '行程', '计划', '安排', '路线', '攻略'
+        ]
+        # 按长度排序
+        common_words = sorted(common_words, key=len, reverse=True)
+        for word in common_words:
+            if text.startswith(word):
+                return word
+        return ""
+    def _post_process_tokens(self, tokens: list) -> list:
+        """后处理tokens，合并相关的片段"""
+        if not tokens:
+            return tokens
+        processed = []
+        i = 0
+        while i < len(tokens):
+            current_token = tokens[i]
+            # 合并数字+单位的组合
+            if i < len(tokens) - 1:
+                next_token = tokens[i + 1]
+                # 数字 + 货币单位
+                if (current_token.isdigit() and
+                    next_token.lower() in ['元', '块', '钱', '欧', '美元', '英镑', '日元', 'rmb', 'usd', 'eur', 'gbp', 'jpy']):
+                    processed.append(current_token + next_token)
+                    i += 2
+                    continue
+                # 数字 + 时间单位
+                if (current_token.isdigit() and
+                    next_token in ['天', '日', '周', '月', '年', 'days', 'weeks', 'months']):
+                    processed.append(current_token + next_token)
+                    i += 2
+                    continue
+                # 预算 + 数字
+                if current_token == '预算' and next_token.replace('.', '').replace(',', '').isdigit():
+                    if i < len(tokens) - 2 and tokens[i + 2] in ['元', '块', '钱', '欧', 'rmb', 'usd', 'eur']:
+                        processed.append(current_token + next_token + tokens[i + 2])
+                        i += 3
+                        continue
+                    else:
+                        processed.append(current_token + next_token)
+                        i += 2
+                        continue
+            processed.append(current_token)
+            i += 1
+        return processed
+    def _extract_destination_from_tokens(self, tokens: list) -> dict:
+        """从tokens中提取目的地信息"""
+        result = {}
+        # 查找城市名
+        for i, token in enumerate(tokens):
+            # 直接匹配城市名
+            city_name = self._normalize_city_name(token)
+            if city_name:
+                result["name"] = city_name
+                if city_name in self.european_cities:
+                    result["country"] = self.european_cities[city_name]
+                break
+            # 检查是否在动词后面
+            if i > 0:
+                prev_token = tokens[i - 1]
+                if prev_token in ['去', '到', '想去', '前往', '旅行', '游', '玩', 'go', 'to', 'visit', 'travel']:
+                    city_name = self._normalize_city_name(token)
+                    if city_name:
+                        result["name"] = city_name
+                        if city_name in self.european_cities:
+                            result["country"] = self.european_cities[city_name]
                         break
+        # 如果没有找到，尝试fuzzy匹配
+        if not result:
+            for token in tokens:
+                if len(token) >= 2:
+                    # 模糊匹配城市名
+                    for city, country in self.european_cities.items():
+                        if token in city or city in token:
+                            if len(token) >= len(city) * 0.6:  # 相似度阈值
+                                result["name"] = city
+                                result["country"] = country
+                                break
+                    if result:
                         break
         return result
+    def _normalize_city_name(self, token: str) -> str:
+        """标准化城市名称"""
+        if not token:
+            return ""
+        token_lower = token.lower().strip()
+        # 直接匹配
+        if token in self.european_cities:
+            return token
+        # 别名匹配
+        if token_lower in self.european_city_aliases:
+            return self.european_city_aliases[token_lower]
+        if token in self.european_city_aliases:
+            return self.european_city_aliases[token]
+        return ""
+    def _extract_duration_from_tokens(self, tokens: list) -> dict:
+        """从tokens中提取时长信息"""
+        result = {}
+        for i, token in enumerate(tokens):
+            days = None
+            description = ""
+            # 处理 "数字+天" 的token
+            if re.match(r'^\d+[天日]$', token):
+                days = int(re.findall(r'\d+', token)[0])
+            # 处理 "数字+weeks/days" 的token
+            elif re.match(r'^\d+(days?|weeks?|months?)$', token.lower()):
+                number = int(re.findall(r'\d+', token)[0])
+                unit = re.findall(r'[a-zA-Z]+', token.lower())[0]
+                if unit.startswith('day'):
+                    days = number
+                elif unit.startswith('week'):
+                    days = number * 7
+                elif unit.startswith('month'):
+                    days = number * 30
+            # 处理分离的数字和单位
+            elif token.isdigit() and i < len(tokens) - 1:
+                next_token = tokens[i + 1]
+                number = int(token)
+                if next_token in ['天', '日']:
+                    days = number
+                elif next_token in ['周', '星期', '礼拜', 'week', 'weeks']:
+                    days = number * 7
+                elif next_token in ['月', '个月', 'month', 'months']:
+                    days = number * 30
+            # 处理中文数字
+            elif token in self.chinese_numbers:
+                days = self.chinese_numbers[token]
+                description = token
+            # 处理特殊时长表达
+            elif token in ['周末', 'weekend']:
+                days = 2
+                description = token
+            elif token in ['长周末', 'long weekend']:
+                days = 3
+                description = token
+            elif token in ['小长假', 'vacation', 'holiday']:
+                days = 3
+                description = token
+            elif token in ['十一', '国庆', 'national day']:
+                days = 7
+                description = token
+            elif token in ['春节', 'spring festival']:
+                days = 7
+                description = token
+            elif token in ['暑假', 'summer vacation']:
+                days = 60
+                description = token
+            elif token in ['寒假', 'winter vacation']:
+                days = 30
+                description = token
+            # 处理复合表达 "三天两夜"
+            elif re.match(r'^[一二三四五六七八九十\d]+天', token):
+                # 提取数字部分
+                for num_token in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']:
+                    if token.startswith(num_token):
+                        days = self.chinese_numbers[num_token]
+                        description = token
+                        break
+                if not days and token[0].isdigit():
+                    days = int(token[0])
+                    description = token
+            # 验证天数合理性并设置结果
+            if days and 0.5 <= days <= 365:
+                result["days"] = int(days) if days >= 1 else days
+                if not description:
                     # 添加描述信息
                     if days <= 1:
+                        description = "当日往返"
                     elif days <= 3:
+                        description = "短途旅行"
                     elif days <= 7:
+                        description = "一周内旅行"
                     elif days <= 14:
+                        description = "中长途旅行"
                     elif days <= 30:
+                        description = "长途旅行"
                     else:
+                        description = "超长途旅行"
+                result["description"] = description
                 break
         return result
+    def _extract_budget_from_tokens(self, tokens: list) -> dict:
+        """从tokens中提取预算信息"""
         result = {}
+        # 1. 查找金额
+        for i, token in enumerate(tokens):
+            amount = None
+            currency = "RMB"  # 默认货币
+            # 处理包含货币的token "2000欧", "5000元"
+            currency_patterns = [
+                (r'(\d+(?:\.\d+)?)欧(?:元)?', 'EUR'),
+                (r'(\d+(?:\.\d+)?)元', 'RMB'),
+                (r'(\d+(?:\.\d+)?)块(?:钱)?', 'RMB'),
+                (r'(\d+(?:\.\d+)?)人民币', 'RMB'),
+                (r'(\d+(?:\.\d+)?)美元', 'USD'),
+                (r'(\d+(?:\.\d+)?)英镑', 'GBP'),
+                (r'(\d+(?:\.\d+)?)瑞(?:士)?法郎', 'CHF'),
+                (r'(\d+(?:\.\d+)?)日元', 'JPY'),
+                (r'(\d+(?:\.\d+)?)韩元', 'KRW'),
+                (r'¥(\d+(?:\.\d+)?)', 'RMB'),
+                (r'€(\d+(?:\.\d+)?)', 'EUR'),
+                (r'\$(\d+(?:\.\d+)?)', 'USD'),
+                (r'£(\d+(?:\.\d+)?)', 'GBP'),
+                (r'(\d+(?:\.\d+)?)rmb', 'RMB'),
+                (r'(\d+(?:\.\d+)?)usd', 'USD'),
+                (r'(\d+(?:\.\d+)?)eur', 'EUR'),
+                (r'(\d+(?:\.\d+)?)gbp', 'GBP'),
+                (r'(\d+(?:\.\d+)?)chf', 'CHF'),
+            ]
+            for pattern, curr in currency_patterns:
+                match = re.search(pattern, token.lower())
+                if match:
+                    amount = float(match.group(1))
+                    currency = curr
+                    break
+            # 处理纯数字token（需要查看上下文）
+            if not amount and re.match(r'^\d+(?:\.\d+)?
+                , token):
+                number = float(token)
+                # 检查前面的token是否有预算相关词汇
+                budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', '总共', '一共', 'budget', 'cost', 'spend']
+                has_budget_context = False
+                if i > 0 and tokens[i-1] in budget_indicators:
+                    has_budget_context = True
+                elif i > 1 and tokens[i-2] in budget_indicators:
+                    has_budget_context = True
+                # 检查后面是否有货币单位
+                if i < len(tokens) - 1:
+                    next_token = tokens[i + 1].lower()
+                    currency_units = {
+                        '元': 'RMB', '块': 'RMB', '钱': 'RMB', '人民币': 'RMB',
+                        '欧': 'EUR', '欧元': 'EUR', '美元': 'USD', '英镑': 'GBP',
+                        '瑞郎': 'CHF', '日元': 'JPY', '韩元': 'KRW',
+                        'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF'
+                    }
+                    if next_token in currency_units:
+                        amount = number
+                        currency = currency_units[next_token]
+                        has_budget_context = True
+                # 如果有预算上下文但没有明确货币单位，根据数字大小推断
+                if has_budget_context and not amount:
+                    if number < 100:  # 可能是欧元或美元
+                        # 查看是否有欧洲城市上下文
+                        has_european_context = any(self._normalize_city_name(t) for t in tokens)
+                        if has_european_context:
+                            currency = 'EUR'
+                        else:
+                            currency = 'USD'
+                    else:
+                        currency = 'RMB'  # 大数字更可能是人民币
+                    amount = number
+            # 处理万、千等单位
+            if amount:
+                # 检查是否有万、千修饰符
+                if i > 0:
+                    prev_token = tokens[i-1]
+                    if '万' in prev_token or 'w' in prev_token.lower():
                         amount *= 10000
+                    elif '千' in prev_token or 'k' in prev_token.lower():
                         amount *= 1000
+                elif i < len(tokens) - 1:
+                    next_token = tokens[i+1]
+                    if '万' in next_token or 'w' in next_token.lower():
+                        amount *= 10000
+                    elif '千' in next_token or 'k' in next_token.lower():
+                        amount *= 1000
+                if amount > 0:
                     result["amount"] = int(amount)
+                    result["currency"] = currency
                     break
+        # 2. 查找预算类型
         budget_type_keywords = {
             'economy': [
                 '经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '青年',
                 '预算有限', '钱不多', '不贵', '划算', '性价比', '背包客',
+                '简单', '基础', '低成本', '节约', 'budget', 'cheap', 'economy', 'affordable'
             ],
             'comfortable': [
                 '舒适', '中等', '适中', '一般', '标准', '普通', '正常', '常规',
+                '中档', '中级', '合理', '平均', '中间档次', 'comfortable', 'standard', 'moderate'
             ],
             'luxury': [
+                '豪华', '奢华', '高端', '顶级', '精品', '奢侈', '贵族', '皇家',
+                '贵一点', '不差钱', '任性', '土豪', '有钱', '五星', 'VIP',
+                'luxury', 'premium', 'high-end', 'expensive', 'fancy'
             ]
         }
+        for token in tokens:
+            token_lower = token.lower()
+            for budget_type, keywords in budget_type_keywords.items():
+                if any(keyword in token_lower for keyword in keywords):
+                    result["type"] = budget_type
+                    # 找到第一个匹配的关键词作为描述
+                    for keyword in keywords:
+                        if keyword in token_lower:
+                            result["description"] = keyword if len(keyword) > 2 else token
+                            break
+                    break
+            if result.get("type"):
                 break
+        # 3. 如果有金额但没有类型，根据金额推断类型
         if result.get("amount") and not result.get("type"):
             amount = result["amount"]
             currency = result.get("currency", "RMB")
             # 根据欧洲旅行成本设置阈值
             if currency == "EUR":
+                if amount < 1500:  # 总预算
                     result["type"] = "economy"
                     result["description"] = "经济预算"
+                elif amount < 4000:
                     result["type"] = "comfortable"
                     result["description"] = "舒适预算"
+                else:
                     result["type"] = "luxury"
                     result["description"] = "豪华预算"
+            elif currency == "USD":
+                if amount < 2000:
                     result["type"] = "economy"
                     result["description"] = "经济预算"
+                elif amount < 5000:
                     result["type"] = "comfortable"
                     result["description"] = "舒适预算"
+                else:
                     result["type"] = "luxury"
                     result["description"] = "豪华预算"
             elif currency == "RMB":
+                if amount < 8000:
                     result["type"] = "economy"
                     result["description"] = "经济预算"
+                elif amount < 20000:
                     result["type"] = "comfortable"
                     result["description"] = "舒适预算"
+                else:
                     result["type"] = "luxury"
                     result["description"] = "豪华预算"
+        # 4. 处理中文数字金额
+        chinese_money_mapping = {
+            '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
+            '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
+            '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000
         }
+        if not result.get("amount"):
+            for token in tokens:
+                if token in chinese_money_mapping:
+                    result["amount"] = chinese_money_mapping[token]
+                    result["currency"] = "RMB"
+                    break
+        return result
     # 保持向后兼容的验证方法
     def _validate_and_normalize(self, data: dict) -> dict: