Eliot0110 committed on
Commit
3ecb35b
·
1 Parent(s): 14f6b72

improve: tokenizer

Browse files
Files changed (1) hide show
  1. modules/info_extractor.py +546 -432
modules/info_extractor.py CHANGED
@@ -1,11 +1,15 @@
1
  import json
2
  import re
3
  from utils.logger import log
 
 
4
 
5
  class InfoExtractor:
6
  def __init__(self):
7
 
8
- # 预定义的提取结构,用于验证和规范化
 
 
9
  self.extraction_schema = {
10
  "destination": {"type": dict, "fields": {"name": str, "country": str}},
11
  "duration": {"type": dict, "fields": {"days": int, "description": str}},
@@ -267,21 +271,25 @@ class InfoExtractor:
267
  "翡翠岛": "都柏林", "欧洲之都": "布鲁塞尔", "高迪之城": "巴塞罗那",
268
  }
269
 
270
- # 中文数字映射(保持原有)
271
  self.chinese_numbers = {
272
  '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
273
  '两': 2, '半': 0.5, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
 
 
 
274
  # 特殊时长表达
275
  '半个月': 15, '一个月': 30, '半年': 180, '一年': 365,
276
  '半天': 0.5, '一天': 1, '两天': 2, '三天': 3, '四天': 4, '五天': 5, '六天': 6, '七天': 7,
277
  '八天': 8, '九天': 9, '十天': 10, '半周': 3.5, '一周': 7, '两周': 14,
278
  # 假期相关
279
  '小长假': 3, '长假': 7, '十一': 7, '国庆': 7, '春节': 7, '五一': 3, '清明': 3,
280
- '端午': 3, '中秋': 3, '元旦': 3, '暑假': 60, '寒假': 30, '周末': 2, '长周末': 3
 
 
281
  }
282
 
283
- def extract(self, user_message: str) -> dict:
284
- """使用纯正则表达式提取结构化信息 - 聚焦欧洲"""
285
 
286
  # 输入验证
287
  if not user_message or not isinstance(user_message, str):
@@ -292,525 +300,631 @@ class InfoExtractor:
292
  log.warning("⚠️ 用户消息过短,跳过信息提取")
293
  return {}
294
 
295
- log.info("🛠️ 使用正则表达式提取信息(聚焦欧洲)")
 
 
 
 
296
 
 
297
  result = {}
298
 
299
- # 1. 提取目的地信息
300
- destination_info = self._extract_european_destination(user_message)
301
  if destination_info:
302
  result["destination"] = destination_info
303
 
304
- # 2. 提取时长信息
305
- duration_info = self._extract_duration(user_message)
306
  if duration_info:
307
  result["duration"] = duration_info
308
 
309
- # 3. 提取预算信息
310
- budget_info = self._extract_budget(user_message)
311
  if budget_info:
312
  result["budget"] = budget_info
313
 
314
- log.info(f"📊 欧洲城市正则提取结果: {result}")
315
  return result
316
 
317
- def _extract_european_destination(self, text: str) -> dict:
318
- """提取欧洲目的地信息 - 专门针对欧洲城市"""
319
- result = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
- # 目的地提取模式(复用之前的完整模式)
322
- destination_patterns = [
323
- # 基本动词 + 地点
324
- r'去(\w+)', r'到(\w+)', r'想去(\w+)', r'前往(\w+)', r'计划去(\w+)', r'打算去(\w+)',
325
- r'准备去(\w+)', r'希望去(\w+)', r'考虑去(\w+)', r'决定去(\w+)', r'选择去(\w+)',
326
- r'旅行(\w+)', r'游(\w+)', r'玩(\w+)', r'访问(\w+)', r'探索(\w+)', r'体验(\w+)',
327
- r'出发去(\w+)', r'飞去(\w+)', r'飞往(\w+)', r'飞到(\w+)', r'坐车去(\w+)', r'开车去(\w+)',
328
-
329
- # 目的地关键词
330
- r'目的地[\s是::]*(\w+)', r'地方[\s是::]*(\w+)', r'城市[\s是::]*(\w+)',
331
- r'国家[\s是::]*(\w+)', r'地区[\s是::]*(\w+)', r'景点[\s是::]*(\w+)',
332
-
333
- # 在某地表达
334
- r'在(\w+)旅游', r'在(\w+)游玩', r'在(\w+)度假', r'在(\w+)旅行', r'在(\w+)玩',
335
- r'在(\w+)观光', r'在(\w+)游览', r'在(\w+)休假', r'在(\w+)放松', r'在(\w+)散心',
336
-
337
- # 某地 + 行程/之旅
338
- r'(\w+)之旅', r'(\w+)行程', r'(\w+)旅程', r'(\w+)游', r'(\w+)行', r'(\w+)之行',
339
- r'(\w+)深度游', r'(\w+)自由行', r'(\w+)跟团游', r'(\w+)自驾游', r'(\w+)蜜月游',
340
-
341
- # 包含"的"的表达
342
- r'(\w+)的旅行', r'(\w+)的行程', r'(\w+)的攻略', r'(\w+)的景点', r'(\w+)的美食',
343
- r'(\w+)的文化', r'(\w+)的历史', r'(\w+)的风景', r'(\w+)的特色', r'(\w+)的魅力',
344
-
345
- # 特殊交通方式表达
346
- r'飞(\w+)', r'坐船去(\w+)', r'坐火车去(\w+)', r'自驾去(\w+)', r'徒步去(\w+)',
347
- r'骑行去(\w+)', r'背包去(\w+)', r'穷游去(\w+)',
348
-
349
- # 旅行类型 + 地点
350
- r'自由行(\w+)', r'跟团(\w+)', r'自驾(\w+)', r'蜜月(\w+)', r'毕业(\w+)',
351
- r'亲子(\w+)', r'家庭(\w+)', r'情侣(\w+)', r'闺蜜(\w+)', r'独自(\w+)',
352
-
353
- # 度假/休闲表达
354
- r'度假去(\w+)', r'休闲去(\w+)', r'放松去(\w+)', r'散心去(\w+)', r'疗养去(\w+)',
355
-
356
- # 其他变体
357
- r'想要去(\w+)', r'渴望去(\w+)', r'梦想去(\w+)', r'向往(\w+)', r'憧憬(\w+)',
358
- r'安排去(\w+)', r'规划去(\w+)', r'预定(\w+)', r'订(\w+)的票', r'买(\w+)机票'
359
  ]
360
 
361
- # 尝试所有模式
362
- for pattern in destination_patterns:
363
- matches = re.findall(pattern, text)
364
- for match in matches:
365
- city_name = match.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
- # 首先检查别名映射
368
- if city_name.lower() in self.european_city_aliases:
369
- city_name = self.european_city_aliases[city_name.lower()]
370
- elif city_name in self.european_city_aliases:
371
- city_name = self.european_city_aliases[city_name]
 
372
 
373
- # 验证是否为欧洲城市
374
- if self._is_valid_european_city(city_name):
375
- result["name"] = city_name
376
- # 查找对应国家
377
- if city_name in self.european_cities:
378
- result["country"] = self.european_cities[city_name]
379
- break
380
- if result:
381
- break
 
 
 
 
 
 
 
 
 
 
 
382
 
383
- # 特殊处理:国家+城市的组合(欧洲专用)
384
- european_country_city_patterns = [
385
- r'(\w+)的(\w+)', r'(\w+)(\w+)市', r'(\w+)(\w+)府',
386
- r'(\w+)(\w+)州', r'(\w+)(\w+)省', r'(\w+)(\w+)岛'
387
- ]
388
 
389
- if not result:
390
- for pattern in european_country_city_patterns:
391
- matches = re.findall(pattern, text)
392
- for country, city in matches:
393
- # 检查是否是已知的欧洲国家-城市组合
394
- if city in self.european_cities and self.european_cities[city] == country:
395
- result["name"] = city
396
- result["country"] = country
 
 
 
 
 
 
 
 
 
 
 
397
  break
398
- elif self._is_valid_european_country(country) and self._is_valid_european_city(city):
399
- result["name"] = city
400
- result["country"] = country
 
 
 
 
 
 
 
 
 
 
401
  break
402
- if result:
403
- break
404
 
405
  return result
406
 
407
- def _extract_duration(self, text: str) -> dict:
408
- """提取时长信息 - 完整保留之前的实现"""
409
- result = {}
 
410
 
411
- # 天数提取模式 - 大幅扩展(保持原有完整实现)
412
- day_patterns = [
413
- # 基本数字+天
414
- r'(\d+)天', r'(\d+)日', r'(\d+)号', r'(\d+)个天', r'(\d+)个日',
415
-
416
- # 动词+天数
417
- r'玩(\d+)天', r'住(\d+)天', r'呆(\d+)天', r'待(\d+)天', r'停留(\d+)天',
418
- r'逗留(\d+)天', r'游(\d+)天', r'旅行(\d+)天', r'度假(\d+)天', r'休假(\d+)天',
419
-
420
- # 行程相关
421
- r'(\d+)天行程', r'(\d+)天旅程', r'(\d+)天旅行', r'(\d+)天游', r'(\d+)天之旅',
422
- r'(\d+)天的行程', r'(\d+)天的旅程', r'(\d+)天的旅行', r'(\d+)天的假期',
423
- r'行程(\d+)天', r'旅程(\d+)天', r'假期(\d+)天', r'休假(\d+)天',
424
-
425
- # 时间修饰词
426
- r'大概(\d+)天', r'约(\d+)天', r'差不多(\d+)天', r'左右(\d+)天', r'上下(\d+)天',
427
- r'最多(\d+)天', r'最少(\d+)天', r'至少(\d+)天', r'不超过(\d+)天', r'超过(\d+)天',
428
- r'将近(\d+)天', r'接近(\d+)天', r'快(\d+)天', r'足足(\d+)天', r'整整(\d+)天',
429
-
430
- # 周相关
431
- r'(\d+)周', r'(\d+)个周', r'(\d+)星期', r'(\d+)个星期', r'(\d+)礼拜', r'(\d+)个礼拜',
432
- r'玩(\d+)周', r'住(\d+)周', r'呆(\d+)周', r'待(\d+)周', r'旅行(\d+)周',
433
- r'(\d+)周的行程', r'(\d+)星期的旅行', r'(\d+)个礼拜的假期',
434
-
435
- # 月相关
436
- r'(\d+)月', r'(\d+)个月', r'(\d+)个月份',
437
- r'玩(\d+)个月', r'住(\d+)个月', r'旅行(\d+)个月', r'度假(\d+)个月',
438
- r'(\d+)个月的行程', r'(\d+)月的旅行', r'(\d+)个月的假期',
439
-
440
- # 范围表达
441
- r'(\d+)-(\d+)天', r'(\d+)到(\d+)天', r'(\d+)至(\d+)天', r'(\d+)~(\d+)天',
442
- r'(\d+)天到(\d+)天', r'从(\d+)天到(\d+)天', r'介于(\d+)到(\d+)天',
443
-
444
- # 中文数字
445
- r'一天', r'二天', r'三天', r'四天', r'五天', r'六天', r'七天', r'八天', r'九天', r'十天',
446
- r'两天', r'俩天', r'仨天', r'半天', r'一天半', r'两天半', r'三天半',
447
- r'十一天', r'十二天', r'十三天', r'十四天', r'十五天', r'二十天', r'三十天',
448
-
449
- # 特殊时长表达
450
- r'周末', r'长周末', r'小长假', r'长假', r'黄金周', r'假期',
451
- r'十一', r'国庆', r'春节', r'过年', r'五一', r'劳动节', r'清明', r'端午', r'中秋', r'元旦',
452
- r'暑假', r'寒假', r'年假', r'蜜月', r'度蜜月',
453
- r'短途', r'中途', r'长途', r'快闪', r'一日游', r'两日游', r'三日游', r'多日游'
454
- ]
455
 
456
- # 尝试提取时长(完整保留原有逻辑)
457
- for pattern in day_patterns:
458
- matches = re.findall(pattern, text)
459
- for match in matches:
460
- days = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
 
462
- if isinstance(match, tuple):
463
- # 范围表达,取平均值
464
- try:
465
- start_days = int(match[0])
466
- end_days = int(match[1])
467
- days = (start_days + end_days) / 2
468
- except:
469
- days = int(match[0]) if match[0].isdigit() else None
470
- elif match.isdigit():
471
- days = int(match)
472
-
473
- # 处理单位转换
474
- if '周' in pattern or '星期' in pattern or '礼拜' in pattern:
475
- days *= 7
476
- elif '月' in pattern:
477
- days *= 30
478
-
479
- # 处理中文数字和特殊表达
480
- elif match in self.chinese_numbers:
481
- days = self.chinese_numbers[match]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
 
483
- # 验证天数合理性
484
- if days and 0.5 <= days <= 365:
485
- result["days"] = int(days) if days >= 1 else days
486
-
487
  # 添加描述信息
488
  if days <= 1:
489
- result["description"] = "当日往返"
490
  elif days <= 3:
491
- result["description"] = "短途旅行"
492
  elif days <= 7:
493
- result["description"] = "一周内旅行"
494
  elif days <= 14:
495
- result["description"] = "中长途旅行"
496
  elif days <= 30:
497
- result["description"] = "长途旅行"
498
  else:
499
- result["description"] = "超长途旅行"
500
-
501
- # 保留原始匹配文本作为额外描述
502
- if not isinstance(match, tuple) and not match.isdigit():
503
- result["description"] = match
504
-
505
- break
506
- if result:
507
  break
508
 
509
  return result
510
 
511
- def _extract_budget(self, text: str) -> dict:
512
- """提取预算信息 - 针对欧洲旅行优化"""
513
  result = {}
514
- text_lower = text.lower()
515
-
516
- # 欧洲旅行常用货币的金额提取模式
517
- amount_patterns = [
518
- # === 欧元表达 - 优先级最高(欧洲旅行主要货币) ===
519
- r'(\d+)欧元', r'(\d+)欧', r'€(\d+)', r'EUR(\d+)', r'eur(\d+)',
520
- r'(\d+)euro', r'(\d+)Euro', r'(\d+)EURO',
521
- r'(\d+\.?\d*)欧元', r'€(\d+\.?\d*)',
522
- r'预算(\d+)欧', r'花费(\d+)欧', r'大概(\d+)欧', r'约(\d+)欧',
523
-
524
- # === 人民币表达 ===
525
- r'(\d+)元', r'(\d+)块', r'(\d+)块钱', r'(\d+)人民币', r'(\d+)rmb', r'(\d+)RMB',
526
- r'¥(\d+)', r'¥(\d+)', r'CNY(\d+)', r'cny(\d+)',
527
-
528
- # === 美元表达 ===
529
- r'(\d+)美元', r'(\d+)美刀', r'(\d+)刀', r'\$(\d+)', r'USD(\d+)', r'usd(\d+)',
530
- r'(\d+)dollar', r'(\d+)Dollar',
531
-
532
- # === 英镑表达(英国旅行) ===
533
- r'(\d+)英镑', r'(\d+)镑', r'£(\d+)', r'GBP(\d+)', r'gbp(\d+)',
534
- r'(\d+)pound', r'(\d+)Pound',
535
-
536
- # === 瑞士法郎(瑞士旅行) ===
537
- r'(\d+)瑞士法郎', r'(\d+)法郎', r'CHF(\d+)', r'chf(\d+)',
538
- r'(\d+)瑞郎', r'(\d+)swiss franc',
539
-
540
- # === 预算相关表达 ===
541
- r'预算(\d+)', r'预算是(\d+)', r'预算大概(\d+)', r'预算约(\d+)',
542
- r'预算差不多(\d+)', r'预算在(\d+)', r'预算控制在(\d+)',
543
- r'预算不超过(\d+)', r'预算最多(\d+)', r'预算最少(\d+)',
544
-
545
- # === 花费相关表达 ===
546
- r'花(\d+)', r'花费(\d+)', r'花销(\d+)', r'开销(\d+)', r'支出(\d+)',
547
- r'费用(\d+)', r'成本(\d+)', r'总共(\d+)', r'一共(\d+)', r'总计(\d+)',
548
-
549
- # === 万元表达 ===
550
- r'(\d+)万', r'(\d+)万元', r'(\d+)万块', r'(\d+)万人民币',
551
- r'(\d+)万欧', r'(\d+)万欧元', r'(\d+)万美元', r'(\d+)万英镑',
552
- r'(\d+\.?\d*)万', r'(\d+\.?\d*)万元',
553
-
554
- # === 千元表达 ===
555
- r'(\d+)千', r'(\d+)千元', r'(\d+)千块', r'(\d+)k', r'(\d+)K',
556
- r'(\d+)千欧', r'(\d+)千美元', r'(\d+)千英镑',
557
-
558
- # === 范围表达 ===
559
- r'(\d+)-(\d+)', r'(\d+)到(\d+)', r'(\d+)至(\d+)', r'(\d+)~(\d+)',
560
- r'(\d+)左右', r'约(\d+)', r'差不多(\d+)', r'大概(\d+)',
561
-
562
- # === 每人/每天相关 ===
563
- r'每人(\d+)', r'人均(\d+)', r'单人(\d+)', r'每天(\d+)', r'日均(\d+)',
564
-
565
- # === 中文数字金额 ===
566
- r'一万', r'两万', r'三万', r'四万', r'五万', r'六万', r'七万', r'八万', r'九万', r'十万',
567
- r'一千', r'两千', r'三千', r'四千', r'五千', r'六千', r'七千', r'八千', r'九千'
568
- ]
569
 
570
- # 中文数字金额映射
571
- chinese_money = {
572
- '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000,
573
- '六万': 60000, '七万': 70000, '八万': 80000, '九万': 90000, '十万': 100000,
574
- '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
575
- '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000
576
- }
577
-
578
- # 尝试提取金额
579
- for pattern in amount_patterns:
580
- matches = re.findall(pattern, text)
581
- for match in matches:
582
- amount = None
583
- currency = "RMB" # 默认货币
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
584
 
585
- if isinstance(match, tuple):
586
- # 处理范围或多个捕获组
587
- if len(match) == 2 and all(m.replace('.','').isdigit() for m in match if m):
588
- try:
589
- amount = (float(match[0]) + float(match[1])) / 2
590
- except:
591
- amount = float(match[0]) if match[0].replace('.','').isdigit() else float(match[1])
592
- else:
593
- for m in match:
594
- if m and m.replace('.','').isdigit():
595
- amount = float(m)
596
- break
597
- else:
598
- if match in chinese_money:
599
- amount = chinese_money[match]
600
- elif match.replace('.','').isdigit():
601
- amount = float(match)
 
 
 
 
 
 
602
 
603
- if amount and amount > 0:
604
- # 处理单位转换
605
- if '万' in pattern:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
  amount *= 10000
607
- elif '千' in pattern or 'k' in pattern.lower():
608
  amount *= 1000
609
-
 
 
 
 
 
 
 
610
  result["amount"] = int(amount)
611
-
612
- # 确定货币类型(针对欧洲旅行优化)
613
- if any(keyword in pattern for keyword in ['欧元', '欧', '€', 'eur', 'euro']):
614
- result["currency"] = "EUR"
615
- elif any(keyword in pattern for keyword in ['英镑', '镑', '£', 'gbp', 'pound']):
616
- result["currency"] = "GBP"
617
- elif any(keyword in pattern for keyword in ['瑞士法郎', '法郎', '瑞郎', 'chf', 'swiss franc']):
618
- result["currency"] = "CHF"
619
- elif any(keyword in pattern for keyword in ['美元', '美刀', '刀', 'usd', 'dollar']):
620
- result["currency"] = "USD"
621
- else:
622
- result["currency"] = "RMB"
623
  break
624
- if result.get("amount"):
625
- break
626
 
627
- # 预算类型识别 - 针对欧洲旅行优化
628
  budget_type_keywords = {
629
  'economy': [
630
- # 经济相关
631
  '经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '青年',
632
  '预算有限', '钱不多', '不贵', '划算', '性价比', '背包客',
633
- '简单', '基础', '低成本', '节约', '省着花', '紧巴巴',
634
- # 欧洲特色经济住宿
635
- '青年旅社', '青旅', 'hostel', '民宿', 'airbnb', '客栈',
636
- '多人间', '床位', '宿舍', '胶囊', 'capsule',
637
- # 欧洲经济交通
638
- '大巴', '长途汽车', 'flixbus', '火车', '二等座', '经济舱',
639
- '欧洲通票', '青年票', '学生票', '团体票',
640
- # 经济餐饮
641
- '自己做饭', '超市', '便利店', '快餐', '街头小吃', '外卖',
642
- '麦当劳', '汉堡王', 'kebab', 'döner'
643
  ],
644
  'comfortable': [
645
- # 舒适相关
646
  '舒适', '中等', '适中', '一般', '标准', '普通', '正常', '常规',
647
- '中档', '中级', '合理', '平均', '中间档次', '不高不低',
648
- # 欧洲中档住宿
649
- '三星', '四星', '酒店', 'hotel', '标间', '双人间', '大床房',
650
- '民宿', 'apartment', '公寓', 'b&b', 'pension',
651
- # 欧洲舒适交通
652
- '火车', '一等座', '高铁', 'tgv', 'ice', '城际列车',
653
- '租车', '自驾', '商务舱', '直飞',
654
- # 中档餐饮
655
- '餐厅', '当地菜', '特色菜', '中档餐厅', '酒吧', 'bistro'
656
  ],
657
  'luxury': [
658
- # 奢华相关
659
- '豪华', '奢华', '高端', '顶级', '精品', '奢侈', '贵族',
660
- '贵一点', '不差钱', '任性', '土豪', '有钱', '不在乎钱',
661
- '高消费', '享受', '奢享', '尊贵', '至尊', 'VIP',
662
- # 欧洲豪华住宿
663
- '五星', '六星', '豪华酒店', 'luxury hotel', '度假村', 'resort',
664
- '别墅', 'villa', '城堡', 'castle', '套房', 'suite', '总统套房',
665
- '丽思卡尔顿', '四季', '文华东方', '半岛', '香格里拉', '希尔顿',
666
- 'ritz carlton', 'four seasons', 'mandarin oriental', 'peninsula',
667
- # 豪华交通
668
- '头等舱', '商务舱', '私人飞机', 'private jet', '豪车', '奔驰', '宝马',
669
- '奥迪', '保时捷', '法拉利', '兰博基尼', 'mercedes', 'bmw', 'audi',
670
- # 奢华服务
671
- '私人导游', '管家服务', 'concierge', '司机', '专车', '包车',
672
- '定制旅行', '私人订制', '一对一服务', 'vip通道',
673
- # 高端餐饮
674
- '米其林', 'michelin', '米其林三星', '米其林餐厅', '高档餐厅',
675
- '法式大餐', '意式料理', '分子料理', '酒庄', 'wine tasting'
676
  ]
677
  }
678
 
679
- # 识别预算类型
680
- for budget_type, keywords in budget_type_keywords.items():
681
- matched_keywords = [kw for kw in keywords if kw in text_lower]
682
- if matched_keywords:
683
- result["type"] = budget_type
684
- result["description"] = matched_keywords[0]
 
 
 
 
 
 
 
685
  break
686
 
687
- # 如果有金额但没有类型,根据金额和货币推断类型(欧洲标准)
688
  if result.get("amount") and not result.get("type"):
689
  amount = result["amount"]
690
  currency = result.get("currency", "RMB")
691
 
692
  # 根据欧洲旅行成本设置阈值
693
  if currency == "EUR":
694
- if amount < 50: # 每天50欧以下
695
  result["type"] = "economy"
696
  result["description"] = "经济预算"
697
- elif amount < 150: # 每天50-150欧
698
  result["type"] = "comfortable"
699
  result["description"] = "舒适预算"
700
- else: # 每天150欧以上
701
- result["type"] = "luxury"
702
- result["description"] = "豪华预算"
703
- elif currency == "GBP":
704
- if amount < 40: # 每天40英镑以下
705
- result["type"] = "economy"
706
- result["description"] = "经济预算"
707
- elif amount < 120: # 每天40-120英镑
708
- result["type"] = "comfortable"
709
- result["description"] = "舒适预算"
710
- else: # 每天120英镑以上
711
  result["type"] = "luxury"
712
  result["description"] = "豪华预算"
713
- elif currency == "CHF":
714
- if amount < 60: # 每天60瑞郎以下
715
  result["type"] = "economy"
716
  result["description"] = "经济预算"
717
- elif amount < 180: # 每天60-180瑞郎
718
  result["type"] = "comfortable"
719
  result["description"] = "舒适预算"
720
- else: # 每天180瑞郎以上
721
  result["type"] = "luxury"
722
  result["description"] = "豪华预算"
723
  elif currency == "RMB":
724
- if amount < 300: # 每天300元以下
725
- result["type"] = "economy"
726
- result["description"] = "经济预算"
727
- elif amount < 800: # 每天300-800元
728
- result["type"] = "comfortable"
729
- result["description"] = "舒适预算"
730
- else: # 每天800元以上
731
- result["type"] = "luxury"
732
- result["description"] = "豪华预算"
733
- elif currency == "USD":
734
- if amount < 60: # 每天60美元以下
735
  result["type"] = "economy"
736
  result["description"] = "经济预算"
737
- elif amount < 150: # 每天60-150美元
738
  result["type"] = "comfortable"
739
  result["description"] = "舒适预算"
740
- else: # 每天150美元以上
741
  result["type"] = "luxury"
742
  result["description"] = "豪华预算"
743
 
744
- return result
745
-
746
- def _is_valid_european_city(self, name: str) -> bool:
747
- """验证是否为有效的欧洲城市名称"""
748
- if not name or len(name) < 1:
749
- return False
750
-
751
- # 排除数字和常见的非地名词汇
752
- invalid_words = [
753
- # 数字和时间
754
- '天', '日', '号', '月', '年', '周', '小时', '分钟', '秒',
755
- # 金钱相关
756
- '元', '块', '钱', '万', '千', '百', '预算', '费用', '成本', '价格',
757
- '美元', '欧元', '英镑', '瑞郎', '法郎',
758
- # 旅行相关动词
759
- '花', '费', '旅行', '旅游', '行程', '计划', '想', '去', '到', '的',
760
- '在', '是', '个', '了', '和', '与', '或', '但', '而', '就', '都',
761
- # 其他常见词
762
- '人', '我', '你', '他', '她', '们', '这', '那', '什么', '怎么',
763
- '好', '很', '非常', '特别', '大', '小', '新', '老'
764
- ]
765
-
766
- if name.isdigit() or name in invalid_words:
767
- return False
768
-
769
- # 检查是否包含数字(地名通常不包含数字)
770
- if any(char.isdigit() for char in name):
771
- return False
772
-
773
- # 检查是否在欧洲城市列表中
774
- if name in self.european_cities:
775
- return True
776
-
777
- # 检查是否在别名列表中
778
- if name in self.european_city_aliases or name.lower() in self.european_city_aliases:
779
- return True
780
-
781
- # 城市名称长度检查
782
- if len(name) > 15:
783
- return False
784
-
785
- # 检查是否包含特殊字符
786
- if any(char in name for char in '!@#$%^&*()+={}[]|\\:";\'<>?,.`~'):
787
- return False
788
-
789
- return False # 只接受明确在欧洲城市列表中的城市
790
-
791
- def _is_valid_european_country(self, name: str) -> bool:
792
- """验证是否为有效的欧洲国家名称"""
793
- if not name or len(name) < 2:
794
- return False
795
-
796
- # 欧洲国家列表
797
- european_countries = {
798
- # 西欧
799
- '法国', '德国', '英国', '荷兰', '比利时', '卢森堡',
800
- # 南欧
801
- '意大利', '西班牙', '葡萄牙', '希腊', '马耳他', '塞浦路斯',
802
- # 中欧
803
- '奥地利', '瑞士', '捷克', '斯洛伐克', '匈牙利', '波兰', '斯洛文尼亚',
804
- # 北欧
805
- '瑞典', '挪威', '丹麦', '芬兰', '冰岛',
806
- # 东欧
807
- '俄罗斯', '乌克兰', '白俄罗斯', '立陶宛', '拉脱维亚', '爱沙尼亚', '摩尔多瓦',
808
- # 巴尔干半岛
809
- '克罗地亚', '塞尔维亚', '波黑', '黑山', '北马其顿', '阿尔巴尼亚',
810
- '保加利亚', '罗马尼亚', '土耳其'
811
  }
812
 
813
- return name in european_countries
 
 
 
 
 
 
 
814
 
815
  # 保持向后兼容的验证方法
816
  def _validate_and_normalize(self, data: dict) -> dict:
 
1
  import json
2
  import re
3
  from utils.logger import log
4
+ import jieba
5
+ from typing import List, Tuple
6
 
7
  class InfoExtractor:
8
  def __init__(self):
9
 
10
+ self._init_tockenizer()
11
+ self._init_keyworkd_mapping()
12
+
13
  self.extraction_schema = {
14
  "destination": {"type": dict, "fields": {"name": str, "country": str}},
15
  "duration": {"type": dict, "fields": {"days": int, "description": str}},
 
271
  "翡翠岛": "都柏林", "欧洲之都": "布鲁塞尔", "高迪之城": "巴塞罗那",
272
  }
273
 
 
274
  self.chinese_numbers = {
275
  '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
276
  '两': 2, '半': 0.5, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
277
+ # 英文数字
278
+ 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
279
+ 'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15,
280
  # 特殊时长表达
281
  '半个月': 15, '一个月': 30, '半年': 180, '一年': 365,
282
  '半天': 0.5, '一天': 1, '两天': 2, '三天': 3, '四天': 4, '五天': 5, '六天': 6, '七天': 7,
283
  '八天': 8, '九天': 9, '十天': 10, '半周': 3.5, '一周': 7, '两周': 14,
284
  # 假期相关
285
  '小长假': 3, '长假': 7, '十一': 7, '国庆': 7, '春节': 7, '五一': 3, '清明': 3,
286
+ '端午': 3, '中秋': 3, '元旦': 3, '暑假': 60, '寒假': 30, '周末': 2, '长周末': 3,
287
+ # 英文假期
288
+ 'weekend': 2, 'week': 7, 'month': 30, 'vacation': 7, 'holiday': 3
289
  }
290
 
291
+ def extract(self, user_message: str) -> dict:
292
+ """使用分词策略进行信息提取"""
293
 
294
  # 输入验证
295
  if not user_message or not isinstance(user_message, str):
 
300
  log.warning("⚠️ 用户消息过短,跳过信息提取")
301
  return {}
302
 
303
+ log.info(f"🛠️ 使用分词策略提取信息:'{user_message[:50]}...'")
304
+
305
+ # 1. 智能分词
306
+ tokens = self._tokenize_message(user_message)
307
+ log.info(f"📝 分词结果:{tokens}")
308
 
309
+ # 2. 基于分词进行信息提取
310
  result = {}
311
 
312
+ # 提取目的地信息
313
+ destination_info = self._extract_destination_from_tokens(tokens)
314
  if destination_info:
315
  result["destination"] = destination_info
316
 
317
+ # 提取时长信息
318
+ duration_info = self._extract_duration_from_tokens(tokens)
319
  if duration_info:
320
  result["duration"] = duration_info
321
 
322
+ # 提取预算信息
323
+ budget_info = self._extract_budget_from_tokens(tokens)
324
  if budget_info:
325
  result["budget"] = budget_info
326
 
327
+ log.info(f"📊 分词提取结果: {result}")
328
  return result
329
 
330
+ def _tokenize_message(self, text: str) -> list:
331
+ """智能分词,支持中英文混合"""
332
+
333
+ # 预处理:统一标点符号和空格
334
+ text = text.replace(',', ',').replace('。', '.').replace('!', '!').replace('?', '?')
335
+ text = text.replace('(', '(').replace(')', ')').replace('【', '[').replace('】', ']')
336
+
337
+ tokens = []
338
+ current_token = ""
339
+ i = 0
340
+
341
+ while i < len(text):
342
+ char = text[i]
343
+
344
+ # 处理空格和标点符号
345
+ if char in ' ,,.。!!??()()[]【】::;;':
346
+ if current_token:
347
+ tokens.append(current_token)
348
+ current_token = ""
349
+ if char.strip(): # 保留非空格的标点符号
350
+ tokens.append(char)
351
+ i += 1
352
+ continue
353
+
354
+ # 处理数字(包括小数和货币符号)
355
+ if char.isdigit() or char in '¥$€£₩':
356
+ if current_token and not (current_token[-1].isdigit() or current_token[-1] in '¥$€£₩.'):
357
+ tokens.append(current_token)
358
+ current_token = char
359
+ else:
360
+ current_token += char
361
+
362
+ # 继续读取数字部分
363
+ i += 1
364
+ while i < len(text) and (text[i].isdigit() or text[i] in '.,'):
365
+ current_token += text[i]
366
+ i += 1
367
+
368
+ # 检查货币单位
369
+ currency_units = ['元', '块', '钱', '欧', '美元', '英镑', '日元', '韩元', '瑞郎', 'rmb', 'usd', 'eur', 'gbp', 'jpy', 'krw', 'chf']
370
+ remaining_text = text[i:].lower()
371
+ for unit in currency_units:
372
+ if remaining_text.startswith(unit):
373
+ current_token += text[i:i+len(unit)]
374
+ i += len(unit)
375
+ break
376
+
377
+ tokens.append(current_token)
378
+ current_token = ""
379
+ continue
380
+
381
+ # 处理英文单词
382
+ if char.isalpha() and ord(char) < 128: # ASCII字符
383
+ if current_token and not current_token[-1].isalpha():
384
+ tokens.append(current_token)
385
+ current_token = char
386
+ else:
387
+ current_token += char
388
+
389
+ # 继续读取英文字符
390
+ i += 1
391
+ while i < len(text) and text[i].isalpha() and ord(text[i]) < 128:
392
+ current_token += text[i]
393
+ i += 1
394
+
395
+ tokens.append(current_token)
396
+ current_token = ""
397
+ continue
398
+
399
+ # 处理中文字符
400
+ if self._is_chinese_char(char):
401
+ if current_token and not self._is_chinese_char(current_token[-1]):
402
+ tokens.append(current_token)
403
+ current_token = ""
404
+
405
+ # 对于中文,我们需要智能分词
406
+ # 检查是否是多字符城市名、时间表达等
407
+ remaining_text = text[i:]
408
+
409
+ # 尝试匹配城市名
410
+ matched_city = self._match_city_name(remaining_text)
411
+ if matched_city:
412
+ tokens.append(matched_city)
413
+ i += len(matched_city)
414
+ continue
415
+
416
+ # 尝试匹配时间表达
417
+ matched_time = self._match_time_expression(remaining_text)
418
+ if matched_time:
419
+ tokens.append(matched_time)
420
+ i += len(matched_time)
421
+ continue
422
+
423
+ # 尝试匹配预算类型关键词
424
+ matched_budget_type = self._match_budget_type(remaining_text)
425
+ if matched_budget_type:
426
+ tokens.append(matched_budget_type)
427
+ i += len(matched_budget_type)
428
+ continue
429
+
430
+ # 尝试匹配常见词汇
431
+ matched_word = self._match_common_word(remaining_text)
432
+ if matched_word:
433
+ tokens.append(matched_word)
434
+ i += len(matched_word)
435
+ continue
436
+
437
+ # 单个中文字符
438
+ tokens.append(char)
439
+ i += 1
440
+ else:
441
+ # 其他字符
442
+ current_token += char
443
+ i += 1
444
+
445
+ # 处理最后的token
446
+ if current_token:
447
+ tokens.append(current_token)
448
+
449
+ # 后处理:合并一些相关的tokens
450
+ tokens = self._post_process_tokens(tokens)
451
+
452
+ return [token for token in tokens if token.strip()] # 过滤空token
453
+
454
+ def _is_chinese_char(self, char: str) -> bool:
455
+ """判断是否为中文字符"""
456
+ return '\u4e00' <= char <= '\u9fff'
457
+
458
+ def _match_city_name(self, text: str) -> str:
459
+ """匹配城市名称"""
460
+ # 按长度从长到短排序,优先匹配长的城市名
461
+ all_cities = list(self.european_cities.keys()) + list(self.european_city_aliases.keys())
462
+ all_cities = sorted(set(all_cities), key=len, reverse=True)
463
+
464
+ for city in all_cities:
465
+ if text.startswith(city):
466
+ return city
467
+ return ""
468
+
469
+ def _match_time_expression(self, text: str) -> str:
470
+ """匹配时间表达"""
471
+ time_expressions = [
472
+ # 多字符时间表达
473
+ '半个月', '一个月', '两个月', '三个月', '半年', '一年',
474
+ '小长假', '长周末', '国庆节', '春节假期', '暑假', '寒假',
475
+ '一天半', '两天半', '三天半', '一周半', '两周',
476
+ # 英文时��表达
477
+ 'one day', 'two days', 'three days', 'one week', 'two weeks',
478
+ 'long weekend', 'vacation', 'holiday', 'spring break'
479
+ ]
480
 
481
+ # 按长度排序,优先匹配长表达
482
+ time_expressions = sorted(time_expressions, key=len, reverse=True)
483
+
484
+ text_lower = text.lower()
485
+ for expr in time_expressions:
486
+ if text_lower.startswith(expr.lower()):
487
+ return expr
488
+ if text.startswith(expr):
489
+ return expr
490
+ return ""
491
+
492
+ def _match_budget_type(self, text: str) -> str:
493
+ """匹配预算类型关键词"""
494
+ budget_keywords = [
495
+ # 经济型
496
+ '经济实惠', '省钱', '便宜', '实惠', '经济', '穷游', '背包客',
497
+ '青年旅社', '学生', '预算有限', '性价比',
498
+ # 舒适型
499
+ '舒适', '中等', '适中', '标准', '普通', '中档', '合理',
500
+ # 豪华型
501
+ '豪华', '奢华', '高端', '顶级', '精品', '五星', '不差钱',
502
+ '任性', '土豪', 'VIP', '贵族', '皇家'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
  ]
504
 
505
+ # 按长度排序
506
+ budget_keywords = sorted(budget_keywords, key=len, reverse=True)
507
+
508
+ for keyword in budget_keywords:
509
+ if text.startswith(keyword):
510
+ return keyword
511
+ return ""
512
+
513
+ def _match_common_word(self, text: str) -> str:
514
+ """匹配常见词汇"""
515
+ common_words = [
516
+ # 旅行相关动词
517
+ '想去', '计划去', '打算去', '准备去', '希望去', '考虑去',
518
+ '前往', '旅行', '旅游', '游玩', '度假', '出发', '飞往',
519
+ # 时间相关
520
+ '三天', '四天', '五天', '六天', '七天', '八天', '九天', '十天',
521
+ '一天', '两天', '几天', '多天', '数天',
522
+ # 预算相关
523
+ '预算', '花费', '费用', '成本', '开销', '支出', '消费',
524
+ '总共', '一共', '大概', '约', '左右', '差不多',
525
+ # 其他
526
+ '行程', '计划', '安排', '路线', '攻略'
527
+ ]
528
+
529
+ # 按长度排序
530
+ common_words = sorted(common_words, key=len, reverse=True)
531
+
532
+ for word in common_words:
533
+ if text.startswith(word):
534
+ return word
535
+ return ""
536
+
537
+ def _post_process_tokens(self, tokens: list) -> list:
538
+ """后处理tokens,合并相关的片段"""
539
+ if not tokens:
540
+ return tokens
541
+
542
+ processed = []
543
+ i = 0
544
+
545
+ while i < len(tokens):
546
+ current_token = tokens[i]
547
+
548
+ # 合并数字+单位的组合
549
+ if i < len(tokens) - 1:
550
+ next_token = tokens[i + 1]
551
 
552
+ # 数字 + 货币单位
553
+ if (current_token.isdigit() and
554
+ next_token.lower() in ['元', '块', '钱', '欧', '美元', '英镑', '日元', 'rmb', 'usd', 'eur', 'gbp', 'jpy']):
555
+ processed.append(current_token + next_token)
556
+ i += 2
557
+ continue
558
 
559
+ # 数字 + 时间单位
560
+ if (current_token.isdigit() and
561
+ next_token in ['天', '日', '周', '月', '年', 'days', 'weeks', 'months']):
562
+ processed.append(current_token + next_token)
563
+ i += 2
564
+ continue
565
+
566
+ # 预算 + 数字
567
+ if current_token == '预算' and next_token.replace('.', '').replace(',', '').isdigit():
568
+ if i < len(tokens) - 2 and tokens[i + 2] in ['元', '块', '钱', '欧', 'rmb', 'usd', 'eur']:
569
+ processed.append(current_token + next_token + tokens[i + 2])
570
+ i += 3
571
+ continue
572
+ else:
573
+ processed.append(current_token + next_token)
574
+ i += 2
575
+ continue
576
+
577
+ processed.append(current_token)
578
+ i += 1
579
 
580
+ return processed
581
+
582
+ def _extract_destination_from_tokens(self, tokens: list) -> dict:
583
+ """从tokens中提取目的地信息"""
584
+ result = {}
585
 
586
+ # 查找城市名
587
+ for i, token in enumerate(tokens):
588
+ # 直接匹配城市名
589
+ city_name = self._normalize_city_name(token)
590
+ if city_name:
591
+ result["name"] = city_name
592
+ if city_name in self.european_cities:
593
+ result["country"] = self.european_cities[city_name]
594
+ break
595
+
596
+ # 检查是否在动词后面
597
+ if i > 0:
598
+ prev_token = tokens[i - 1]
599
+ if prev_token in ['去', '到', '想去', '前往', '旅行', '游', '玩', 'go', 'to', 'visit', 'travel']:
600
+ city_name = self._normalize_city_name(token)
601
+ if city_name:
602
+ result["name"] = city_name
603
+ if city_name in self.european_cities:
604
+ result["country"] = self.european_cities[city_name]
605
  break
606
+
607
+ # 如果没有找到,尝试fuzzy匹配
608
+ if not result:
609
+ for token in tokens:
610
+ if len(token) >= 2:
611
+ # 模糊匹配城市名
612
+ for city, country in self.european_cities.items():
613
+ if token in city or city in token:
614
+ if len(token) >= len(city) * 0.6: # 相似度阈值
615
+ result["name"] = city
616
+ result["country"] = country
617
+ break
618
+ if result:
619
  break
 
 
620
 
621
  return result
622
 
623
+ def _normalize_city_name(self, token: str) -> str:
624
+ """标准化城市名称"""
625
+ if not token:
626
+ return ""
627
 
628
+ token_lower = token.lower().strip()
629
+
630
+ # 直接匹配
631
+ if token in self.european_cities:
632
+ return token
633
+
634
+ # 别名匹配
635
+ if token_lower in self.european_city_aliases:
636
+ return self.european_city_aliases[token_lower]
637
+
638
+ if token in self.european_city_aliases:
639
+ return self.european_city_aliases[token]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640
 
641
+ return ""
642
+
643
+ def _extract_duration_from_tokens(self, tokens: list) -> dict:
644
+ """从tokens中提取时长信息"""
645
+ result = {}
646
+
647
+ for i, token in enumerate(tokens):
648
+ days = None
649
+ description = ""
650
+
651
+ # 处理 "数字+天" 的token
652
+ if re.match(r'^\d+[天日]$', token):
653
+ days = int(re.findall(r'\d+', token)[0])
654
+
655
+ # 处理 "数字+weeks/days" 的token
656
+ elif re.match(r'^\d+(days?|weeks?|months?)$', token.lower()):
657
+ number = int(re.findall(r'\d+', token)[0])
658
+ unit = re.findall(r'[a-zA-Z]+', token.lower())[0]
659
+ if unit.startswith('day'):
660
+ days = number
661
+ elif unit.startswith('week'):
662
+ days = number * 7
663
+ elif unit.startswith('month'):
664
+ days = number * 30
665
+
666
+ # 处理分离的数字和单位
667
+ elif token.isdigit() and i < len(tokens) - 1:
668
+ next_token = tokens[i + 1]
669
+ number = int(token)
670
 
671
+ if next_token in ['天', '日']:
672
+ days = number
673
+ elif next_token in ['周', '星期', '礼拜', 'week', 'weeks']:
674
+ days = number * 7
675
+ elif next_token in ['月', '个月', 'month', 'months']:
676
+ days = number * 30
677
+
678
+ # 处理中文数字
679
+ elif token in self.chinese_numbers:
680
+ days = self.chinese_numbers[token]
681
+ description = token
682
+
683
+ # 处理特殊时长表达
684
+ elif token in ['周末', 'weekend']:
685
+ days = 2
686
+ description = token
687
+ elif token in ['长周末', 'long weekend']:
688
+ days = 3
689
+ description = token
690
+ elif token in ['小长假', 'vacation', 'holiday']:
691
+ days = 3
692
+ description = token
693
+ elif token in ['十一', '国庆', 'national day']:
694
+ days = 7
695
+ description = token
696
+ elif token in ['春节', 'spring festival']:
697
+ days = 7
698
+ description = token
699
+ elif token in ['暑假', 'summer vacation']:
700
+ days = 60
701
+ description = token
702
+ elif token in ['寒假', 'winter vacation']:
703
+ days = 30
704
+ description = token
705
+
706
+ # 处理复合表达 "三天两夜"
707
+ elif re.match(r'^[一二三四五六七八九十\d]+天', token):
708
+ # 提取数字部分
709
+ for num_token in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']:
710
+ if token.startswith(num_token):
711
+ days = self.chinese_numbers[num_token]
712
+ description = token
713
+ break
714
+ if not days and token[0].isdigit():
715
+ days = int(token[0])
716
+ description = token
717
+
718
+ # 验证天数合理性并设置结果
719
+ if days and 0.5 <= days <= 365:
720
+ result["days"] = int(days) if days >= 1 else days
721
 
722
+ if not description:
 
 
 
723
  # 添加描述信息
724
  if days <= 1:
725
+ description = "当日往返"
726
  elif days <= 3:
727
+ description = "短途旅行"
728
  elif days <= 7:
729
+ description = "一周内旅行"
730
  elif days <= 14:
731
+ description = "中长途旅行"
732
  elif days <= 30:
733
+ description = "长途旅行"
734
  else:
735
+ description = "超长途旅行"
736
+
737
+ result["description"] = description
 
 
 
 
 
738
  break
739
 
740
  return result
741
 
742
+ def _extract_budget_from_tokens(self, tokens: list) -> dict:
743
+ """从tokens中提取预算信息"""
744
  result = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
745
 
746
+ # 1. 查找金额
747
+ for i, token in enumerate(tokens):
748
+ amount = None
749
+ currency = "RMB" # 默认货币
750
+
751
+ # 处理包含货币的token "2000欧", "5000元"
752
+ currency_patterns = [
753
+ (r'(\d+(?:\.\d+)?)欧(?:元)?', 'EUR'),
754
+ (r'(\d+(?:\.\d+)?)元', 'RMB'),
755
+ (r'(\d+(?:\.\d+)?)块(?:钱)?', 'RMB'),
756
+ (r'(\d+(?:\.\d+)?)人民币', 'RMB'),
757
+ (r'(\d+(?:\.\d+)?)美元', 'USD'),
758
+ (r'(\d+(?:\.\d+)?)英镑', 'GBP'),
759
+ (r'(\d+(?:\.\d+)?)瑞(?:士)?法郎', 'CHF'),
760
+ (r'(\d+(?:\.\d+)?)日元', 'JPY'),
761
+ (r'(\d+(?:\.\d+)?)韩元', 'KRW'),
762
+ (r'¥(\d+(?:\.\d+)?)', 'RMB'),
763
+ (r'€(\d+(?:\.\d+)?)', 'EUR'),
764
+ (r'\$(\d+(?:\.\d+)?)', 'USD'),
765
+ (r'£(\d+(?:\.\d+)?)', 'GBP'),
766
+ (r'(\d+(?:\.\d+)?)rmb', 'RMB'),
767
+ (r'(\d+(?:\.\d+)?)usd', 'USD'),
768
+ (r'(\d+(?:\.\d+)?)eur', 'EUR'),
769
+ (r'(\d+(?:\.\d+)?)gbp', 'GBP'),
770
+ (r'(\d+(?:\.\d+)?)chf', 'CHF'),
771
+ ]
772
+
773
+ for pattern, curr in currency_patterns:
774
+ match = re.search(pattern, token.lower())
775
+ if match:
776
+ amount = float(match.group(1))
777
+ currency = curr
778
+ break
779
+
780
+ # 处理纯数字token(需要查看上下文)
781
+ if not amount and re.match(r'^\d+(?:\.\d+)?
782
+ , token):
783
+ number = float(token)
784
 
785
+ # 检查前面的token是否有预算相关词汇
786
+ budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', '总共', '一共', 'budget', 'cost', 'spend']
787
+ has_budget_context = False
788
+
789
+ if i > 0 and tokens[i-1] in budget_indicators:
790
+ has_budget_context = True
791
+ elif i > 1 and tokens[i-2] in budget_indicators:
792
+ has_budget_context = True
793
+
794
+ # 检查后面是否有货币单位
795
+ if i < len(tokens) - 1:
796
+ next_token = tokens[i + 1].lower()
797
+ currency_units = {
798
+ '元': 'RMB', '块': 'RMB', '钱': 'RMB', '人民币': 'RMB',
799
+ '欧': 'EUR', '欧元': 'EUR', '美元': 'USD', '英镑': 'GBP',
800
+ '瑞郎': 'CHF', '日元': 'JPY', '韩元': 'KRW',
801
+ 'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF'
802
+ }
803
+
804
+ if next_token in currency_units:
805
+ amount = number
806
+ currency = currency_units[next_token]
807
+ has_budget_context = True
808
 
809
+ # 如果有预算上下文但没有明确货币单位,根据数字大小推断
810
+ if has_budget_context and not amount:
811
+ if number < 100: # 可能是欧元或美元
812
+ # 查看是否有欧洲城市上下文
813
+ has_european_context = any(self._normalize_city_name(t) for t in tokens)
814
+ if has_european_context:
815
+ currency = 'EUR'
816
+ else:
817
+ currency = 'USD'
818
+ else:
819
+ currency = 'RMB' # 大数字更可能是人民币
820
+ amount = number
821
+
822
+ # 处理万、千等单位
823
+ if amount:
824
+ # 检查是否有万、千修饰符
825
+ if i > 0:
826
+ prev_token = tokens[i-1]
827
+ if '万' in prev_token or 'w' in prev_token.lower():
828
  amount *= 10000
829
+ elif '千' in prev_token or 'k' in prev_token.lower():
830
  amount *= 1000
831
+ elif i < len(tokens) - 1:
832
+ next_token = tokens[i+1]
833
+ if '万' in next_token or 'w' in next_token.lower():
834
+ amount *= 10000
835
+ elif '千' in next_token or 'k' in next_token.lower():
836
+ amount *= 1000
837
+
838
+ if amount > 0:
839
  result["amount"] = int(amount)
840
+ result["currency"] = currency
 
 
 
 
 
 
 
 
 
 
 
841
  break
 
 
842
 
843
+ # 2. 查找预算类型
844
  budget_type_keywords = {
845
  'economy': [
 
846
  '经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '青年',
847
  '预算有限', '钱不多', '不贵', '划算', '性价比', '背包客',
848
+ '简单', '基础', '低成本', '节约', 'budget', 'cheap', 'economy', 'affordable'
 
 
 
 
 
 
 
 
 
849
  ],
850
  'comfortable': [
 
851
  '舒适', '中等', '适中', '一般', '标准', '普通', '正常', '常规',
852
+ '中档', '中级', '合理', '平均', '中间档次', 'comfortable', 'standard', 'moderate'
 
 
 
 
 
 
 
 
853
  ],
854
  'luxury': [
855
+ '豪华', '奢华', '高端', '顶级', '精品', '奢侈', '贵族', '皇家',
856
+ '贵一点', '不差钱', '任性', '土豪', '有钱', '五星', 'VIP',
857
+ 'luxury', 'premium', 'high-end', 'expensive', 'fancy'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
858
  ]
859
  }
860
 
861
+ for token in tokens:
862
+ token_lower = token.lower()
863
+ for budget_type, keywords in budget_type_keywords.items():
864
+ if any(keyword in token_lower for keyword in keywords):
865
+ result["type"] = budget_type
866
+
867
+ # 找到第一个匹配的关键词作为描述
868
+ for keyword in keywords:
869
+ if keyword in token_lower:
870
+ result["description"] = keyword if len(keyword) > 2 else token
871
+ break
872
+ break
873
+ if result.get("type"):
874
  break
875
 
876
+ # 3. 如果有金额但没有类型,根据金额推断类型
877
  if result.get("amount") and not result.get("type"):
878
  amount = result["amount"]
879
  currency = result.get("currency", "RMB")
880
 
881
  # 根据欧洲旅行成本设置阈值
882
  if currency == "EUR":
883
+ if amount < 1500: # 总预算
884
  result["type"] = "economy"
885
  result["description"] = "经济预算"
886
+ elif amount < 4000:
887
  result["type"] = "comfortable"
888
  result["description"] = "舒适预算"
889
+ else:
 
 
 
 
 
 
 
 
 
 
890
  result["type"] = "luxury"
891
  result["description"] = "豪华预算"
892
+ elif currency == "USD":
893
+ if amount < 2000:
894
  result["type"] = "economy"
895
  result["description"] = "经济预算"
896
+ elif amount < 5000:
897
  result["type"] = "comfortable"
898
  result["description"] = "舒适预算"
899
+ else:
900
  result["type"] = "luxury"
901
  result["description"] = "豪华预算"
902
  elif currency == "RMB":
903
+ if amount < 8000:
 
 
 
 
 
 
 
 
 
 
904
  result["type"] = "economy"
905
  result["description"] = "经济预算"
906
+ elif amount < 20000:
907
  result["type"] = "comfortable"
908
  result["description"] = "舒适预算"
909
+ else:
910
  result["type"] = "luxury"
911
  result["description"] = "豪华预算"
912
 
913
+ # 4. 处理中文数字金额
914
+ chinese_money_mapping = {
915
+ '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
916
+ '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
917
+ '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
918
  }
919
 
920
+ if not result.get("amount"):
921
+ for token in tokens:
922
+ if token in chinese_money_mapping:
923
+ result["amount"] = chinese_money_mapping[token]
924
+ result["currency"] = "RMB"
925
+ break
926
+
927
+ return result
928
 
929
  # 保持向后兼容的验证方法
930
  def _validate_and_normalize(self, data: dict) -> dict: