Spaces:
Sleeping
Sleeping
improve: tokenizer
Browse files- modules/info_extractor.py +546 -432
modules/info_extractor.py
CHANGED
|
@@ -1,11 +1,15 @@
|
|
| 1 |
import json
|
| 2 |
import re
|
| 3 |
from utils.logger import log
|
|
|
|
|
|
|
| 4 |
|
| 5 |
class InfoExtractor:
|
| 6 |
def __init__(self):
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
| 9 |
self.extraction_schema = {
|
| 10 |
"destination": {"type": dict, "fields": {"name": str, "country": str}},
|
| 11 |
"duration": {"type": dict, "fields": {"days": int, "description": str}},
|
|
@@ -267,21 +271,25 @@ class InfoExtractor:
|
|
| 267 |
"翡翠岛": "都柏林", "欧洲之都": "布鲁塞尔", "高迪之城": "巴塞罗那",
|
| 268 |
}
|
| 269 |
|
| 270 |
-
# 中文数字映射(保持原有)
|
| 271 |
self.chinese_numbers = {
|
| 272 |
'一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
|
| 273 |
'两': 2, '半': 0.5, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
|
|
|
|
|
|
|
|
|
|
| 274 |
# 特殊时长表达
|
| 275 |
'半个月': 15, '一个月': 30, '半年': 180, '一年': 365,
|
| 276 |
'半天': 0.5, '一天': 1, '两天': 2, '三天': 3, '四天': 4, '五天': 5, '六天': 6, '七天': 7,
|
| 277 |
'八天': 8, '九天': 9, '十天': 10, '半周': 3.5, '一周': 7, '两周': 14,
|
| 278 |
# 假期相关
|
| 279 |
'小长假': 3, '长假': 7, '十一': 7, '国庆': 7, '春节': 7, '五一': 3, '清明': 3,
|
| 280 |
-
'端午': 3, '中秋': 3, '元旦': 3, '暑假': 60, '寒假': 30, '周末': 2, '长周末': 3
|
|
|
|
|
|
|
| 281 |
}
|
| 282 |
|
| 283 |
-
|
| 284 |
-
"""
|
| 285 |
|
| 286 |
# 输入验证
|
| 287 |
if not user_message or not isinstance(user_message, str):
|
|
@@ -292,525 +300,631 @@ class InfoExtractor:
|
|
| 292 |
log.warning("⚠️ 用户消息过短,跳过信息提取")
|
| 293 |
return {}
|
| 294 |
|
| 295 |
-
log.info("🛠️
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
|
|
|
|
| 297 |
result = {}
|
| 298 |
|
| 299 |
-
#
|
| 300 |
-
destination_info = self.
|
| 301 |
if destination_info:
|
| 302 |
result["destination"] = destination_info
|
| 303 |
|
| 304 |
-
#
|
| 305 |
-
duration_info = self.
|
| 306 |
if duration_info:
|
| 307 |
result["duration"] = duration_info
|
| 308 |
|
| 309 |
-
#
|
| 310 |
-
budget_info = self.
|
| 311 |
if budget_info:
|
| 312 |
result["budget"] = budget_info
|
| 313 |
|
| 314 |
-
log.info(f"📊
|
| 315 |
return result
|
| 316 |
|
| 317 |
-
def
|
| 318 |
-
"""
|
| 319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
-
#
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
r'(\w+)的文化', r'(\w+)的历史', r'(\w+)的风景', r'(\w+)的特色', r'(\w+)的魅力',
|
| 344 |
-
|
| 345 |
-
# 特殊交通方式表达
|
| 346 |
-
r'飞(\w+)', r'坐船去(\w+)', r'坐火车去(\w+)', r'自驾去(\w+)', r'徒步去(\w+)',
|
| 347 |
-
r'骑行去(\w+)', r'背包去(\w+)', r'穷游去(\w+)',
|
| 348 |
-
|
| 349 |
-
# 旅行类型 + 地点
|
| 350 |
-
r'自由行(\w+)', r'跟团(\w+)', r'自驾(\w+)', r'蜜月(\w+)', r'毕业(\w+)',
|
| 351 |
-
r'亲子(\w+)', r'家庭(\w+)', r'情侣(\w+)', r'闺蜜(\w+)', r'独自(\w+)',
|
| 352 |
-
|
| 353 |
-
# 度假/休闲表达
|
| 354 |
-
r'度假去(\w+)', r'休闲去(\w+)', r'放松去(\w+)', r'散心去(\w+)', r'疗养去(\w+)',
|
| 355 |
-
|
| 356 |
-
# 其他变体
|
| 357 |
-
r'想要去(\w+)', r'渴望去(\w+)', r'梦想去(\w+)', r'向往(\w+)', r'憧憬(\w+)',
|
| 358 |
-
r'安排去(\w+)', r'规划去(\w+)', r'预定(\w+)', r'订(\w+)的票', r'买(\w+)机票'
|
| 359 |
]
|
| 360 |
|
| 361 |
-
#
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
|
| 367 |
-
#
|
| 368 |
-
if
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
|
|
|
| 372 |
|
| 373 |
-
#
|
| 374 |
-
if
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
break
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
break
|
| 402 |
-
if result:
|
| 403 |
-
break
|
| 404 |
|
| 405 |
return result
|
| 406 |
|
| 407 |
-
def
|
| 408 |
-
"""
|
| 409 |
-
|
|
|
|
| 410 |
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
r'行程(\d+)天', r'旅程(\d+)天', r'假期(\d+)天', r'休假(\d+)天',
|
| 424 |
-
|
| 425 |
-
# 时间修饰词
|
| 426 |
-
r'大概(\d+)天', r'约(\d+)天', r'差不多(\d+)天', r'左右(\d+)天', r'上下(\d+)天',
|
| 427 |
-
r'最多(\d+)天', r'最少(\d+)天', r'至少(\d+)天', r'不超过(\d+)天', r'超过(\d+)天',
|
| 428 |
-
r'将近(\d+)天', r'接近(\d+)天', r'快(\d+)天', r'足足(\d+)天', r'整整(\d+)天',
|
| 429 |
-
|
| 430 |
-
# 周相关
|
| 431 |
-
r'(\d+)周', r'(\d+)个周', r'(\d+)星期', r'(\d+)个星期', r'(\d+)礼拜', r'(\d+)个礼拜',
|
| 432 |
-
r'玩(\d+)周', r'住(\d+)周', r'呆(\d+)周', r'待(\d+)周', r'旅行(\d+)周',
|
| 433 |
-
r'(\d+)周的行程', r'(\d+)星期的旅行', r'(\d+)个礼拜的假期',
|
| 434 |
-
|
| 435 |
-
# 月相关
|
| 436 |
-
r'(\d+)月', r'(\d+)个月', r'(\d+)个月份',
|
| 437 |
-
r'玩(\d+)个月', r'住(\d+)个月', r'旅行(\d+)个月', r'度假(\d+)个月',
|
| 438 |
-
r'(\d+)个月的行程', r'(\d+)月的旅行', r'(\d+)个月的假期',
|
| 439 |
-
|
| 440 |
-
# 范围表达
|
| 441 |
-
r'(\d+)-(\d+)天', r'(\d+)到(\d+)天', r'(\d+)至(\d+)天', r'(\d+)~(\d+)天',
|
| 442 |
-
r'(\d+)天到(\d+)天', r'从(\d+)天到(\d+)天', r'介于(\d+)到(\d+)天',
|
| 443 |
-
|
| 444 |
-
# 中文数字
|
| 445 |
-
r'一天', r'二天', r'三天', r'四天', r'五天', r'六天', r'七天', r'八天', r'九天', r'十天',
|
| 446 |
-
r'两天', r'俩天', r'仨天', r'半天', r'一天半', r'两天半', r'三天半',
|
| 447 |
-
r'十一天', r'十二天', r'十三天', r'十四天', r'十五天', r'二十天', r'三十天',
|
| 448 |
-
|
| 449 |
-
# 特殊时长表达
|
| 450 |
-
r'周末', r'长周末', r'小长假', r'长假', r'黄金周', r'假期',
|
| 451 |
-
r'十一', r'国庆', r'春节', r'过年', r'五一', r'劳动节', r'清明', r'端午', r'中秋', r'元旦',
|
| 452 |
-
r'暑假', r'寒假', r'年假', r'蜜月', r'度蜜月',
|
| 453 |
-
r'短途', r'中途', r'长途', r'快闪', r'一日游', r'两日游', r'三日游', r'多日游'
|
| 454 |
-
]
|
| 455 |
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
|
| 462 |
-
if
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
-
|
| 484 |
-
if days and 0.5 <= days <= 365:
|
| 485 |
-
result["days"] = int(days) if days >= 1 else days
|
| 486 |
-
|
| 487 |
# 添加描述信息
|
| 488 |
if days <= 1:
|
| 489 |
-
|
| 490 |
elif days <= 3:
|
| 491 |
-
|
| 492 |
elif days <= 7:
|
| 493 |
-
|
| 494 |
elif days <= 14:
|
| 495 |
-
|
| 496 |
elif days <= 30:
|
| 497 |
-
|
| 498 |
else:
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
if not isinstance(match, tuple) and not match.isdigit():
|
| 503 |
-
result["description"] = match
|
| 504 |
-
|
| 505 |
-
break
|
| 506 |
-
if result:
|
| 507 |
break
|
| 508 |
|
| 509 |
return result
|
| 510 |
|
| 511 |
-
def
|
| 512 |
-
"""
|
| 513 |
result = {}
|
| 514 |
-
text_lower = text.lower()
|
| 515 |
-
|
| 516 |
-
# 欧洲旅行常用货币的金额提取模式
|
| 517 |
-
amount_patterns = [
|
| 518 |
-
# === 欧元表达 - 优先级最高(欧洲旅行主要货币) ===
|
| 519 |
-
r'(\d+)欧元', r'(\d+)欧', r'€(\d+)', r'EUR(\d+)', r'eur(\d+)',
|
| 520 |
-
r'(\d+)euro', r'(\d+)Euro', r'(\d+)EURO',
|
| 521 |
-
r'(\d+\.?\d*)欧元', r'€(\d+\.?\d*)',
|
| 522 |
-
r'预算(\d+)欧', r'花费(\d+)欧', r'大概(\d+)欧', r'约(\d+)欧',
|
| 523 |
-
|
| 524 |
-
# === 人民币表达 ===
|
| 525 |
-
r'(\d+)元', r'(\d+)块', r'(\d+)块钱', r'(\d+)人民币', r'(\d+)rmb', r'(\d+)RMB',
|
| 526 |
-
r'¥(\d+)', r'¥(\d+)', r'CNY(\d+)', r'cny(\d+)',
|
| 527 |
-
|
| 528 |
-
# === 美元表达 ===
|
| 529 |
-
r'(\d+)美元', r'(\d+)美刀', r'(\d+)刀', r'\$(\d+)', r'USD(\d+)', r'usd(\d+)',
|
| 530 |
-
r'(\d+)dollar', r'(\d+)Dollar',
|
| 531 |
-
|
| 532 |
-
# === 英镑表达(英国旅行) ===
|
| 533 |
-
r'(\d+)英镑', r'(\d+)镑', r'£(\d+)', r'GBP(\d+)', r'gbp(\d+)',
|
| 534 |
-
r'(\d+)pound', r'(\d+)Pound',
|
| 535 |
-
|
| 536 |
-
# === 瑞士法郎(瑞士旅行) ===
|
| 537 |
-
r'(\d+)瑞士法郎', r'(\d+)法郎', r'CHF(\d+)', r'chf(\d+)',
|
| 538 |
-
r'(\d+)瑞郎', r'(\d+)swiss franc',
|
| 539 |
-
|
| 540 |
-
# === 预算相关表达 ===
|
| 541 |
-
r'预算(\d+)', r'预算是(\d+)', r'预算大概(\d+)', r'预算约(\d+)',
|
| 542 |
-
r'预算差不多(\d+)', r'预算在(\d+)', r'预算控制在(\d+)',
|
| 543 |
-
r'预算不超过(\d+)', r'预算最多(\d+)', r'预算最少(\d+)',
|
| 544 |
-
|
| 545 |
-
# === 花费相关表达 ===
|
| 546 |
-
r'花(\d+)', r'花费(\d+)', r'花销(\d+)', r'开销(\d+)', r'支出(\d+)',
|
| 547 |
-
r'费用(\d+)', r'成本(\d+)', r'总共(\d+)', r'一共(\d+)', r'总计(\d+)',
|
| 548 |
-
|
| 549 |
-
# === 万元表达 ===
|
| 550 |
-
r'(\d+)万', r'(\d+)万元', r'(\d+)万块', r'(\d+)万人民币',
|
| 551 |
-
r'(\d+)万欧', r'(\d+)万欧元', r'(\d+)万美元', r'(\d+)万英镑',
|
| 552 |
-
r'(\d+\.?\d*)万', r'(\d+\.?\d*)万元',
|
| 553 |
-
|
| 554 |
-
# === 千元表达 ===
|
| 555 |
-
r'(\d+)千', r'(\d+)千元', r'(\d+)千块', r'(\d+)k', r'(\d+)K',
|
| 556 |
-
r'(\d+)千欧', r'(\d+)千美元', r'(\d+)千英镑',
|
| 557 |
-
|
| 558 |
-
# === 范围表达 ===
|
| 559 |
-
r'(\d+)-(\d+)', r'(\d+)到(\d+)', r'(\d+)至(\d+)', r'(\d+)~(\d+)',
|
| 560 |
-
r'(\d+)左右', r'约(\d+)', r'差不多(\d+)', r'大概(\d+)',
|
| 561 |
-
|
| 562 |
-
# === 每人/每天相关 ===
|
| 563 |
-
r'每人(\d+)', r'人均(\d+)', r'单人(\d+)', r'每天(\d+)', r'日均(\d+)',
|
| 564 |
-
|
| 565 |
-
# === 中文数字金额 ===
|
| 566 |
-
r'一万', r'两万', r'三万', r'四万', r'五万', r'六万', r'七万', r'八万', r'九万', r'十万',
|
| 567 |
-
r'一千', r'两千', r'三千', r'四千', r'五千', r'六千', r'七千', r'八千', r'九千'
|
| 568 |
-
]
|
| 569 |
|
| 570 |
-
#
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
amount *= 10000
|
| 607 |
-
elif '千' in
|
| 608 |
amount *= 1000
|
| 609 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
result["amount"] = int(amount)
|
| 611 |
-
|
| 612 |
-
# 确定货币类型(针对欧洲旅行优化)
|
| 613 |
-
if any(keyword in pattern for keyword in ['欧元', '欧', '€', 'eur', 'euro']):
|
| 614 |
-
result["currency"] = "EUR"
|
| 615 |
-
elif any(keyword in pattern for keyword in ['英镑', '镑', '£', 'gbp', 'pound']):
|
| 616 |
-
result["currency"] = "GBP"
|
| 617 |
-
elif any(keyword in pattern for keyword in ['瑞士法郎', '法郎', '瑞郎', 'chf', 'swiss franc']):
|
| 618 |
-
result["currency"] = "CHF"
|
| 619 |
-
elif any(keyword in pattern for keyword in ['美元', '美刀', '刀', 'usd', 'dollar']):
|
| 620 |
-
result["currency"] = "USD"
|
| 621 |
-
else:
|
| 622 |
-
result["currency"] = "RMB"
|
| 623 |
break
|
| 624 |
-
if result.get("amount"):
|
| 625 |
-
break
|
| 626 |
|
| 627 |
-
#
|
| 628 |
budget_type_keywords = {
|
| 629 |
'economy': [
|
| 630 |
-
# 经济相关
|
| 631 |
'经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '青年',
|
| 632 |
'预算有限', '钱不多', '不贵', '划算', '性价比', '背包客',
|
| 633 |
-
'简单', '基础', '低成本', '节约', '
|
| 634 |
-
# 欧洲特色经济住宿
|
| 635 |
-
'青年旅社', '青旅', 'hostel', '民宿', 'airbnb', '客栈',
|
| 636 |
-
'多人间', '床位', '宿舍', '胶囊', 'capsule',
|
| 637 |
-
# 欧洲经济交通
|
| 638 |
-
'大巴', '长途汽车', 'flixbus', '火车', '二等座', '经济舱',
|
| 639 |
-
'欧洲通票', '青年票', '学生票', '团体票',
|
| 640 |
-
# 经济餐饮
|
| 641 |
-
'自己做饭', '超市', '便利店', '快餐', '街头小吃', '外卖',
|
| 642 |
-
'麦当劳', '汉堡王', 'kebab', 'döner'
|
| 643 |
],
|
| 644 |
'comfortable': [
|
| 645 |
-
# 舒适相关
|
| 646 |
'舒适', '中等', '适中', '一般', '标准', '普通', '正常', '常规',
|
| 647 |
-
'中档', '中级', '合理', '平均', '中间档次', '
|
| 648 |
-
# 欧洲中档住宿
|
| 649 |
-
'三星', '四星', '酒店', 'hotel', '标间', '双人间', '大床房',
|
| 650 |
-
'民宿', 'apartment', '公寓', 'b&b', 'pension',
|
| 651 |
-
# 欧洲舒适交通
|
| 652 |
-
'火车', '一等座', '高铁', 'tgv', 'ice', '城际列车',
|
| 653 |
-
'租车', '自驾', '商务舱', '直飞',
|
| 654 |
-
# 中档餐饮
|
| 655 |
-
'餐厅', '当地菜', '特色菜', '中档餐厅', '酒吧', 'bistro'
|
| 656 |
],
|
| 657 |
'luxury': [
|
| 658 |
-
|
| 659 |
-
'
|
| 660 |
-
'
|
| 661 |
-
'高消费', '享受', '奢享', '尊贵', '至尊', 'VIP',
|
| 662 |
-
# 欧洲豪华住宿
|
| 663 |
-
'五星', '六星', '豪华酒店', 'luxury hotel', '度假村', 'resort',
|
| 664 |
-
'别墅', 'villa', '城堡', 'castle', '套房', 'suite', '总统套房',
|
| 665 |
-
'丽思卡尔顿', '四季', '文华东方', '半岛', '香格里拉', '希尔顿',
|
| 666 |
-
'ritz carlton', 'four seasons', 'mandarin oriental', 'peninsula',
|
| 667 |
-
# 豪华交通
|
| 668 |
-
'头等舱', '商务舱', '私人飞机', 'private jet', '豪车', '奔驰', '宝马',
|
| 669 |
-
'奥迪', '保时捷', '法拉利', '兰博基尼', 'mercedes', 'bmw', 'audi',
|
| 670 |
-
# 奢华服务
|
| 671 |
-
'私人导游', '管家服务', 'concierge', '司机', '专车', '包车',
|
| 672 |
-
'定制旅行', '私人订制', '一对一服务', 'vip通道',
|
| 673 |
-
# 高端餐饮
|
| 674 |
-
'米其林', 'michelin', '米其林三星', '米其林餐厅', '高档餐厅',
|
| 675 |
-
'法式大餐', '意式料理', '分子料理', '酒庄', 'wine tasting'
|
| 676 |
]
|
| 677 |
}
|
| 678 |
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 685 |
break
|
| 686 |
|
| 687 |
-
#
|
| 688 |
if result.get("amount") and not result.get("type"):
|
| 689 |
amount = result["amount"]
|
| 690 |
currency = result.get("currency", "RMB")
|
| 691 |
|
| 692 |
# 根据欧洲旅行成本设置阈值
|
| 693 |
if currency == "EUR":
|
| 694 |
-
if amount <
|
| 695 |
result["type"] = "economy"
|
| 696 |
result["description"] = "经济预算"
|
| 697 |
-
elif amount <
|
| 698 |
result["type"] = "comfortable"
|
| 699 |
result["description"] = "舒适预算"
|
| 700 |
-
else:
|
| 701 |
-
result["type"] = "luxury"
|
| 702 |
-
result["description"] = "豪华预算"
|
| 703 |
-
elif currency == "GBP":
|
| 704 |
-
if amount < 40: # 每天40英镑以下
|
| 705 |
-
result["type"] = "economy"
|
| 706 |
-
result["description"] = "经济预算"
|
| 707 |
-
elif amount < 120: # 每天40-120英镑
|
| 708 |
-
result["type"] = "comfortable"
|
| 709 |
-
result["description"] = "舒适预算"
|
| 710 |
-
else: # 每天120英镑以上
|
| 711 |
result["type"] = "luxury"
|
| 712 |
result["description"] = "豪华预算"
|
| 713 |
-
elif currency == "
|
| 714 |
-
if amount <
|
| 715 |
result["type"] = "economy"
|
| 716 |
result["description"] = "经济预算"
|
| 717 |
-
elif amount <
|
| 718 |
result["type"] = "comfortable"
|
| 719 |
result["description"] = "舒适预算"
|
| 720 |
-
else:
|
| 721 |
result["type"] = "luxury"
|
| 722 |
result["description"] = "豪华预算"
|
| 723 |
elif currency == "RMB":
|
| 724 |
-
if amount <
|
| 725 |
-
result["type"] = "economy"
|
| 726 |
-
result["description"] = "经济预算"
|
| 727 |
-
elif amount < 800: # 每天300-800元
|
| 728 |
-
result["type"] = "comfortable"
|
| 729 |
-
result["description"] = "舒适预算"
|
| 730 |
-
else: # 每天800元以上
|
| 731 |
-
result["type"] = "luxury"
|
| 732 |
-
result["description"] = "豪华预算"
|
| 733 |
-
elif currency == "USD":
|
| 734 |
-
if amount < 60: # 每天60美元以下
|
| 735 |
result["type"] = "economy"
|
| 736 |
result["description"] = "经济预算"
|
| 737 |
-
elif amount <
|
| 738 |
result["type"] = "comfortable"
|
| 739 |
result["description"] = "舒适预算"
|
| 740 |
-
else:
|
| 741 |
result["type"] = "luxury"
|
| 742 |
result["description"] = "豪华预算"
|
| 743 |
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
return False
|
| 750 |
-
|
| 751 |
-
# 排除数字和常见的非地名词汇
|
| 752 |
-
invalid_words = [
|
| 753 |
-
# 数字和时间
|
| 754 |
-
'天', '日', '号', '月', '年', '周', '小时', '分钟', '秒',
|
| 755 |
-
# 金钱相关
|
| 756 |
-
'元', '块', '钱', '万', '千', '百', '预算', '费用', '成本', '价格',
|
| 757 |
-
'美元', '欧元', '英镑', '瑞郎', '法郎',
|
| 758 |
-
# 旅行相关动词
|
| 759 |
-
'花', '费', '旅行', '旅游', '行程', '计划', '想', '去', '到', '的',
|
| 760 |
-
'在', '是', '个', '了', '和', '与', '或', '但', '而', '就', '都',
|
| 761 |
-
# 其他常见词
|
| 762 |
-
'人', '我', '你', '他', '她', '们', '这', '那', '什么', '怎么',
|
| 763 |
-
'好', '很', '非常', '特别', '大', '小', '新', '老'
|
| 764 |
-
]
|
| 765 |
-
|
| 766 |
-
if name.isdigit() or name in invalid_words:
|
| 767 |
-
return False
|
| 768 |
-
|
| 769 |
-
# 检查是否包含数字(地名通常不包含数字)
|
| 770 |
-
if any(char.isdigit() for char in name):
|
| 771 |
-
return False
|
| 772 |
-
|
| 773 |
-
# 检查是否在欧洲城市列表中
|
| 774 |
-
if name in self.european_cities:
|
| 775 |
-
return True
|
| 776 |
-
|
| 777 |
-
# 检查是否在别名列表中
|
| 778 |
-
if name in self.european_city_aliases or name.lower() in self.european_city_aliases:
|
| 779 |
-
return True
|
| 780 |
-
|
| 781 |
-
# 城市名称长度检查
|
| 782 |
-
if len(name) > 15:
|
| 783 |
-
return False
|
| 784 |
-
|
| 785 |
-
# 检查是否包含特殊字符
|
| 786 |
-
if any(char in name for char in '!@#$%^&*()+={}[]|\\:";\'<>?,.`~'):
|
| 787 |
-
return False
|
| 788 |
-
|
| 789 |
-
return False # 只接受明确在欧洲城市列表中的城市
|
| 790 |
-
|
| 791 |
-
def _is_valid_european_country(self, name: str) -> bool:
|
| 792 |
-
"""验证是否为有效的欧洲国家名称"""
|
| 793 |
-
if not name or len(name) < 2:
|
| 794 |
-
return False
|
| 795 |
-
|
| 796 |
-
# 欧洲国家列表
|
| 797 |
-
european_countries = {
|
| 798 |
-
# 西欧
|
| 799 |
-
'法国', '德国', '英国', '荷兰', '比利时', '卢森堡',
|
| 800 |
-
# 南欧
|
| 801 |
-
'意大利', '西班牙', '葡萄牙', '希腊', '马耳他', '塞浦路斯',
|
| 802 |
-
# 中欧
|
| 803 |
-
'奥地利', '瑞士', '捷克', '斯洛伐克', '匈牙利', '波兰', '斯洛文尼亚',
|
| 804 |
-
# 北欧
|
| 805 |
-
'瑞典', '挪威', '丹麦', '芬兰', '冰岛',
|
| 806 |
-
# 东欧
|
| 807 |
-
'俄罗斯', '乌克兰', '白俄罗斯', '立陶宛', '拉脱维亚', '爱沙尼亚', '摩尔多瓦',
|
| 808 |
-
# 巴尔干半岛
|
| 809 |
-
'克罗地亚', '塞尔维亚', '波黑', '黑山', '北马其顿', '阿尔巴尼亚',
|
| 810 |
-
'保加利亚', '罗马尼亚', '土耳其'
|
| 811 |
}
|
| 812 |
|
| 813 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 814 |
|
| 815 |
# 保持向后兼容的验证方法
|
| 816 |
def _validate_and_normalize(self, data: dict) -> dict:
|
|
|
|
| 1 |
import json
|
| 2 |
import re
|
| 3 |
from utils.logger import log
|
| 4 |
+
import jieba
|
| 5 |
+
from typing import List, Tuple
|
| 6 |
|
| 7 |
class InfoExtractor:
|
| 8 |
def __init__(self):
|
| 9 |
|
| 10 |
+
self._init_tockenizer()
|
| 11 |
+
self._init_keyworkd_mapping()
|
| 12 |
+
|
| 13 |
self.extraction_schema = {
|
| 14 |
"destination": {"type": dict, "fields": {"name": str, "country": str}},
|
| 15 |
"duration": {"type": dict, "fields": {"days": int, "description": str}},
|
|
|
|
| 271 |
"翡翠岛": "都柏林", "欧洲之都": "布鲁塞尔", "高迪之城": "巴塞罗那",
|
| 272 |
}
|
| 273 |
|
|
|
|
| 274 |
self.chinese_numbers = {
|
| 275 |
'一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
|
| 276 |
'两': 2, '半': 0.5, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
|
| 277 |
+
# 英文数字
|
| 278 |
+
'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
|
| 279 |
+
'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15,
|
| 280 |
# 特殊时长表达
|
| 281 |
'半个月': 15, '一个月': 30, '半年': 180, '一年': 365,
|
| 282 |
'半天': 0.5, '一天': 1, '两天': 2, '三天': 3, '四天': 4, '五天': 5, '六天': 6, '七天': 7,
|
| 283 |
'八天': 8, '九天': 9, '十天': 10, '半周': 3.5, '一周': 7, '两周': 14,
|
| 284 |
# 假期相关
|
| 285 |
'小长假': 3, '长假': 7, '十一': 7, '国庆': 7, '春节': 7, '五一': 3, '清明': 3,
|
| 286 |
+
'端午': 3, '中秋': 3, '元旦': 3, '暑假': 60, '寒假': 30, '周末': 2, '长周末': 3,
|
| 287 |
+
# 英文假期
|
| 288 |
+
'weekend': 2, 'week': 7, 'month': 30, 'vacation': 7, 'holiday': 3
|
| 289 |
}
|
| 290 |
|
| 291 |
+
def extract(self, user_message: str) -> dict:
|
| 292 |
+
"""使用分词策略进行信息提取"""
|
| 293 |
|
| 294 |
# 输入验证
|
| 295 |
if not user_message or not isinstance(user_message, str):
|
|
|
|
| 300 |
log.warning("⚠️ 用户消息过短,跳过信息提取")
|
| 301 |
return {}
|
| 302 |
|
| 303 |
+
log.info(f"🛠️ 使用分词策略提取信息:'{user_message[:50]}...'")
|
| 304 |
+
|
| 305 |
+
# 1. 智能分词
|
| 306 |
+
tokens = self._tokenize_message(user_message)
|
| 307 |
+
log.info(f"📝 分词结果:{tokens}")
|
| 308 |
|
| 309 |
+
# 2. 基于分词进行信息提取
|
| 310 |
result = {}
|
| 311 |
|
| 312 |
+
# 提取目的地信息
|
| 313 |
+
destination_info = self._extract_destination_from_tokens(tokens)
|
| 314 |
if destination_info:
|
| 315 |
result["destination"] = destination_info
|
| 316 |
|
| 317 |
+
# 提取时长信息
|
| 318 |
+
duration_info = self._extract_duration_from_tokens(tokens)
|
| 319 |
if duration_info:
|
| 320 |
result["duration"] = duration_info
|
| 321 |
|
| 322 |
+
# 提取预算信息
|
| 323 |
+
budget_info = self._extract_budget_from_tokens(tokens)
|
| 324 |
if budget_info:
|
| 325 |
result["budget"] = budget_info
|
| 326 |
|
| 327 |
+
log.info(f"📊 分词提取结果: {result}")
|
| 328 |
return result
|
| 329 |
|
| 330 |
+
def _tokenize_message(self, text: str) -> list:
|
| 331 |
+
"""智能分词,支持中英文混合"""
|
| 332 |
+
|
| 333 |
+
# 预处理:统一标点符号和空格
|
| 334 |
+
text = text.replace(',', ',').replace('。', '.').replace('!', '!').replace('?', '?')
|
| 335 |
+
text = text.replace('(', '(').replace(')', ')').replace('【', '[').replace('】', ']')
|
| 336 |
+
|
| 337 |
+
tokens = []
|
| 338 |
+
current_token = ""
|
| 339 |
+
i = 0
|
| 340 |
+
|
| 341 |
+
while i < len(text):
|
| 342 |
+
char = text[i]
|
| 343 |
+
|
| 344 |
+
# 处理空格和标点符号
|
| 345 |
+
if char in ' ,,.。!!??()()[]【】::;;':
|
| 346 |
+
if current_token:
|
| 347 |
+
tokens.append(current_token)
|
| 348 |
+
current_token = ""
|
| 349 |
+
if char.strip(): # 保留非空格的标点符号
|
| 350 |
+
tokens.append(char)
|
| 351 |
+
i += 1
|
| 352 |
+
continue
|
| 353 |
+
|
| 354 |
+
# 处理数字(包括小数和货币符号)
|
| 355 |
+
if char.isdigit() or char in '¥$€£₩':
|
| 356 |
+
if current_token and not (current_token[-1].isdigit() or current_token[-1] in '¥$€£₩.'):
|
| 357 |
+
tokens.append(current_token)
|
| 358 |
+
current_token = char
|
| 359 |
+
else:
|
| 360 |
+
current_token += char
|
| 361 |
+
|
| 362 |
+
# 继续读取数字部分
|
| 363 |
+
i += 1
|
| 364 |
+
while i < len(text) and (text[i].isdigit() or text[i] in '.,'):
|
| 365 |
+
current_token += text[i]
|
| 366 |
+
i += 1
|
| 367 |
+
|
| 368 |
+
# 检查货币单位
|
| 369 |
+
currency_units = ['元', '块', '钱', '欧', '美元', '英镑', '日元', '韩元', '瑞郎', 'rmb', 'usd', 'eur', 'gbp', 'jpy', 'krw', 'chf']
|
| 370 |
+
remaining_text = text[i:].lower()
|
| 371 |
+
for unit in currency_units:
|
| 372 |
+
if remaining_text.startswith(unit):
|
| 373 |
+
current_token += text[i:i+len(unit)]
|
| 374 |
+
i += len(unit)
|
| 375 |
+
break
|
| 376 |
+
|
| 377 |
+
tokens.append(current_token)
|
| 378 |
+
current_token = ""
|
| 379 |
+
continue
|
| 380 |
+
|
| 381 |
+
# 处理英文单词
|
| 382 |
+
if char.isalpha() and ord(char) < 128: # ASCII字符
|
| 383 |
+
if current_token and not current_token[-1].isalpha():
|
| 384 |
+
tokens.append(current_token)
|
| 385 |
+
current_token = char
|
| 386 |
+
else:
|
| 387 |
+
current_token += char
|
| 388 |
+
|
| 389 |
+
# 继续读取英文字符
|
| 390 |
+
i += 1
|
| 391 |
+
while i < len(text) and text[i].isalpha() and ord(text[i]) < 128:
|
| 392 |
+
current_token += text[i]
|
| 393 |
+
i += 1
|
| 394 |
+
|
| 395 |
+
tokens.append(current_token)
|
| 396 |
+
current_token = ""
|
| 397 |
+
continue
|
| 398 |
+
|
| 399 |
+
# 处理中文字符
|
| 400 |
+
if self._is_chinese_char(char):
|
| 401 |
+
if current_token and not self._is_chinese_char(current_token[-1]):
|
| 402 |
+
tokens.append(current_token)
|
| 403 |
+
current_token = ""
|
| 404 |
+
|
| 405 |
+
# 对于中文,我们需要智能分词
|
| 406 |
+
# 检查是否是多字符城市名、时间表达等
|
| 407 |
+
remaining_text = text[i:]
|
| 408 |
+
|
| 409 |
+
# 尝试匹配城市名
|
| 410 |
+
matched_city = self._match_city_name(remaining_text)
|
| 411 |
+
if matched_city:
|
| 412 |
+
tokens.append(matched_city)
|
| 413 |
+
i += len(matched_city)
|
| 414 |
+
continue
|
| 415 |
+
|
| 416 |
+
# 尝试匹配时间表达
|
| 417 |
+
matched_time = self._match_time_expression(remaining_text)
|
| 418 |
+
if matched_time:
|
| 419 |
+
tokens.append(matched_time)
|
| 420 |
+
i += len(matched_time)
|
| 421 |
+
continue
|
| 422 |
+
|
| 423 |
+
# 尝试匹配预算类型关键词
|
| 424 |
+
matched_budget_type = self._match_budget_type(remaining_text)
|
| 425 |
+
if matched_budget_type:
|
| 426 |
+
tokens.append(matched_budget_type)
|
| 427 |
+
i += len(matched_budget_type)
|
| 428 |
+
continue
|
| 429 |
+
|
| 430 |
+
# 尝试匹配常见词汇
|
| 431 |
+
matched_word = self._match_common_word(remaining_text)
|
| 432 |
+
if matched_word:
|
| 433 |
+
tokens.append(matched_word)
|
| 434 |
+
i += len(matched_word)
|
| 435 |
+
continue
|
| 436 |
+
|
| 437 |
+
# 单个中文字符
|
| 438 |
+
tokens.append(char)
|
| 439 |
+
i += 1
|
| 440 |
+
else:
|
| 441 |
+
# 其他字符
|
| 442 |
+
current_token += char
|
| 443 |
+
i += 1
|
| 444 |
+
|
| 445 |
+
# 处理最后的token
|
| 446 |
+
if current_token:
|
| 447 |
+
tokens.append(current_token)
|
| 448 |
+
|
| 449 |
+
# 后处理:合并一些相关的tokens
|
| 450 |
+
tokens = self._post_process_tokens(tokens)
|
| 451 |
+
|
| 452 |
+
return [token for token in tokens if token.strip()] # 过滤空token
|
| 453 |
+
|
| 454 |
+
def _is_chinese_char(self, char: str) -> bool:
|
| 455 |
+
"""判断是否为中文字符"""
|
| 456 |
+
return '\u4e00' <= char <= '\u9fff'
|
| 457 |
+
|
| 458 |
+
def _match_city_name(self, text: str) -> str:
|
| 459 |
+
"""匹配城市名称"""
|
| 460 |
+
# 按长度从长到短排序,优先匹配长的城市名
|
| 461 |
+
all_cities = list(self.european_cities.keys()) + list(self.european_city_aliases.keys())
|
| 462 |
+
all_cities = sorted(set(all_cities), key=len, reverse=True)
|
| 463 |
+
|
| 464 |
+
for city in all_cities:
|
| 465 |
+
if text.startswith(city):
|
| 466 |
+
return city
|
| 467 |
+
return ""
|
| 468 |
+
|
| 469 |
+
def _match_time_expression(self, text: str) -> str:
|
| 470 |
+
"""匹配时间表达"""
|
| 471 |
+
time_expressions = [
|
| 472 |
+
# 多字符时间表达
|
| 473 |
+
'半个月', '一个月', '两个月', '三个月', '半年', '一年',
|
| 474 |
+
'小长假', '长周末', '国庆节', '春节假期', '暑假', '寒假',
|
| 475 |
+
'一天半', '两天半', '三天半', '一周半', '两周',
|
| 476 |
+
# 英文时��表达
|
| 477 |
+
'one day', 'two days', 'three days', 'one week', 'two weeks',
|
| 478 |
+
'long weekend', 'vacation', 'holiday', 'spring break'
|
| 479 |
+
]
|
| 480 |
|
| 481 |
+
# 按长度排序,优先匹配长表达
|
| 482 |
+
time_expressions = sorted(time_expressions, key=len, reverse=True)
|
| 483 |
+
|
| 484 |
+
text_lower = text.lower()
|
| 485 |
+
for expr in time_expressions:
|
| 486 |
+
if text_lower.startswith(expr.lower()):
|
| 487 |
+
return expr
|
| 488 |
+
if text.startswith(expr):
|
| 489 |
+
return expr
|
| 490 |
+
return ""
|
| 491 |
+
|
| 492 |
+
def _match_budget_type(self, text: str) -> str:
|
| 493 |
+
"""匹配预算类型关键词"""
|
| 494 |
+
budget_keywords = [
|
| 495 |
+
# 经济型
|
| 496 |
+
'经济实惠', '省钱', '便宜', '实惠', '经济', '穷游', '背包客',
|
| 497 |
+
'青年旅社', '学生', '预算有限', '性价比',
|
| 498 |
+
# 舒适型
|
| 499 |
+
'舒适', '中等', '适中', '标准', '普通', '中档', '合理',
|
| 500 |
+
# 豪华型
|
| 501 |
+
'豪华', '奢华', '高端', '顶级', '精品', '五星', '不差钱',
|
| 502 |
+
'任性', '土豪', 'VIP', '贵族', '皇家'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
]
|
| 504 |
|
| 505 |
+
# 按长度排序
|
| 506 |
+
budget_keywords = sorted(budget_keywords, key=len, reverse=True)
|
| 507 |
+
|
| 508 |
+
for keyword in budget_keywords:
|
| 509 |
+
if text.startswith(keyword):
|
| 510 |
+
return keyword
|
| 511 |
+
return ""
|
| 512 |
+
|
| 513 |
+
def _match_common_word(self, text: str) -> str:
|
| 514 |
+
"""匹配常见词汇"""
|
| 515 |
+
common_words = [
|
| 516 |
+
# 旅行相关动词
|
| 517 |
+
'想去', '计划去', '打算去', '准备去', '希望去', '考虑去',
|
| 518 |
+
'前往', '旅行', '旅游', '游玩', '度假', '出发', '飞往',
|
| 519 |
+
# 时间相关
|
| 520 |
+
'三天', '四天', '五天', '六天', '七天', '八天', '九天', '十天',
|
| 521 |
+
'一天', '两天', '几天', '多天', '数天',
|
| 522 |
+
# 预算相关
|
| 523 |
+
'预算', '花费', '费用', '成本', '开销', '支出', '消费',
|
| 524 |
+
'总共', '一共', '大概', '约', '左右', '差不多',
|
| 525 |
+
# 其他
|
| 526 |
+
'行程', '计划', '安排', '路线', '攻略'
|
| 527 |
+
]
|
| 528 |
+
|
| 529 |
+
# 按长度排序
|
| 530 |
+
common_words = sorted(common_words, key=len, reverse=True)
|
| 531 |
+
|
| 532 |
+
for word in common_words:
|
| 533 |
+
if text.startswith(word):
|
| 534 |
+
return word
|
| 535 |
+
return ""
|
| 536 |
+
|
| 537 |
+
def _post_process_tokens(self, tokens: list) -> list:
|
| 538 |
+
"""后处理tokens,合并相关的片段"""
|
| 539 |
+
if not tokens:
|
| 540 |
+
return tokens
|
| 541 |
+
|
| 542 |
+
processed = []
|
| 543 |
+
i = 0
|
| 544 |
+
|
| 545 |
+
while i < len(tokens):
|
| 546 |
+
current_token = tokens[i]
|
| 547 |
+
|
| 548 |
+
# 合并数字+单位的组合
|
| 549 |
+
if i < len(tokens) - 1:
|
| 550 |
+
next_token = tokens[i + 1]
|
| 551 |
|
| 552 |
+
# 数字 + 货币单位
|
| 553 |
+
if (current_token.isdigit() and
|
| 554 |
+
next_token.lower() in ['元', '块', '钱', '欧', '美元', '英镑', '日元', 'rmb', 'usd', 'eur', 'gbp', 'jpy']):
|
| 555 |
+
processed.append(current_token + next_token)
|
| 556 |
+
i += 2
|
| 557 |
+
continue
|
| 558 |
|
| 559 |
+
# 数字 + 时间单位
|
| 560 |
+
if (current_token.isdigit() and
|
| 561 |
+
next_token in ['天', '日', '周', '月', '年', 'days', 'weeks', 'months']):
|
| 562 |
+
processed.append(current_token + next_token)
|
| 563 |
+
i += 2
|
| 564 |
+
continue
|
| 565 |
+
|
| 566 |
+
# 预算 + 数字
|
| 567 |
+
if current_token == '预算' and next_token.replace('.', '').replace(',', '').isdigit():
|
| 568 |
+
if i < len(tokens) - 2 and tokens[i + 2] in ['元', '块', '钱', '欧', 'rmb', 'usd', 'eur']:
|
| 569 |
+
processed.append(current_token + next_token + tokens[i + 2])
|
| 570 |
+
i += 3
|
| 571 |
+
continue
|
| 572 |
+
else:
|
| 573 |
+
processed.append(current_token + next_token)
|
| 574 |
+
i += 2
|
| 575 |
+
continue
|
| 576 |
+
|
| 577 |
+
processed.append(current_token)
|
| 578 |
+
i += 1
|
| 579 |
|
| 580 |
+
return processed
|
| 581 |
+
|
| 582 |
+
def _extract_destination_from_tokens(self, tokens: list) -> dict:
|
| 583 |
+
"""从tokens中提取目的地信息"""
|
| 584 |
+
result = {}
|
| 585 |
|
| 586 |
+
# 查找城市名
|
| 587 |
+
for i, token in enumerate(tokens):
|
| 588 |
+
# 直接匹配城市名
|
| 589 |
+
city_name = self._normalize_city_name(token)
|
| 590 |
+
if city_name:
|
| 591 |
+
result["name"] = city_name
|
| 592 |
+
if city_name in self.european_cities:
|
| 593 |
+
result["country"] = self.european_cities[city_name]
|
| 594 |
+
break
|
| 595 |
+
|
| 596 |
+
# 检查是否在动词后面
|
| 597 |
+
if i > 0:
|
| 598 |
+
prev_token = tokens[i - 1]
|
| 599 |
+
if prev_token in ['去', '到', '想去', '前往', '旅行', '游', '玩', 'go', 'to', 'visit', 'travel']:
|
| 600 |
+
city_name = self._normalize_city_name(token)
|
| 601 |
+
if city_name:
|
| 602 |
+
result["name"] = city_name
|
| 603 |
+
if city_name in self.european_cities:
|
| 604 |
+
result["country"] = self.european_cities[city_name]
|
| 605 |
break
|
| 606 |
+
|
| 607 |
+
# 如果没有找到,尝试fuzzy匹配
|
| 608 |
+
if not result:
|
| 609 |
+
for token in tokens:
|
| 610 |
+
if len(token) >= 2:
|
| 611 |
+
# 模糊匹配城市名
|
| 612 |
+
for city, country in self.european_cities.items():
|
| 613 |
+
if token in city or city in token:
|
| 614 |
+
if len(token) >= len(city) * 0.6: # 相似度阈值
|
| 615 |
+
result["name"] = city
|
| 616 |
+
result["country"] = country
|
| 617 |
+
break
|
| 618 |
+
if result:
|
| 619 |
break
|
|
|
|
|
|
|
| 620 |
|
| 621 |
return result
|
| 622 |
|
| 623 |
+
def _normalize_city_name(self, token: str) -> str:
|
| 624 |
+
"""标准化城市名称"""
|
| 625 |
+
if not token:
|
| 626 |
+
return ""
|
| 627 |
|
| 628 |
+
token_lower = token.lower().strip()
|
| 629 |
+
|
| 630 |
+
# 直接匹配
|
| 631 |
+
if token in self.european_cities:
|
| 632 |
+
return token
|
| 633 |
+
|
| 634 |
+
# 别名匹配
|
| 635 |
+
if token_lower in self.european_city_aliases:
|
| 636 |
+
return self.european_city_aliases[token_lower]
|
| 637 |
+
|
| 638 |
+
if token in self.european_city_aliases:
|
| 639 |
+
return self.european_city_aliases[token]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
|
| 641 |
+
return ""
|
| 642 |
+
|
| 643 |
+
def _extract_duration_from_tokens(self, tokens: list) -> dict:
|
| 644 |
+
"""从tokens中提取时长信息"""
|
| 645 |
+
result = {}
|
| 646 |
+
|
| 647 |
+
for i, token in enumerate(tokens):
|
| 648 |
+
days = None
|
| 649 |
+
description = ""
|
| 650 |
+
|
| 651 |
+
# 处理 "数字+天" 的token
|
| 652 |
+
if re.match(r'^\d+[天日]$', token):
|
| 653 |
+
days = int(re.findall(r'\d+', token)[0])
|
| 654 |
+
|
| 655 |
+
# 处理 "数字+weeks/days" 的token
|
| 656 |
+
elif re.match(r'^\d+(days?|weeks?|months?)$', token.lower()):
|
| 657 |
+
number = int(re.findall(r'\d+', token)[0])
|
| 658 |
+
unit = re.findall(r'[a-zA-Z]+', token.lower())[0]
|
| 659 |
+
if unit.startswith('day'):
|
| 660 |
+
days = number
|
| 661 |
+
elif unit.startswith('week'):
|
| 662 |
+
days = number * 7
|
| 663 |
+
elif unit.startswith('month'):
|
| 664 |
+
days = number * 30
|
| 665 |
+
|
| 666 |
+
# 处理分离的数字和单位
|
| 667 |
+
elif token.isdigit() and i < len(tokens) - 1:
|
| 668 |
+
next_token = tokens[i + 1]
|
| 669 |
+
number = int(token)
|
| 670 |
|
| 671 |
+
if next_token in ['天', '日']:
|
| 672 |
+
days = number
|
| 673 |
+
elif next_token in ['周', '星期', '礼拜', 'week', 'weeks']:
|
| 674 |
+
days = number * 7
|
| 675 |
+
elif next_token in ['月', '个月', 'month', 'months']:
|
| 676 |
+
days = number * 30
|
| 677 |
+
|
| 678 |
+
# 处理中文数字
|
| 679 |
+
elif token in self.chinese_numbers:
|
| 680 |
+
days = self.chinese_numbers[token]
|
| 681 |
+
description = token
|
| 682 |
+
|
| 683 |
+
# 处理特殊时长表达
|
| 684 |
+
elif token in ['周末', 'weekend']:
|
| 685 |
+
days = 2
|
| 686 |
+
description = token
|
| 687 |
+
elif token in ['长周末', 'long weekend']:
|
| 688 |
+
days = 3
|
| 689 |
+
description = token
|
| 690 |
+
elif token in ['小长假', 'vacation', 'holiday']:
|
| 691 |
+
days = 3
|
| 692 |
+
description = token
|
| 693 |
+
elif token in ['十一', '国庆', 'national day']:
|
| 694 |
+
days = 7
|
| 695 |
+
description = token
|
| 696 |
+
elif token in ['春节', 'spring festival']:
|
| 697 |
+
days = 7
|
| 698 |
+
description = token
|
| 699 |
+
elif token in ['暑假', 'summer vacation']:
|
| 700 |
+
days = 60
|
| 701 |
+
description = token
|
| 702 |
+
elif token in ['寒假', 'winter vacation']:
|
| 703 |
+
days = 30
|
| 704 |
+
description = token
|
| 705 |
+
|
| 706 |
+
# 处理复合表达 "三天两夜"
|
| 707 |
+
elif re.match(r'^[一二三四五六七八九十\d]+天', token):
|
| 708 |
+
# 提取数字部分
|
| 709 |
+
for num_token in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']:
|
| 710 |
+
if token.startswith(num_token):
|
| 711 |
+
days = self.chinese_numbers[num_token]
|
| 712 |
+
description = token
|
| 713 |
+
break
|
| 714 |
+
if not days and token[0].isdigit():
|
| 715 |
+
days = int(token[0])
|
| 716 |
+
description = token
|
| 717 |
+
|
| 718 |
+
# 验证天数合理性并设置结果
|
| 719 |
+
if days and 0.5 <= days <= 365:
|
| 720 |
+
result["days"] = int(days) if days >= 1 else days
|
| 721 |
|
| 722 |
+
if not description:
|
|
|
|
|
|
|
|
|
|
| 723 |
# 添加描述信息
|
| 724 |
if days <= 1:
|
| 725 |
+
description = "当日往返"
|
| 726 |
elif days <= 3:
|
| 727 |
+
description = "短途旅行"
|
| 728 |
elif days <= 7:
|
| 729 |
+
description = "一周内旅行"
|
| 730 |
elif days <= 14:
|
| 731 |
+
description = "中长途旅行"
|
| 732 |
elif days <= 30:
|
| 733 |
+
description = "长途旅行"
|
| 734 |
else:
|
| 735 |
+
description = "超长途旅行"
|
| 736 |
+
|
| 737 |
+
result["description"] = description
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 738 |
break
|
| 739 |
|
| 740 |
return result
|
| 741 |
|
| 742 |
+
def _extract_budget_from_tokens(self, tokens: list) -> dict:
|
| 743 |
+
"""从tokens中提取预算信息"""
|
| 744 |
result = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 745 |
|
| 746 |
+
# 1. 查找金额
|
| 747 |
+
for i, token in enumerate(tokens):
|
| 748 |
+
amount = None
|
| 749 |
+
currency = "RMB" # 默认货币
|
| 750 |
+
|
| 751 |
+
# 处理包含货币的token "2000欧", "5000元"
|
| 752 |
+
currency_patterns = [
|
| 753 |
+
(r'(\d+(?:\.\d+)?)欧(?:元)?', 'EUR'),
|
| 754 |
+
(r'(\d+(?:\.\d+)?)元', 'RMB'),
|
| 755 |
+
(r'(\d+(?:\.\d+)?)块(?:钱)?', 'RMB'),
|
| 756 |
+
(r'(\d+(?:\.\d+)?)人民币', 'RMB'),
|
| 757 |
+
(r'(\d+(?:\.\d+)?)美元', 'USD'),
|
| 758 |
+
(r'(\d+(?:\.\d+)?)英镑', 'GBP'),
|
| 759 |
+
(r'(\d+(?:\.\d+)?)瑞(?:士)?法郎', 'CHF'),
|
| 760 |
+
(r'(\d+(?:\.\d+)?)日元', 'JPY'),
|
| 761 |
+
(r'(\d+(?:\.\d+)?)韩元', 'KRW'),
|
| 762 |
+
(r'¥(\d+(?:\.\d+)?)', 'RMB'),
|
| 763 |
+
(r'€(\d+(?:\.\d+)?)', 'EUR'),
|
| 764 |
+
(r'\$(\d+(?:\.\d+)?)', 'USD'),
|
| 765 |
+
(r'£(\d+(?:\.\d+)?)', 'GBP'),
|
| 766 |
+
(r'(\d+(?:\.\d+)?)rmb', 'RMB'),
|
| 767 |
+
(r'(\d+(?:\.\d+)?)usd', 'USD'),
|
| 768 |
+
(r'(\d+(?:\.\d+)?)eur', 'EUR'),
|
| 769 |
+
(r'(\d+(?:\.\d+)?)gbp', 'GBP'),
|
| 770 |
+
(r'(\d+(?:\.\d+)?)chf', 'CHF'),
|
| 771 |
+
]
|
| 772 |
+
|
| 773 |
+
for pattern, curr in currency_patterns:
|
| 774 |
+
match = re.search(pattern, token.lower())
|
| 775 |
+
if match:
|
| 776 |
+
amount = float(match.group(1))
|
| 777 |
+
currency = curr
|
| 778 |
+
break
|
| 779 |
+
|
| 780 |
+
# 处理纯数字token(需要查看上下文)
|
| 781 |
+
if not amount and re.match(r'^\d+(?:\.\d+)?
|
| 782 |
+
, token):
|
| 783 |
+
number = float(token)
|
| 784 |
|
| 785 |
+
# 检查前面的token是否有预算相关词汇
|
| 786 |
+
budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', '总共', '一共', 'budget', 'cost', 'spend']
|
| 787 |
+
has_budget_context = False
|
| 788 |
+
|
| 789 |
+
if i > 0 and tokens[i-1] in budget_indicators:
|
| 790 |
+
has_budget_context = True
|
| 791 |
+
elif i > 1 and tokens[i-2] in budget_indicators:
|
| 792 |
+
has_budget_context = True
|
| 793 |
+
|
| 794 |
+
# 检查后面是否有货币单位
|
| 795 |
+
if i < len(tokens) - 1:
|
| 796 |
+
next_token = tokens[i + 1].lower()
|
| 797 |
+
currency_units = {
|
| 798 |
+
'元': 'RMB', '块': 'RMB', '钱': 'RMB', '人民币': 'RMB',
|
| 799 |
+
'欧': 'EUR', '欧元': 'EUR', '美元': 'USD', '英镑': 'GBP',
|
| 800 |
+
'瑞郎': 'CHF', '日元': 'JPY', '韩元': 'KRW',
|
| 801 |
+
'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF'
|
| 802 |
+
}
|
| 803 |
+
|
| 804 |
+
if next_token in currency_units:
|
| 805 |
+
amount = number
|
| 806 |
+
currency = currency_units[next_token]
|
| 807 |
+
has_budget_context = True
|
| 808 |
|
| 809 |
+
# 如果有预算上下文但没有明确货币单位,根据数字大小推断
|
| 810 |
+
if has_budget_context and not amount:
|
| 811 |
+
if number < 100: # 可能是欧元或美元
|
| 812 |
+
# 查看是否有欧洲城市上下文
|
| 813 |
+
has_european_context = any(self._normalize_city_name(t) for t in tokens)
|
| 814 |
+
if has_european_context:
|
| 815 |
+
currency = 'EUR'
|
| 816 |
+
else:
|
| 817 |
+
currency = 'USD'
|
| 818 |
+
else:
|
| 819 |
+
currency = 'RMB' # 大数字更可能是人民币
|
| 820 |
+
amount = number
|
| 821 |
+
|
| 822 |
+
# 处理万、千等单位
|
| 823 |
+
if amount:
|
| 824 |
+
# 检查是否有万、千修饰符
|
| 825 |
+
if i > 0:
|
| 826 |
+
prev_token = tokens[i-1]
|
| 827 |
+
if '万' in prev_token or 'w' in prev_token.lower():
|
| 828 |
amount *= 10000
|
| 829 |
+
elif '千' in prev_token or 'k' in prev_token.lower():
|
| 830 |
amount *= 1000
|
| 831 |
+
elif i < len(tokens) - 1:
|
| 832 |
+
next_token = tokens[i+1]
|
| 833 |
+
if '万' in next_token or 'w' in next_token.lower():
|
| 834 |
+
amount *= 10000
|
| 835 |
+
elif '千' in next_token or 'k' in next_token.lower():
|
| 836 |
+
amount *= 1000
|
| 837 |
+
|
| 838 |
+
if amount > 0:
|
| 839 |
result["amount"] = int(amount)
|
| 840 |
+
result["currency"] = currency
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 841 |
break
|
|
|
|
|
|
|
| 842 |
|
| 843 |
+
# 2. 查找预算类型
|
| 844 |
budget_type_keywords = {
|
| 845 |
'economy': [
|
|
|
|
| 846 |
'经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '青年',
|
| 847 |
'预算有限', '钱不多', '不贵', '划算', '性价比', '背包客',
|
| 848 |
+
'简单', '基础', '低成本', '节约', 'budget', 'cheap', 'economy', 'affordable'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 849 |
],
|
| 850 |
'comfortable': [
|
|
|
|
| 851 |
'舒适', '中等', '适中', '一般', '标准', '普通', '正常', '常规',
|
| 852 |
+
'中档', '中级', '合理', '平均', '中间档次', 'comfortable', 'standard', 'moderate'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 853 |
],
|
| 854 |
'luxury': [
|
| 855 |
+
'豪华', '奢华', '高端', '顶级', '精品', '奢侈', '贵族', '皇家',
|
| 856 |
+
'贵一点', '不差钱', '任性', '土豪', '有钱', '五星', 'VIP',
|
| 857 |
+
'luxury', 'premium', 'high-end', 'expensive', 'fancy'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 858 |
]
|
| 859 |
}
|
| 860 |
|
| 861 |
+
for token in tokens:
|
| 862 |
+
token_lower = token.lower()
|
| 863 |
+
for budget_type, keywords in budget_type_keywords.items():
|
| 864 |
+
if any(keyword in token_lower for keyword in keywords):
|
| 865 |
+
result["type"] = budget_type
|
| 866 |
+
|
| 867 |
+
# 找到第一个匹配的关键词作为描述
|
| 868 |
+
for keyword in keywords:
|
| 869 |
+
if keyword in token_lower:
|
| 870 |
+
result["description"] = keyword if len(keyword) > 2 else token
|
| 871 |
+
break
|
| 872 |
+
break
|
| 873 |
+
if result.get("type"):
|
| 874 |
break
|
| 875 |
|
| 876 |
+
# 3. 如果有金额但没有类型,根据金额推断类型
|
| 877 |
if result.get("amount") and not result.get("type"):
|
| 878 |
amount = result["amount"]
|
| 879 |
currency = result.get("currency", "RMB")
|
| 880 |
|
| 881 |
# 根据欧洲旅行成本设置阈值
|
| 882 |
if currency == "EUR":
|
| 883 |
+
if amount < 1500: # 总预算
|
| 884 |
result["type"] = "economy"
|
| 885 |
result["description"] = "经济预算"
|
| 886 |
+
elif amount < 4000:
|
| 887 |
result["type"] = "comfortable"
|
| 888 |
result["description"] = "舒适预算"
|
| 889 |
+
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 890 |
result["type"] = "luxury"
|
| 891 |
result["description"] = "豪华预算"
|
| 892 |
+
elif currency == "USD":
|
| 893 |
+
if amount < 2000:
|
| 894 |
result["type"] = "economy"
|
| 895 |
result["description"] = "经济预算"
|
| 896 |
+
elif amount < 5000:
|
| 897 |
result["type"] = "comfortable"
|
| 898 |
result["description"] = "舒适预算"
|
| 899 |
+
else:
|
| 900 |
result["type"] = "luxury"
|
| 901 |
result["description"] = "豪华预算"
|
| 902 |
elif currency == "RMB":
|
| 903 |
+
if amount < 8000:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 904 |
result["type"] = "economy"
|
| 905 |
result["description"] = "经济预算"
|
| 906 |
+
elif amount < 20000:
|
| 907 |
result["type"] = "comfortable"
|
| 908 |
result["description"] = "舒适预算"
|
| 909 |
+
else:
|
| 910 |
result["type"] = "luxury"
|
| 911 |
result["description"] = "豪华预算"
|
| 912 |
|
| 913 |
+
# 4. 处理中文数字金额
|
| 914 |
+
chinese_money_mapping = {
|
| 915 |
+
'一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
|
| 916 |
+
'六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
|
| 917 |
+
'一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 918 |
}
|
| 919 |
|
| 920 |
+
if not result.get("amount"):
|
| 921 |
+
for token in tokens:
|
| 922 |
+
if token in chinese_money_mapping:
|
| 923 |
+
result["amount"] = chinese_money_mapping[token]
|
| 924 |
+
result["currency"] = "RMB"
|
| 925 |
+
break
|
| 926 |
+
|
| 927 |
+
return result
|
| 928 |
|
| 929 |
# 保持向后兼容的验证方法
|
| 930 |
def _validate_and_normalize(self, data: dict) -> dict:
|