Eliot0110 committed on
Commit
c22dbae
·
1 Parent(s): 7f130dd

improve: budget mapping

Browse files
Files changed (1) hide show
  1. modules/info_extractor.py +78 -83
modules/info_extractor.py CHANGED
@@ -756,90 +756,85 @@ class InfoExtractor:
756
  def _extract_budget_from_tokens(self, tokens: list) -> dict:
757
 
758
  result = {}
759
-
760
- # 1. 查找金额和货币单位(不再有默认值)
761
- for i, token in enumerate(tokens):
762
- amount = None
763
- currency = None # <--- 修改点:不再预设 "RMB",初始为 None
764
-
765
- # 处理包含货币的token "2000欧", "5000元"
766
- currency_patterns = [
767
- (r'(\d+(?:\.\d+)?)欧(?:元)?', 'EUR'),
768
- (r'(\d+(?:\.\d+)?)元', 'RMB'),
769
- (r'(\d+(?:\.\d+)?)块(?:钱)?', 'RMB'),
770
- (r'(\d+(?:\.\d+)?)人民币', 'RMB'),
771
- (r'(\d+(?:\.\d+)?)美元', 'USD'),
772
- (r'(\d+(?:\.\d+)?)英镑', 'GBP'),
773
- (r'(\d+(?:\.\d+)?)瑞(?:士)?法郎', 'CHF'),
774
- (r'(\d+(?:\.\d+)?)日元', 'JPY'),
775
- (r'(\d+(?:\.\d+)?)韩元', 'KRW'),
776
- (r'¥(\d+(?:\.\d+)?)', 'RMB'),
777
- (r'€(\d+(?:\.\d+)?)', 'EUR'),
778
- (r'\$(\d+(?:\.\d+)?)', 'USD'),
779
- (r'£(\d+(?:\.\d+)?)', 'GBP'),
780
- (r'(\d+(?:\.\d+)?)rmb', 'RMB'),
781
- (r'(\d+(?:\.\d+)?)usd', 'USD'),
782
- (r'(\d+(?:\.\d+)?)eur', 'EUR'),
783
- (r'(\d+(?:\.\d+)?)gbp', 'GBP'),
784
- (r'(\d+(?:\.\d+)?)chf', 'CHF'),
785
- ]
786
-
787
- for pattern, curr in currency_patterns:
788
- match = re.search(pattern, token.lower())
789
- if match:
790
- amount = float(match.group(1))
791
- currency = curr
792
- break
793
-
794
- # 处理纯数字token(需要查看上下文)
795
- if not amount and re.match(r'^\\d+(?:\\.\\d+)?$', token):
796
- number = float(token)
797
-
798
- # 检查前面的token是否有预算相关词汇
799
- budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', '总共', '一共', 'budget', 'cost', 'spend']
800
- has_budget_context = False
801
- if i > 0 and tokens[i-1] in budget_indicators:
802
- has_budget_context = True
803
- elif i > 1 and tokens[i-2] in budget_indicators:
804
- has_budget_context = True
805
-
806
- # 检查后面是否有货币单位
807
- if i < len(tokens) - 1:
808
- next_token = tokens[i + 1].lower()
809
- currency_units = {
810
- '元': 'RMB', '块': 'RMB', '钱': 'RMB', '人民币': 'RMB',
811
- '欧': 'EUR', '欧元': 'EUR', '美元': 'USD', '英镑': 'GBP',
812
- '瑞郎': 'CHF', '日元': 'JPY', '韩元': 'KRW',
813
- 'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF'
814
- }
815
- if next_token in currency_units:
816
- amount = number
817
- currency = currency_units[next_token]
818
-
819
- if has_budget_context and not currency:
820
- amount = number
821
 
822
- # 处理万、千等单位
823
- if amount:
824
- # (这部分逻辑保持不变)
825
- if i > 0:
826
- prev_token = tokens[i-1]
827
- if '' in prev_token or 'w' in prev_token.lower():
828
- amount *= 10000
829
- elif '' in prev_token or 'k' in prev_token.lower():
830
- amount *= 1000
831
- elif i < len(tokens) - 1:
832
- next_token = tokens[i+1]
833
- if '' in next_token or 'w' in next_token.lower():
834
- amount *= 10000
835
- elif '' in next_token or 'k' in next_token.lower():
836
- amount *= 1000
837
-
838
- if amount > 0:
839
- result["amount"] = int(amount)
840
- if currency: # 只有当识别到货币时才赋值
841
- result["currency"] = currency
842
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
843
 
844
  # 2. 查找预算类型(此部分逻辑与金额完全无关)
845
  budget_type_keywords = {
 
756
  def _extract_budget_from_tokens(self, tokens: list) -> dict:
757
 
758
  result = {}
759
+ text = "".join(tokens).lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
760
 
761
+ # --- 1. 统一提取金额和货币 ---
762
+ # 按优先级排列正则表达式,越精确的模式越靠前
763
+ # 模式捕获: (金额数字, 乘数单位[千/万/k/w], 货币单位[元/欧/usd...])
764
+ patterns = [
765
+ {'regex': r'¥\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'RMB'},
766
+ {'regex': r'€\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'EUR'},
767
+ {'regex': r'\$\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'USD'},
768
+ {'regex': r'£\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'GBP'},
769
+ {'regex': r'(usd|rmb|eur|gbp|chf|jpy)\s*(\d+\.?\d*)\s*(百|hundred|千|k|thousand|万|w)?',
770
+ 'groups': {'currency': 1, 'amount': 2, 'multiplier': 3}},
771
+ {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(人民币|元|块|块钱|rmb)',
772
+ 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'RMB'},
773
+ {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(欧元|欧|euros?|eur)',
774
+ 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'EUR'},
775
+ {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(美元|dollars?|dollar|usd)',
776
+ 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'USD'},
777
+ {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(英镑|pounds?|pound|gbp)',
778
+ 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'GBP'},
779
+ {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(日元|yen|jpy)',
780
+ 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'JPY'},
781
+ {'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(瑞郎|瑞士法郎|chf)',
782
+ 'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'CHF'},
783
+
784
+ {'regex': r'(\d+\.?\d+)\s*(十|百|hundred|千|k|thousand|万|w)',
785
+ 'groups': {'amount': 1, 'multiplier': 2}, 'context_needed': True},
786
+ {'regex': r'(\d+\.?\d+)',
787
+ 'groups': {'amount': 1}, 'context_needed': True},
788
+ ]
789
+
790
+ amount_found = False
791
+ for pattern, default_multiplier, default_currency in patterns:
792
+ match = re.search(pattern, text)
793
+ if match:
794
+ # 检查是否是纯数字模式,是的话需要上下文
795
+ if pattern in [r'(\d+\.?\d+)\s*(千|k|万|w)', r'(\d+\.?\d+)']:
796
+ budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', 'budget', 'cost']
797
+ if not any(indicator in text for indicator in budget_indicators):
798
+ continue # 如果没有上下文,则跳过纯数字匹配
799
+
800
+ groups = match.groups()
801
+
802
+ # 提取金额
803
+ amount = float(groups[0])
804
+
805
+ # 确定乘数
806
+ multiplier = 1
807
+ multiplier_token = ''
808
+ if 'multiplier' in p['groups'] and p['groups']['multiplier'] <= len(groups) and groups[p['groups']['multiplier']-1]:
809
+ multiplier_token = groups[p['groups']['multiplier']-1]
810
+
811
+ if '十' in multiplier_token:
812
+ multiplier = 10
813
+ elif '百' in multiplier_token or 'hundred' in multiplier_token:
814
+ multiplier = 100
815
+ elif '千' in multiplier_token or 'k' in multiplier_token or 'thousand' in multiplier_token:
816
+ multiplier = 1000
817
+ elif '万' in multiplier_token or 'w' in multiplier_token:
818
+ multiplier = 10000
819
+
820
+ final_amount = amount * multiplier
821
+ result['amount'] = int(final_amount)
822
+
823
+ # 确定货币
824
+ currency_token = ''
825
+ if default_currency:
826
+ result['currency'] = default_currency
827
+ elif len(groups) > 2 and groups[2]:
828
+ currency_token = groups[2]
829
+
830
+ currency_map = {
831
+ 'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF', 'jpy': 'JPY'
832
+ }
833
+ if currency_token in currency_map:
834
+ result['currency'] = currency_map[currency_token]
835
+
836
+ amount_found = True
837
+ break
838
 
839
  # 2. 查找预算类型(此部分逻辑与金额完全无关)
840
  budget_type_keywords = {