Spaces:
Sleeping
Sleeping
improve: budget mapping
Browse files- modules/info_extractor.py +78 -83
modules/info_extractor.py
CHANGED
|
@@ -756,90 +756,85 @@ class InfoExtractor:
|
|
| 756 |
def _extract_budget_from_tokens(self, tokens: list) -> dict:
|
| 757 |
|
| 758 |
result = {}
|
| 759 |
-
|
| 760 |
-
# 1. 查找金额和货币单位(不再有默认值)
|
| 761 |
-
for i, token in enumerate(tokens):
|
| 762 |
-
amount = None
|
| 763 |
-
currency = None # <--- 修改点:不再预设 "RMB",初始为 None
|
| 764 |
-
|
| 765 |
-
# 处理包含货币的token "2000欧", "5000元"
|
| 766 |
-
currency_patterns = [
|
| 767 |
-
(r'(\d+(?:\.\d+)?)欧(?:元)?', 'EUR'),
|
| 768 |
-
(r'(\d+(?:\.\d+)?)元', 'RMB'),
|
| 769 |
-
(r'(\d+(?:\.\d+)?)块(?:钱)?', 'RMB'),
|
| 770 |
-
(r'(\d+(?:\.\d+)?)人民币', 'RMB'),
|
| 771 |
-
(r'(\d+(?:\.\d+)?)美元', 'USD'),
|
| 772 |
-
(r'(\d+(?:\.\d+)?)英镑', 'GBP'),
|
| 773 |
-
(r'(\d+(?:\.\d+)?)瑞(?:士)?法郎', 'CHF'),
|
| 774 |
-
(r'(\d+(?:\.\d+)?)日元', 'JPY'),
|
| 775 |
-
(r'(\d+(?:\.\d+)?)韩元', 'KRW'),
|
| 776 |
-
(r'¥(\d+(?:\.\d+)?)', 'RMB'),
|
| 777 |
-
(r'€(\d+(?:\.\d+)?)', 'EUR'),
|
| 778 |
-
(r'\$(\d+(?:\.\d+)?)', 'USD'),
|
| 779 |
-
(r'£(\d+(?:\.\d+)?)', 'GBP'),
|
| 780 |
-
(r'(\d+(?:\.\d+)?)rmb', 'RMB'),
|
| 781 |
-
(r'(\d+(?:\.\d+)?)usd', 'USD'),
|
| 782 |
-
(r'(\d+(?:\.\d+)?)eur', 'EUR'),
|
| 783 |
-
(r'(\d+(?:\.\d+)?)gbp', 'GBP'),
|
| 784 |
-
(r'(\d+(?:\.\d+)?)chf', 'CHF'),
|
| 785 |
-
]
|
| 786 |
-
|
| 787 |
-
for pattern, curr in currency_patterns:
|
| 788 |
-
match = re.search(pattern, token.lower())
|
| 789 |
-
if match:
|
| 790 |
-
amount = float(match.group(1))
|
| 791 |
-
currency = curr
|
| 792 |
-
break
|
| 793 |
-
|
| 794 |
-
# 处理纯数字token(需要查看上下文)
|
| 795 |
-
if not amount and re.match(r'^\\d+(?:\\.\\d+)?$', token):
|
| 796 |
-
number = float(token)
|
| 797 |
-
|
| 798 |
-
# 检查前面的token是否有预算相关词汇
|
| 799 |
-
budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', '总共', '一共', 'budget', 'cost', 'spend']
|
| 800 |
-
has_budget_context = False
|
| 801 |
-
if i > 0 and tokens[i-1] in budget_indicators:
|
| 802 |
-
has_budget_context = True
|
| 803 |
-
elif i > 1 and tokens[i-2] in budget_indicators:
|
| 804 |
-
has_budget_context = True
|
| 805 |
-
|
| 806 |
-
# 检查后面是否有货币单位
|
| 807 |
-
if i < len(tokens) - 1:
|
| 808 |
-
next_token = tokens[i + 1].lower()
|
| 809 |
-
currency_units = {
|
| 810 |
-
'元': 'RMB', '块': 'RMB', '钱': 'RMB', '人民币': 'RMB',
|
| 811 |
-
'欧': 'EUR', '欧元': 'EUR', '美元': 'USD', '英镑': 'GBP',
|
| 812 |
-
'瑞郎': 'CHF', '日元': 'JPY', '韩元': 'KRW',
|
| 813 |
-
'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF'
|
| 814 |
-
}
|
| 815 |
-
if next_token in currency_units:
|
| 816 |
-
amount = number
|
| 817 |
-
currency = currency_units[next_token]
|
| 818 |
-
|
| 819 |
-
if has_budget_context and not currency:
|
| 820 |
-
amount = number
|
| 821 |
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 843 |
|
| 844 |
# 2. 查找预算类型(此部分逻辑与金额完全无关)
|
| 845 |
budget_type_keywords = {
|
|
|
|
| 756 |
def _extract_budget_from_tokens(self, tokens: list) -> dict:
|
| 757 |
|
| 758 |
result = {}
|
| 759 |
+
text = "".join(tokens).lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 760 |
|
| 761 |
+
# --- 1. 统一提取金额和货币 ---
|
| 762 |
+
# 按优先级排列正则表达式,越精确的模式越靠前
|
| 763 |
+
# 模式捕获: (金额数字, 乘数单位[千/万/k/w], 货币单位[元/欧/usd...])
|
| 764 |
+
patterns = [
|
| 765 |
+
{'regex': r'¥\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'RMB'},
|
| 766 |
+
{'regex': r'€\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'EUR'},
|
| 767 |
+
{'regex': r'\$\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'USD'},
|
| 768 |
+
{'regex': r'£\s*(\d+\.?\d*)', 'groups': {'amount': 1}, 'currency': 'GBP'},
|
| 769 |
+
{'regex': r'(usd|rmb|eur|gbp|chf|jpy)\s*(\d+\.?\d*)\s*(百|hundred|千|k|thousand|万|w)?',
|
| 770 |
+
'groups': {'currency': 1, 'amount': 2, 'multiplier': 3}},
|
| 771 |
+
{'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(人民币|元|块|块钱|rmb)',
|
| 772 |
+
'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'RMB'},
|
| 773 |
+
{'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(欧元|欧|euros?|eur)',
|
| 774 |
+
'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'EUR'},
|
| 775 |
+
{'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(美元|dollars?|dollar|usd)',
|
| 776 |
+
'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'USD'},
|
| 777 |
+
{'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(英镑|pounds?|pound|gbp)',
|
| 778 |
+
'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'GBP'},
|
| 779 |
+
{'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(日元|yen|jpy)',
|
| 780 |
+
'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'JPY'},
|
| 781 |
+
{'regex': r'(\d+\.?\d*)\s*(十|百|hundred|千|k|thousand|万|w)?\s*(瑞郎|瑞士法郎|chf)',
|
| 782 |
+
'groups': {'amount': 1, 'multiplier': 2}, 'currency': 'CHF'},
|
| 783 |
+
|
| 784 |
+
{'regex': r'(\d+\.?\d+)\s*(十|百|hundred|千|k|thousand|万|w)',
|
| 785 |
+
'groups': {'amount': 1, 'multiplier': 2}, 'context_needed': True},
|
| 786 |
+
{'regex': r'(\d+\.?\d+)',
|
| 787 |
+
'groups': {'amount': 1}, 'context_needed': True},
|
| 788 |
+
]
|
| 789 |
+
|
| 790 |
+
amount_found = False
|
| 791 |
+
for pattern, default_multiplier, default_currency in patterns:
|
| 792 |
+
match = re.search(pattern, text)
|
| 793 |
+
if match:
|
| 794 |
+
# 检查是否是纯数字模式,是的话需要上下文
|
| 795 |
+
if pattern in [r'(\d+\.?\d+)\s*(千|k|万|w)', r'(\d+\.?\d+)']:
|
| 796 |
+
budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', 'budget', 'cost']
|
| 797 |
+
if not any(indicator in text for indicator in budget_indicators):
|
| 798 |
+
continue # 如果没有上下文,则跳过纯数字匹配
|
| 799 |
+
|
| 800 |
+
groups = match.groups()
|
| 801 |
+
|
| 802 |
+
# 提取金额
|
| 803 |
+
amount = float(groups[0])
|
| 804 |
+
|
| 805 |
+
# 确定乘数
|
| 806 |
+
multiplier = 1
|
| 807 |
+
multiplier_token = ''
|
| 808 |
+
if 'multiplier' in p['groups'] and p['groups']['multiplier'] <= len(groups) and groups[p['groups']['multiplier']-1]:
|
| 809 |
+
multiplier_token = groups[p['groups']['multiplier']-1]
|
| 810 |
+
|
| 811 |
+
if '十' in multiplier_token:
|
| 812 |
+
multiplier = 10
|
| 813 |
+
elif '百' in multiplier_token or 'hundred' in multiplier_token:
|
| 814 |
+
multiplier = 100
|
| 815 |
+
elif '千' in multiplier_token or 'k' in multiplier_token or 'thousand' in multiplier_token:
|
| 816 |
+
multiplier = 1000
|
| 817 |
+
elif '万' in multiplier_token or 'w' in multiplier_token:
|
| 818 |
+
multiplier = 10000
|
| 819 |
+
|
| 820 |
+
final_amount = amount * multiplier
|
| 821 |
+
result['amount'] = int(final_amount)
|
| 822 |
+
|
| 823 |
+
# 确定货币
|
| 824 |
+
currency_token = ''
|
| 825 |
+
if default_currency:
|
| 826 |
+
result['currency'] = default_currency
|
| 827 |
+
elif len(groups) > 2 and groups[2]:
|
| 828 |
+
currency_token = groups[2]
|
| 829 |
+
|
| 830 |
+
currency_map = {
|
| 831 |
+
'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF', 'jpy': 'JPY'
|
| 832 |
+
}
|
| 833 |
+
if currency_token in currency_map:
|
| 834 |
+
result['currency'] = currency_map[currency_token]
|
| 835 |
+
|
| 836 |
+
amount_found = True
|
| 837 |
+
break
|
| 838 |
|
| 839 |
# 2. 查找预算类型(此部分逻辑与金额完全无关)
|
| 840 |
budget_type_keywords = {
|