Spaces:
Running
Running
File size: 6,157 Bytes
07b50c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import re
import json
from config import FALLBACK_AMOUNT_REGEX, CURRENCY_SYMBOLS # Import regex and symbols
# Currency symbols are matched by substring so a captured token that carries
# extra characters around the symbol still resolves.
_SYMBOL_TO_CURRENCY = (
    ("₹", "INR"),
    ("$", "USD"),
    ("€", "EUR"),
    ("£", "GBP"),
)

# Textual currency codes/words (compared lower-cased) -> ISO code.
_CODE_TO_CURRENCY = {
    "inr": "INR",
    "rs": "INR",
    "rupees": "INR",
    "usd": "USD",
    "dollars": "USD",
    "eur": "EUR",
    "euros": "EUR",
    "gbp": "GBP",
    "pounds": "GBP",
}

# NER entity groups whose words are allowed to contribute to the item text.
_ITEM_ENTITY_GROUPS = frozenset({
    "MISC", "ORG", "PRODUCT", "EVENT", "WORK_OF_ART",
    "LOC", "PER", "CARDINAL", "QUANTITY",
})


def _resolve_currency(token):
    """Map a captured currency symbol or code to an ISO code (None if unknown)."""
    if not token:
        return None
    for symbol, iso_code in _SYMBOL_TO_CURRENCY:
        if symbol in token:
            return iso_code
    # The regex may capture a code in either position ("EUR 50" / "50 eur"),
    # so the code table is consulted regardless of where the token came from.
    return _CODE_TO_CURRENCY.get(token.strip().rstrip(".").lower())


def _collect_amount_candidates(full_text):
    """Return a dict for every plausible amount FALLBACK_AMOUNT_REGEX finds."""
    candidates = []
    # Regex groups (pattern is defined in config):
    #   1: symbol/code before the number ($, EUR, ...)
    #   2: number when the symbol/code comes first
    #   3: number when the symbol/code comes after
    #   4: symbol/code after the number (rs, dollars, ...)
    #   5: standalone number
    for match in FALLBACK_AMOUNT_REGEX.finditer(full_text):
        num_str = curr_symbol = curr_code = None
        if match.group(1) and match.group(2):
            curr_symbol, num_str = match.group(1), match.group(2)
        elif match.group(3) and match.group(4):
            num_str, curr_code = match.group(3), match.group(4)
        elif match.group(5) and not match.group(1) and not match.group(4):
            num_str = match.group(5)

        if not num_str:
            continue
        try:
            value = float(num_str.replace(",", ""))
        except ValueError:
            continue  # Ignore invalid numbers like "1,2,3"

        # Basic validation: very large integers are more likely IDs than
        # amounts, so they are only accepted when they contain a decimal point.
        if value < 1_000_000 or "." in num_str:
            candidates.append({
                "value": value,
                "currency_symbol": curr_symbol,
                "currency_code": curr_code,
                "match_obj": match,  # kept for the matched text / position
            })
    return candidates


def _extract_item(entities, amount_texts):
    """Join the words of item-like NER entities into one description string."""
    parts = []
    for entity in entities:
        entity_group = entity.get("entity_group") or ""
        word = entity.get("word") or ""
        # Skip words that are exactly a regex-detected amount or a currency symbol.
        if word in amount_texts or word in CURRENCY_SYMBOLS:
            continue
        # MONEY entities are already covered by the regex pass.
        if "MONEY" in entity_group:
            continue
        if entity_group not in _ITEM_ENTITY_GROUPS:
            continue
        # Clean up sub-word tokens like " ##ing".
        cleaned_word = word.replace(" ##", "").strip()
        if cleaned_word:
            parts.append(cleaned_word)

    if not parts:
        return None
    # Trim leading/trailing non-word characters (punctuation, whitespace).
    item = re.sub(r"^[^\w]+|[^\w]+$", "", " ".join(parts))
    # A description made only of punctuation carries no information.
    return item or None


def parse_entities(entities, full_text: str):
    """
    Extracts amount, currency, and item description from NER entities and full text.

    The amount and currency come from FALLBACK_AMOUNT_REGEX applied to
    ``full_text``: matches that carry a currency symbol/code are preferred,
    and among the preferred set the largest value wins. The item is built
    from the ``word`` of every entity whose ``entity_group`` is an item-like
    type and which is not itself a detected amount or currency symbol.

    Args:
        entities: List of dictionaries from the NER pipeline; the keys read
            here are ``entity_group`` and ``word``. May be empty or None.
        full_text: The original input text string. None is treated as "".

    Returns:
        A tuple: (amount, currency, item). ``amount`` is a float or None,
        ``currency`` is one of "INR"/"USD"/"EUR"/"GBP" or None, and ``item``
        is a string or None.
    """
    amount, currency, item = None, None, None

    # 1. Regex pass over the full text - often more reliable than NER for money.
    potential_amounts = _collect_amount_candidates(full_text or "")

    # 2. Determine amount and currency from the regex matches.
    if potential_amounts:
        currency_matches = [
            p for p in potential_amounts
            if p["currency_symbol"] or p["currency_code"]
        ]
        if currency_matches:
            # Often the largest value with a currency is the main one.
            best_match = max(currency_matches, key=lambda p: p["value"])
            amount = best_match["value"]
            currency = _resolve_currency(
                best_match["currency_symbol"] or best_match["currency_code"]
            )
        else:
            # No currency anywhere: take the largest standalone number.
            best_match = max(potential_amounts, key=lambda p: p["value"])
            amount = best_match["value"]
            currency = None

    # 3. Extract the item from NER entities (excluding amounts/currency).
    if entities:
        amount_texts = {p["match_obj"].group(0) for p in potential_amounts}
        item = _extract_item(entities, amount_texts)

    # 4. No default currency is applied when only an amount was found.
    print(f"Utils: Parsed-> Amount: {amount}, Currency: {currency}, Item: {item}")
    return amount, currency, item
# Helper for decoding the JSON payload returned by Gemini.
# Matches an opening Markdown fence (with or without a "json" language tag,
# any letter case) at the start of the text, or a closing fence at the end.
_CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*|\s*```$", re.IGNORECASE)


def parse_gemini_response(response_text):
    """
    Parses a structured string response from Gemini (expected JSON-like).

    The text is stripped of surrounding whitespace and of an enclosing
    Markdown code fence (``` or ```json, case-insensitive) before being
    handed to ``json.loads``.

    Example expected format:
        '{ "type": "expense", "category": "Food", "amount": 5.50,
           "currency": "USD", "item": "coffee" }'

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON value (a dict for the expected format), or None if
        the text is not valid JSON or any other error occurs while parsing.
    """
    try:
        # Models frequently wrap JSON in a fenced block; the fence may or may
        # not carry a language tag, so both forms are removed.
        response_text = _CODE_FENCE_RE.sub("", response_text.strip())
        return json.loads(response_text)
    except json.JSONDecodeError:
        print(f"Warning: Could not parse Gemini response: {response_text}")
        return None
    except Exception as e:
        # Best-effort boundary: e.g. a non-string input ends up here.
        print(f"Error parsing Gemini response: {e}")
        return None
|