import json
import re

from config import FALLBACK_AMOUNT_REGEX, CURRENCY_SYMBOLS  # Import regex and symbols

# Currency glyphs checked (in this order) by substring, so that compound
# markers such as "US$" still resolve.
_SYMBOL_TO_CURRENCY = (
    ("₹", "INR"),
    ("$", "USD"),
    ("€", "EUR"),
    ("£", "GBP"),
)

# Lower-cased currency codes / words mapped to the currency code we report.
_WORD_TO_CURRENCY = {
    "inr": "INR",
    "rs": "INR",
    "rupee": "INR",
    "rupees": "INR",
    "usd": "USD",
    "dollar": "USD",
    "dollars": "USD",
    "eur": "EUR",
    "euro": "EUR",
    "euros": "EUR",
    "gbp": "GBP",
    "pound": "GBP",
    "pounds": "GBP",
}

# NER entity groups whose text may contribute to the item description.
_ITEM_ENTITY_GROUPS = frozenset({
    "MISC", "ORG", "PRODUCT", "EVENT", "WORK_OF_ART",
    "LOC", "PER", "CARDINAL", "QUANTITY",
})

# Opening ``` / ```json fence at the start, or closing ``` fence at the end.
_CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*|\s*```$", re.IGNORECASE)


def _resolve_currency(token):
    """Map a currency marker (glyph, code or word) to a currency code.

    The same lookup is used whether the marker appeared before or after the
    number, because the regex may capture either a glyph or a code in both
    positions. Returns None when the marker is empty or not recognised.
    """
    if not token:
        return None
    for glyph, iso_code in _SYMBOL_TO_CURRENCY:
        if glyph in token:
            return iso_code
    # Tolerate surrounding whitespace and abbreviations like "Rs.".
    return _WORD_TO_CURRENCY.get(token.strip().rstrip(".").lower())


def _find_amount_candidates(full_text):
    """Collect every plausible amount that FALLBACK_AMOUNT_REGEX finds in the text.

    Regex groups, as used here:
        1: symbol/code before the number ($, EUR, ...)
        2: number when the symbol/code comes first
        3: number when the symbol/code comes after
        4: symbol/code after the number (rs, dollars, ...)
        5: standalone number

    Returns a list of dicts with keys "value", "currency_symbol",
    "currency_code" and "match_obj" (the regex match, kept for its text).
    """
    candidates = []
    # `or ""` lets a missing/None text behave like an empty string.
    for match in FALLBACK_AMOUNT_REGEX.finditer(full_text or ""):
        num_str = curr_symbol = curr_code = None

        if match.group(1) and match.group(2):  # Symbol/code before
            curr_symbol, num_str = match.group(1), match.group(2)
        elif match.group(3) and match.group(4):  # Symbol/code after
            num_str, curr_code = match.group(3), match.group(4)
        elif match.group(5) and not match.group(1) and not match.group(4):
            num_str = match.group(5)  # Standalone number

        if not num_str:
            continue

        try:
            value = float(num_str.replace(",", ""))
        except ValueError:
            continue  # Ignore invalid numbers like "1,2,3"

        # Basic validation: very large integers without decimals are more
        # likely to be IDs than amounts.
        if value < 1_000_000 or "." in num_str:
            candidates.append({
                "value": value,
                "currency_symbol": curr_symbol,
                "currency_code": curr_code,
                "match_obj": match,
            })
    return candidates


def _select_amount(candidates):
    """Pick the (amount, currency) pair from the regex candidates.

    Candidates that carry a currency marker win over bare numbers; within the
    chosen group the largest value is taken as the main amount. Currency is
    None when no marker was present or it could not be recognised.
    """
    if not candidates:
        return None, None

    with_currency = [
        c for c in candidates if c["currency_symbol"] or c["currency_code"]
    ]
    if with_currency:
        best = max(with_currency, key=lambda c: c["value"])
        marker = best["currency_symbol"] or best["currency_code"]
        return best["value"], _resolve_currency(marker)

    # No currency marker anywhere: fall back to the largest standalone number.
    best = max(candidates, key=lambda c: c["value"])
    return best["value"], None


def _extract_item(entities, candidates):
    """Build the item description from NER entities, skipping amount/currency text.

    Returns the joined entity words with leading/trailing non-word characters
    removed, or None when no usable entity was found.
    """
    if not entities:
        return None

    # Full matched strings of the regex amounts, so entities that merely
    # repeat an amount are not treated as part of the item.
    amount_texts = {c["match_obj"].group(0) for c in candidates}

    item_parts = []
    for entity in entities:
        entity_group = entity.get("entity_group") or ""
        word = entity.get("word") or ""

        # Skip amounts and bare currency symbols.
        if word in amount_texts or word in CURRENCY_SYMBOLS:
            continue
        # MONEY entities are already handled by the regex pass. CARDINAL is
        # still allowed (e.g. the quantity in "2 coffees").
        if "MONEY" in entity_group:
            continue
        if entity_group not in _ITEM_ENTITY_GROUPS:
            continue

        # Clean up sub-word tokens like "##ing".
        cleaned_word = word.replace(" ##", "").strip()
        if cleaned_word:
            item_parts.append(cleaned_word)

    if not item_parts:
        return None

    item = " ".join(item_parts).strip()
    # Remove leading/trailing punctuation.
    return re.sub(r"^[^\w]+|[^\w]+$", "", item)


def parse_entities(entities, full_text: str):
    """
    Extracts amount, currency, and item description from NER entities and full text.

    The amount and currency come from FALLBACK_AMOUNT_REGEX applied to the
    full text; the item description comes from the NER entities that are not
    amounts or currency symbols.

    Args:
        entities: List of dictionaries from the NER pipeline; the keys
            "entity_group" and "word" are read from each one. May be empty
            or None.
        full_text: The original input text string.

    Returns:
        A tuple: (amount, currency, item). `amount` is a float or None,
        `currency` is one of "INR", "USD", "EUR", "GBP" or None, and `item`
        is a string or None.
    """
    candidates = _find_amount_candidates(full_text)
    amount, currency = _select_amount(candidates)
    item = _extract_item(entities, candidates)

    # No default currency is applied when only an amount was found.
    print(f"Utils: Parsed-> Amount: {amount}, Currency: {currency}, Item: {item}")
    return amount, currency, item


def parse_gemini_response(response_text):
    """
    Parses a structured string response from Gemini (expected JSON-like).

    A surrounding markdown code fence (``` or ```json, any letter case) is
    removed before decoding.

    Example expected format:
        { "type": "expense", "category": "Food", "amount": 5.50,
          "currency": "USD", "item": "coffee" }

    Args:
        response_text: The raw text returned by the model.

    Returns:
        The decoded JSON value, or None if the text could not be parsed.
    """
    try:
        # Clean the response text if it's wrapped in markdown code blocks.
        response_text = _CODE_FENCE_RE.sub("", response_text.strip())
        return json.loads(response_text)
    except json.JSONDecodeError:
        print(f"Warning: Could not parse Gemini response: {response_text}")
        return None
    except Exception as e:
        # Best-effort boundary: e.g. a non-string response must not crash the caller.
        print(f"Error parsing Gemini response: {e}")
        return None