ClearSpend

Sleeping

File size: 6,157 Bytes

07b50c0

import re
import json
from config import FALLBACK_AMOUNT_REGEX, CURRENCY_SYMBOLS # Import regex and symbols

# Currency glyphs, checked by substring and in this order, so a captured token
# that carries extra characters around the glyph still resolves.
_CURRENCY_GLYPHS = (("₹", "INR"), ("$", "USD"), ("€", "EUR"), ("£", "GBP"))

# Lower-cased textual codes/words mapped to their ISO currency code.
_CURRENCY_WORDS = {
    "inr": "INR",
    "rs": "INR",
    "rupees": "INR",
    "usd": "USD",
    "dollars": "USD",
    "eur": "EUR",
    "euros": "EUR",
    "gbp": "GBP",
    "pounds": "GBP",
}

# NER entity groups whose words are allowed to contribute to the item text.
_ITEM_ENTITY_GROUPS = (
    "MISC", "ORG", "PRODUCT", "EVENT", "WORK_OF_ART",
    "LOC", "PER", "CARDINAL", "QUANTITY",
)


def _resolve_currency(token):
    """Map a captured currency symbol or code/word to an ISO code, or None."""
    if not token:
        return None
    for glyph, iso_code in _CURRENCY_GLYPHS:
        if glyph in token:
            return iso_code
    # Not a glyph: treat it as a textual code/word ("EUR", "rs", "dollars").
    return _CURRENCY_WORDS.get(token.strip().lower().rstrip("."))


def parse_entities(entities, full_text: str):
    """
    Extracts amount, currency, and item description from NER entities and full text.

    The amount and currency come from FALLBACK_AMOUNT_REGEX matches over
    ``full_text``; the item description is assembled from the NER entities.

    Args:
        entities: List of dictionaries from the NER pipeline. Each dictionary is
            read through its "entity_group" and "word" keys. May be empty or None.
        full_text: The original input text string.

    Returns:
        A tuple: (amount, currency, item)
            amount: float, or None when no usable number was found.
            currency: "INR", "USD", "EUR" or "GBP", or None when no recognized
                currency accompanied the chosen amount.
            item: space-joined entity words, or None when no entity qualified.
    """
    amount, currency, item = None, None, None
    potential_amounts = []

    # 1. Collect candidate amounts with the regex over the full text.
    #    Regex groups (as documented alongside the pattern in config):
    #    1: Symbol/Code before number ($, EUR, etc.)
    #    2: Number when symbol/code is before
    #    3: Number when symbol/code is after
    #    4: Symbol/Code after number (rs, dollars, etc.)
    #    5: Standalone number
    for match in FALLBACK_AMOUNT_REGEX.finditer(full_text):
        num_str = None
        currency_token = None

        if match.group(1) and match.group(2):  # Symbol/Code before
            currency_token = match.group(1)
            num_str = match.group(2)
        elif match.group(3) and match.group(4):  # Symbol/Code after
            num_str = match.group(3)
            currency_token = match.group(4)
        elif match.group(5) and not match.group(1) and not match.group(4):  # Standalone number
            num_str = match.group(5)

        if not num_str:
            continue

        try:
            value = float(num_str.replace(",", ""))
        except ValueError:
            # Not a parseable number once thousands separators are removed.
            continue

        # Basic validation: avoid huge numbers unless they have decimals (might be IDs)
        if value < 1_000_000 or "." in num_str:
            potential_amounts.append({
                "value": value,
                "currency_token": currency_token,
                "match_obj": match,  # kept so the matched text can be excluded from the item
            })

    # 2. Determine amount and currency from the regex matches.
    if potential_amounts:
        # Prioritize matches that included a currency symbol/code.
        with_currency = [p for p in potential_amounts if p["currency_token"]]
        if with_currency:
            # Often the largest value with currency is the main one.
            best_match = max(with_currency, key=lambda p: p["value"])
            amount = best_match["value"]
            # One resolver for both positions, so a code placed before the
            # number ("EUR 50") or a glyph placed after it is recognized too.
            currency = _resolve_currency(best_match["currency_token"])
        else:
            # No currency anywhere: take the largest standalone number.
            best_match = max(potential_amounts, key=lambda p: p["value"])
            amount = best_match["value"]
            currency = None  # Explicitly None if not found

    # 3. Extract the item using NER entities (excluding amounts/currency).
    item_parts = []
    if entities:
        # Full matched strings of every candidate amount found by the regex.
        amount_texts = {p["match_obj"].group(0) for p in potential_amounts}

        for entity in entities:
            entity_group = entity.get("entity_group", "")
            word = entity.get("word", "")

            # Skip if the entity word is a detected amount or just a currency symbol.
            if word in amount_texts or word in CURRENCY_SYMBOLS:
                continue

            # Skip if it's classified as MONEY by NER (already handled by regex).
            # CARDINAL is still allowed when it wasn't a regex match (e.g. "2 coffees").
            if "MONEY" in entity_group:
                continue

            if entity_group in _ITEM_ENTITY_GROUPS:
                # Clean up sub-word tokens like ##ing
                cleaned_word = word.replace(" ##", "").strip()
                if cleaned_word:
                    item_parts.append(cleaned_word)

    if item_parts:
        item = " ".join(item_parts).strip()
        # Remove leading/trailing non-word characters (punctuation, symbols).
        item = re.sub(r"^[^\w]+|[^\w]+$", "", item)

    # 4. Final checks and return.
    # No default currency is applied: when an amount is found without a
    # recognizable currency, currency stays None and the caller decides.

    print(f"Utils: Parsed-> Amount: {amount}, Currency: {currency}, Item: {item}")
    return amount, currency, item

# Gemini response parsing
def parse_gemini_response(response_text):
    """
    Parses a structured string response from Gemini (expected JSON-like).

    A surrounding markdown code fence is removed before decoding. The opening
    fence may be bare (```) or tagged with "json" in any letter case.

    Example expected format:
    "{ \"type\": \"expense\", \"category\": \"Food\", \"amount\": 5.50, \"currency\": \"USD\", \"item\": \"coffee\" }"

    Args:
        response_text: The raw response string.

    Returns:
        The decoded JSON value, or None when the text cannot be parsed
        (a warning or error message is printed in that case).
    """
    try:
        # Strip an opening ``` / ```json fence and a closing ``` fence, if present.
        response_text = re.sub(
            r"^```(?:json)?\s*|\s*```$",
            "",
            response_text.strip(),
            flags=re.IGNORECASE,
        )
        return json.loads(response_text)
    except json.JSONDecodeError:
        print(f"Warning: Could not parse Gemini response: {response_text}")
        return None
    except Exception as e:
        # Deliberate catch-all: this function never raises to its caller, even
        # for non-string input such as None.
        print(f"Error parsing Gemini response: {e}")
        return None