File size: 6,157 Bytes
07b50c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import re
import json
from config import FALLBACK_AMOUNT_REGEX, CURRENCY_SYMBOLS # Import regex and symbols

# Currency glyphs, tested by substring in this order so that a token such as
# "US$" still resolves.
_CURRENCY_BY_SYMBOL = (
    ("₹", "INR"),
    ("$", "USD"),
    ("€", "EUR"),
    ("£", "GBP"),
)

# Textual codes/words, compared case-insensitively against the whole token.
_CURRENCY_BY_WORD = {
    "inr": "INR",
    "rs": "INR",
    "rupees": "INR",
    "usd": "USD",
    "dollars": "USD",
    "eur": "EUR",
    "euros": "EUR",
    "gbp": "GBP",
    "pounds": "GBP",
}

# NER entity groups whose words are allowed to contribute to the item text.
_ITEM_ENTITY_GROUPS = (
    "MISC", "ORG", "PRODUCT", "EVENT", "WORK_OF_ART",
    "LOC", "PER", "CARDINAL", "QUANTITY",
)


def _resolve_currency(token):
    """Map a matched currency symbol or code/word to a currency code, else None."""
    if not token:
        return None
    for glyph, iso_code in _CURRENCY_BY_SYMBOL:
        if glyph in token:
            return iso_code
    # The token may be a code ("EUR") or a word ("rs.", "Dollars") regardless of
    # whether it was matched before or after the number.
    return _CURRENCY_BY_WORD.get(token.strip().rstrip(".").lower())


def _collect_amounts(full_text):
    """Return one candidate dict per usable FALLBACK_AMOUNT_REGEX match."""
    candidates = []
    # Regex groups (as documented alongside the pattern's use here):
    #   1: symbol/code before number ($, EUR, etc.)
    #   2: number when symbol/code is before
    #   3: number when symbol/code is after
    #   4: symbol/code after number (rs, dollars, etc.)
    #   5: standalone number
    for match in FALLBACK_AMOUNT_REGEX.finditer(full_text or ""):
        num_str = curr_symbol = curr_code = None

        if match.group(1) and match.group(2):  # symbol/code before
            curr_symbol, num_str = match.group(1), match.group(2)
        elif match.group(3) and match.group(4):  # symbol/code after
            num_str, curr_code = match.group(3), match.group(4)
        elif match.group(5) and not match.group(1) and not match.group(4):
            num_str = match.group(5)  # standalone number

        if not num_str:
            continue

        try:
            value = float(num_str.replace(",", ""))
        except ValueError:
            continue  # ignore text that is not a valid number

        # Basic validation: skip huge integers (might be IDs) unless the text
        # has a decimal point.
        if value < 1_000_000 or "." in num_str:
            candidates.append({
                "value": value,
                "currency_symbol": curr_symbol,
                "currency_code": curr_code,
                "match_obj": match,  # kept so the matched text can be reused
            })
    return candidates


def _select_amount(candidates):
    """Pick (amount, currency) from the candidates; (None, None) if there are none."""
    if not candidates:
        return None, None

    # Matches that carried a currency marker win over bare numbers.
    with_currency = [
        c for c in candidates if c["currency_symbol"] or c["currency_code"]
    ]
    if with_currency:
        # Heuristic: the largest value with a currency is taken as the main one.
        best = max(with_currency, key=lambda c: c["value"])
        marker = best["currency_symbol"] or best["currency_code"]
        return best["value"], _resolve_currency(marker)

    # No currency anywhere: fall back to the largest standalone number.
    best = max(candidates, key=lambda c: c["value"])
    return best["value"], None


def _extract_item(entities, candidates):
    """Join the words of non-monetary NER entities into an item string, or None."""
    if not entities:
        return None

    # Full matched strings of every accepted amount, used to drop entities that
    # merely repeat an amount.
    amount_texts = {c["match_obj"].group(0) for c in candidates}

    parts = []
    for entity in entities:
        group = entity.get("entity_group", "")
        word = entity.get("word", "")

        # Skip words that are a detected amount or just a currency symbol.
        if word in amount_texts or word in CURRENCY_SYMBOLS:
            continue
        # Anything the NER labelled as money is already covered by the regex.
        if "MONEY" in group:
            continue
        if group not in _ITEM_ENTITY_GROUPS:
            continue

        # Clean up sub-word token markers like " ##ing".
        cleaned = word.replace(" ##", "").strip()
        if cleaned:
            parts.append(cleaned)

    if not parts:
        return None

    # Strip leading/trailing punctuation; an item reduced to nothing is None.
    item = re.sub(r"^[^\w]+|[^\w]+$", "", " ".join(parts).strip())
    return item or None


def parse_entities(entities, full_text: str):
    """
    Extracts amount, currency, and item description from NER entities and full text.

    The amount and currency come from FALLBACK_AMOUNT_REGEX applied to
    ``full_text``: matches with a currency symbol/code are preferred, and among
    the preferred set the largest value wins. The currency marker is resolved
    the same way whether it was matched before or after the number, so both
    "$5" and "EUR 50" / "50 rs." yield a currency. The item is built from the
    words of NER entities that are neither amounts nor currency symbols.

    Args:
        entities: List of dictionaries from the NER pipeline; the keys read
            are "entity_group" and "word". May be empty or None.
        full_text: The original input text string. None is treated as empty.

    Returns:
        A tuple: (amount, currency, item). ``amount`` is a float or None,
        ``currency`` is one of "INR", "USD", "EUR", "GBP" or None, and
        ``item`` is a non-empty string or None.
    """
    potential_amounts = _collect_amounts(full_text)
    amount, currency = _select_amount(potential_amounts)
    item = _extract_item(entities, potential_amounts)

    # No default currency is applied when only an amount was found.
    print(f"Utils: Parsed-> Amount: {amount}, Currency: {currency}, Item: {item}")
    return amount, currency, item

# Parsing of structured (JSON) responses returned by Gemini.
def parse_gemini_response(response_text):
    """
    Parses a structured JSON string response from Gemini.

    The text may optionally be wrapped in a markdown code fence, either with a
    language tag ("```json", any letter case) or without one ("```"); the
    fence is removed before decoding.

    Example expected format:
        { "type": "expense", "category": "Food", "amount": 5.50,
          "currency": "USD", "item": "coffee" }

    Args:
        response_text: The raw response string.

    Returns:
        The decoded JSON value (a dict for the expected format), or None when
        the text cannot be decoded or the input is not a string. This function
        reports failures with print() and does not raise.
    """
    try:
        # Strip an opening fence (with or without a "json" tag) and a closing
        # fence; surrounding whitespace goes with them.
        cleaned = re.sub(
            r"^```(?:json)?\s*|\s*```$",
            "",
            response_text.strip(),
            flags=re.IGNORECASE,
        )
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Only json.loads raises this, so `cleaned` is always bound here.
        print(f"Warning: Could not parse Gemini response: {cleaned}")
        return None
    except Exception as e:
        # Deliberately broad: e.g. a None/non-string input must not crash callers.
        print(f"Error parsing Gemini response: {e}")
        return None