Spaces:

yourpartner
/

demospace

Running

File size: 40,802 Bytes

import re
from fastapi import FastAPI
from fastapi import Header
from pydantic import BaseModel
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
import dateparser
from datetime import datetime
from langdetect import detect_langs
from textblob import TextBlob
from dateparser.search import search_dates
import uuid
import time
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)


from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.responses import ORJSONResponse
from fastapi.requests import Request
from fastapi import status
import asyncio
import psycopg2
from psycopg2.extras import Json
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# PostgreSQL connection string read from the environment; None when unset.
DATABASE_URL = os.getenv("DATABASE_URL")

# Application instance; ORJSONResponse is the default response class.
app = FastAPI(default_response_class=ORJSONResponse)
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive — confirm this is intended outside of a demo deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # or your domain(s)
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# DDL for the user_entries table; executed by run_migrations() at startup.
# NOTE(review): the column "uese_email" looks like a typo for "user_email".
# insert_text_entry() does not write it (nor user_name); renaming it would
# need a migration for existing databases — confirm before changing.
# NOTE(review): urgency_score is INT while get_urgency_score() returns
# fractional values (0.5, 0.7, 0.8, 1.0) — confirm the intended column type.
CREATE_TABLE_QUERY = """
CREATE TABLE IF NOT EXISTS user_entries (
    uuid UUID PRIMARY KEY,
    user_id TEXT,
    user_name TEXT,
    uese_email TEXT,
    raw_text TEXT,
    word_count INT,
    day_of_week TEXT,
    hour_of_day INT,
    month TEXT,
    year INT,
    type TEXT,
    expense_type TEXT,
    intent TEXT,
    confidence_scores JSONB,
    urgency_score INT,
    time_mentions TEXT[],
    parsed_dates TEXT[],
    tense TEXT[],
    summary TEXT,
    people TEXT[],
    mood TEXT,
    language JSONB,
    sentiment_score FLOAT,
    tags TEXT[],
    action_required BOOLEAN,
    entities JSONB,
    amounts JSONB,
    stores JSONB,
    processing_time_ms INT,
    raw_json JSONB,
    created_at TIMESTAMPTZ DEFAULT now()
);
"""

@app.on_event("startup")
def run_migrations():
    """Create the ``user_entries`` table at application startup if missing.

    Failures are reported on stdout and swallowed so the application still
    starts (best-effort, as before). The cursor and connection are always
    released, including when the DDL statement or the commit fails.
    """
    conn = None
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        try:
            cur.execute(CREATE_TABLE_QUERY)
            conn.commit()
        finally:
            cur.close()
        print("✅ Table checked/created at startup.")
    except Exception as e:
        print("❌ Migration failed:", e)
    finally:
        # Previously skipped whenever execute()/commit() raised.
        if conn is not None:
            conn.close()

# Load classification and summarization models
# (all three checkpoints are loaded once, at import time)
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
summarizer_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# Alternative checkpoints, currently disabled:
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# summarizer_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
# summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Load the NER model used by extract_entities()
# NOTE(review): this comment used to say "Indic NER"; the checkpoint is
# dslim/bert-base-NER — confirm it covers the languages you expect.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
# aggregation_strategy="simple" makes the pipeline return grouped entities
# (extract_entities reads their "word" and "entity_group" keys).
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Labels for classification
# Each candidate is written as "<short_name> (<description>)".
# NOTE(review): infer_intent() keys on the short names only ("task",
# "reminder", ...); presumably the caller strips the parenthesised
# description first — verify against the endpoint that uses these labels.
labels = [
  "task (something to be done or completed)",
  "event (an activity that is happening or has happened)",
  "reminder (a message to remember something in the future)",
  "meeting (a planned gathering between people to discuss something)",
  "relationship (message about personal or emotional connection with someone)",
  "note (general note or quick thought not related to any specific category)",
  "journal (personal reflection or emotional writing about one's day or thoughts)",
  "memory (recollection or recording of a past moment or experience)",
  "status_update (current condition, feeling, or situation being shared)",
  "sick_notice (informing about illness or not feeling well)",
  "out_of_office (message about being unavailable for work or responsibilities)",
  "travel_plan (planning or mentioning a trip or journey)",
  "celebration (message about a festive occasion, party or achievement)",
  "expense (money spent on something, either small or large)",
  "news (update about public events, announcements, or current affairs)",
  "information (factual content or informative message not tied to user activity)",
  "purchase (buying or ordering something, like a product or service)",
  "other (does not clearly fall into any specific category)"
]

# Known store/brand names (lowercase) mapped to the expense category they
# imply; consumed by detect_store_category().
POPULAR_STORES = {
    "amazon": "shopping",
    "flipkart": "shopping",
    "myntra": "fashion",
    "swiggy": "food",
    "zomato": "food",
    "uber": "transport",
    "ola": "transport",
    "bigbasket": "groceries",
    "blinkit": "groceries",
    "jiomart": "groceries",
    "netflix": "entertainment",
    "hotstar": "entertainment",
    "airbnb": "travel",
    "makemytrip": "travel",
    "bookmyshow": "entertainment",
    "dunzo": "delivery",
    "meesho": "shopping",
    "nykaa": "beauty",
    "instamart": "groceries",
    "apple": "electronics",
    "google": "services"
}

# English and Hindi/Hinglish words and phrases that hint at an expense.
# The list contains repeated entries ("bill", "expense", "purchased",
# "paid via").
# NOTE(review): no use of this list is visible in this part of the file —
# confirm how it is matched (substring vs. whole word) before editing it.
expense_keywords = [
    "paid", "bought", "purchased", "ordered", "spent", "payment",
    "recharged", "booked", "transaction", "debit", "renewed",
    "credit card", "cash", "amount", "transfer", "EMI", "wallet",
    "petrol", "bill", "invoice", "kharida", "kharcha", "kharch", "bill", "paisa", "khareed", "order", "le liya", "diya", "khud diya", "khud kharida",
    "expense", "cost", "buy", "buying", "purchase", "purchased", "paid for", "paid to", "paid via", "paid using",
    "expense", "expenses", "costs", "costing", "bills", "bought from", "ordered from", "paid at",
    "paid online", "paid cash", "paid card", "paid wallet", "paid app", "paid through", "paid via",
    "khariden", "kharidi"
]

class TextInput(BaseModel):
    """Pydantic model with two required string fields: a text and a user id."""
    # NOTE(review): presumably the request body of the analysis endpoint,
    # which is not visible in this part of the file — confirm.
    text: str
    user_id: str

# Function to detect popular store categories in the text
def detect_store_category(text: str, stores: dict = None):
    """Return the known stores mentioned in ``text`` with their categories.

    Args:
        text: Free-form user text; it is lowercased before matching.
        stores: Optional ``{store_name: category}`` mapping with lowercase
            store names. Defaults to the module-level ``POPULAR_STORES``.

    Returns:
        A list of ``{"store": ..., "category": ...}`` dicts in mapping order;
        empty when nothing matches.
    """
    if stores is None:
        stores = POPULAR_STORES
    lowered = text.lower()

    # Whole-word matching: a plain substring test reports "ola" for "cola"
    # and "apple" for "pineapple".
    return [
        {"store": store, "category": category}
        for store, category in stores.items()
        if re.search(rf"(?<!\w){re.escape(store)}(?!\w)", lowered)
    ]

# Function to extract dates and time mentions based on regex patterns
def extract_dates_with_accuracy(text: str, amounts: list = None):
    """Find date/time phrases in ``text`` and return them with parsed values.

    Hinglish time words are swapped for English equivalents on a lowercased
    copy of the text, which is then handed to ``search_dates`` (future dates
    preferred, strict parsing, naive datetimes).

    Args:
        text: Raw user text.
        amounts: Optional list of amount dicts (each with a ``"value"`` key);
            phrases equal to one of these values are not treated as dates.

    Returns:
        ``(time_mentions, parsed_dates)``: the matched phrases (stripped,
        lowercased) and the matching ISO-8601 strings, index-aligned.
    """
    # Integer renderings of the known amounts, e.g. 500.0 -> "500".
    numeric_amounts = {
        str(int(item["value"]))
        for item in (amounts or [])
        if isinstance(item["value"], (int, float))
    }

    # Step 1: Hinglish -> English, used only for parsing
    hinglish_to_english = {
        "aaj": "today",
        "kal": "tomorrow",   # Assuming future
        "parso": "day after tomorrow",
        "abhi": "now",
        "subah": "morning",
        "shaam": "evening",
        "raat ko": "night",
        "agli baar": "next time",
        "agli hafte": "next week",
        "agli mahine": "next month",
        "iss hafte": "this week",
        "iss mahine": "this month",
        "pichhle hafte": "last week",
        "tareekh": "date",
        "do din baad": "in 2 days",
        "teen din baad": "in 3 days",
    }
    normalized = text.lower()
    for source_phrase, english in hinglish_to_english.items():
        normalized = re.sub(rf"\b{re.escape(source_phrase)}\b", english, normalized)

    # Step 2: let dateparser locate the date expressions
    parser_settings = {
        "PREFER_DATES_FROM": "future",
        "RELATIVE_BASE": datetime.now(),
        "RETURN_AS_TIMEZONE_AWARE": False,
        "STRICT_PARSING": True,
    }
    found = search_dates(normalized, settings=parser_settings)

    filler_words = {"on", "at", "in", "by", "to", "of"}
    time_mentions, parsed_dates = [], []
    for phrase, when in found or []:
        mention = phrase.strip().lower()
        looks_like_amount = mention in numeric_amounts
        is_filler = mention in filler_words
        is_bare_number = re.fullmatch(r"\d{3,4}", mention) is not None  # skip 2025, 1200
        if looks_like_amount or is_filler or is_bare_number:
            continue
        time_mentions.append(mention)
        parsed_dates.append(when.isoformat())

    return time_mentions, parsed_dates

def detect_tense(parsed_dates):
    """Classify date strings as past, present or future relative to now.

    Args:
        parsed_dates: Iterable of date strings, normally the ISO-8601 strings
            produced by ``extract_dates_with_accuracy``.

    Returns:
        The distinct tenses found, ordered ``past``, ``present``, ``future``
        (the order used to be arbitrary); ``["unknown"]`` when nothing could
        be parsed.
    """
    found = set()
    for raw in parsed_dates:
        try:
            # ISO strings round-trip exactly through the standard library.
            dt = datetime.fromisoformat(raw)
        except (TypeError, ValueError):
            # Anything else still goes through dateparser, as before.
            dt = dateparser.parse(raw)
        if not dt:
            continue
        # An aware datetime cannot be ordered against a naive "now"
        # (TypeError), so take "now" in the value's own timezone.
        now = datetime.now(dt.tzinfo)
        if dt < now:
            found.add("past")
        elif dt > now:
            found.add("future")
        else:
            found.add("present")

    ordered = [tense for tense in ("past", "present", "future") if tense in found]
    return ordered or ["unknown"]

def generate_summary(text):
    """Summarize ``text`` with the module-level seq2seq model.

    The text is prefixed with ``"summarize: "``, tokenized to PyTorch
    tensors, and the first sequence of a 4-beam generation (``max_length``
    60) is decoded without special tokens.
    """
    prompt = "summarize: " + text
    encoded = summarizer_tokenizer(prompt, return_tensors="pt")
    generated = summarizer_model.generate(
        encoded.input_ids,
        max_length=60,
        num_beams=4,
        early_stopping=True,
    )
    best_sequence = generated[0]
    return summarizer_tokenizer.decode(best_sequence, skip_special_tokens=True)

# Mood -> trigger words/phrases (English plus Hindi/Hinglish).
_MOOD_KEYWORDS = {
    "happy": [
        "happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
        "maza aa gaya", "achha lag raha hai", "khush", "khushi", "badiya", "mast", "enjoy", "enjoyed", "mazedaar", "achha"
    ],
    "sad": [
        "sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
        "bura lag raha hai", "dukhi", "udaas", "rona", "rona aa gaya", "dil toot gaya", "nirash"
    ],
    "angry": [
        "angry", "annoyed", "frustrated", "irritated", "mad", "furious", "gussa", "gusse mein", "chidh", "naraz",
        "bhadak gaya", "chidh gaya", "irritate", "irritated"
    ],
    "nervous": [
        "nervous", "anxious", "scared", "worried", "fearful", "uneasy", "tensed", "tension", "ghabrahat", "chinta",
        "parishan", "dara hua", "ghabra gaya", "stress", "stressed"
    ],
    "unwell": [
        "sick", "unwell", "not feeling well", "fever", "cold", "headache", "flu", "ill", "nauseous", "dizzy",
        "thak gaya", "thaka hua", "bimaar", "bimar", "bukhar", "sardard", "beemar", "kamjor", "thakan"
    ],
    "neutral": [
        "ok", "fine", "theek", "normal", "usual", "routine", "nothing special", "kuch khaas nahi", "no stress"
    ],
}

# When several moods are detected the earliest one in this tuple wins.
_MOOD_PRIORITY = ("angry", "sad", "unwell", "nervous", "happy", "neutral")

# One whole-word alternation per mood, compiled once.
_MOOD_PATTERNS = {
    mood: re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(kw) for kw in keywords) + r")(?!\w)"
    )
    for mood, keywords in _MOOD_KEYWORDS.items()
}


def estimate_mood(text):
    """Estimate the mood expressed in ``text``.

    Keywords are matched as whole words/phrases against the lowercased text.
    The previous substring test misfired on common words: "ill" inside
    "will" reported ``unwell``, "mad" inside "made" reported ``angry`` and
    "ok" inside "book" reported ``neutral``. Inflected forms that are not
    listed (e.g. "enjoying") now fall through to the sentiment fallback.

    Args:
        text: Raw user text.

    Returns:
        One of ``"angry"``, ``"sad"``, ``"unwell"``, ``"nervous"``,
        ``"happy"`` or ``"neutral"``. With no keyword hit the result comes
        from ``get_sentiment_score``: above 0.2 is happy, below -0.2 is sad,
        anything else neutral.
    """
    text_lower = text.lower()

    # Walking the moods in priority order lets the first hit be the answer.
    for mood in _MOOD_PRIORITY:
        if _MOOD_PATTERNS[mood].search(text_lower):
            return mood

    # Use sentiment as a fallback if no mood keyword matched
    sentiment = get_sentiment_score(text)
    if sentiment > 0.2:
        return "happy"
    if sentiment < -0.2:
        return "sad"
    return "neutral"

def generate_tags(label, text):
    """Build a de-duplicated tag list for an entry.

    The tags are the classification ``label``, every alphabetic word of four
    or more letters in the lowercased text that is not a stopword, and a few
    context tags: ``sick``/``leave`` when the text mentions being unwell and
    ``work`` when it contains "work".

    Returns:
        A list of unique tags; the order is not defined.
    """
    # Small hand-written stopword list (lightweight and fast)
    ignored = {
        "or", "to", "also", "the", "and", "a", "an", "in", "on", "of", "for",
        "with", "at", "by", "from", "as", "is", "was", "are", "be", "will",
        "has", "have", "it", "this", "that", "but", "if", "not", "so", "do",
        "does", "did", "am", "can", "i", "me", "my", "you", "we", "they", "he", "she",
    }
    lowered = text.lower()

    # Alphabetic words with at least four letters, minus stopwords
    candidates = [
        word
        for word in re.findall(r'\b[a-zA-Z]{4,}\b', lowered)
        if word not in ignored
    ]

    # Context-driven tags
    contextual = []
    sickness_cues = ("sick", "unwell", "not feeling well", "fever")
    if any(cue in lowered for cue in sickness_cues):
        contextual.extend(["sick", "leave"])
    if "work" in lowered:
        contextual.append("work")

    unique_tags = {label, *contextual, *candidates}
    return list(unique_tags)

# Detect language using langdetect
def detect_language(text):
    """Return the most probable language of ``text``.

    Returns:
        ``{"lang": <code>, "prob": <probability rounded to 6 places>}`` for
        the top candidate, or ``{"lang": "unknown", "prob": 0}`` when
        langdetect yields no candidate or cannot analyse the text.
    """
    # Imported here so the file's import block stays untouched.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        langs = detect_langs(text)  # returns list like: [en:0.99, hi:0.01]
    except LangDetectException:
        # Raised for text without usable features (empty, digits only, ...);
        # previously this propagated out of the function.
        return {"lang": "unknown", "prob": 0}

    if langs:
        top_lang = langs[0]
        return {"lang": top_lang.lang, "prob": round(top_lang.prob, 6)}
    return {"lang": "unknown", "prob": 0}
    
# Detect sentiment using TextBlob
def get_sentiment_score(text):
    """Return the TextBlob polarity of ``text`` rounded to 3 decimals.

    Returns:
        A float in the range -1 to 1, or ``0.0`` when the analysis fails
        (best-effort, as before).
    """
    try:
        blob = TextBlob(text)
        return round(blob.sentiment.polarity, 3)  # Range: -1 to 1
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt
        # and SystemExit.
        return 0.0

# Infer intent based on label
def infer_intent(label, text):
    """Map a short classification label to an intent name.

    Args:
        label: Short label such as ``"task"`` or ``"reminder"``.
        text: Accepted for interface compatibility; not used.

    Returns:
        The mapped intent, or ``"other"`` for labels without a mapping.
    """
    intents_by_label = dict(
        out_of_office="taking_leave",
        sick_notice="taking_leave",
        reminder="set_reminder",
        event="log_event",
        meeting="schedule_meeting",
        note="log_note",
        journal="log_memory",
        memory="log_memory",
        status_update="status_update",
        task="create_task",
        celebration="log_event",
    )
    if label in intents_by_label:
        return intents_by_label[label]
    return "other"

# Place-type nouns that are recorded as places when they occur in the text.
_PLACE_KEYWORDS = [
    "garden", "hotel", "resort", "mall", "restaurant", "cafe", "market",
    "school", "college", "temple", "station", "airport", "hospital",
    "park", "store", "shop", "gym", "theater", "cinema", "bank", "office",
    "court", "salon", "studio", "museum", "library", "club", "university",
    "guest house", "hostel", "canteen", "clinic", "zoo", "residency", "apartment"
]

# Relationship words that are recorded as people when they occur in the text.
_RELATION_KEYWORDS = [
    # English
    "mom", "dad", "father", "mother", "sister", "brother", "sis", "bro",
    "uncle", "aunt", "aunty", "cousin", "grandfather", "grandmother",
    "grandpa", "grandma", "wife", "husband", "son", "daughter", "child",
    "kids", "baby", "partner", "fiancé", "fiancée", "in-laws", "relatives",
    "friend", "colleague", "buddy", "pal", "mate", "acquaintance", "companion",
    "girlfriend", "boyfriend", "lover", "spouse", "significant other",

    # Hindi & Hinglish
    "maa", "mummy", "papa", "pappa", "pitaji", "mataji", "didi", "behen", "bhai",
    "chacha", "chachi", "mama", "mami", "tau", "tai", "nana", "nani",
    "dada", "dadi", "sasur", "sasuma", "jija", "saali", "bhabhi", "devar",
    "nandoi", "patni", "pati", "bachcha", "baccha", "beta", "beti", "putra", "putri",
    "sambandhi", "rishtedaar", "saheli", "dost", "yara", "saathi"
]


# Extract entities using NER
def extract_entities(text):
    """Extract people, places, organizations, dates and misc items from text.

    Results of the module-level ``ner_pipeline`` are combined with keyword
    fallbacks: weekday names, a few fixed phrases, place nouns and
    relationship words.

    Args:
        text: Raw user text.

    Returns:
        A dict with the keys ``people``, ``places``, ``organizations``,
        ``dates`` and ``misc``; each value is a de-duplicated list in
        first-seen order.
    """
    entities = {"people": [], "places": [], "organizations": [], "dates": [], "misc": []}
    bucket_for_group = {
        "PER": "people",
        "LOC": "places",
        "ORG": "organizations",
        "DATE": "dates",
    }

    for ent in ner_pipeline(text):
        word = ent["word"].replace("##", "").strip()
        # Grouped entities are often multi-word ("New Delhi"). Judge the
        # letters only: str.isalpha() is False for any string containing a
        # space, which used to discard every multi-word entity.
        letters = word.replace(" ", "")
        if len(letters) <= 2 or not letters.isalpha():
            continue  # skip fragments and non-words
        entities[bucket_for_group.get(ent["entity_group"], "misc")].append(word)

    # Fallback: add weekday names if not already captured
    day_keywords = re.findall(
        r'\b(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b',
        text,
        re.IGNORECASE,
    )
    for day in day_keywords:
        if day not in entities["dates"]:
            entities["dates"].append(day)

    # Fallback: a few fixed phrases go to misc
    lower_text = text.lower()
    for phrase in ("product launch", "birthday", "project"):
        if phrase in lower_text:
            entities["misc"].append(phrase)

    # Keyword-based places, matched as whole words (optional plural "s") so
    # that "zoo" no longer fires on "zoom" or "park" on "sparkle".
    for place in _PLACE_KEYWORDS:
        if place in entities["places"]:
            continue
        if re.search(rf"\b{re.escape(place)}s?\b", lower_text):
            entities["places"].append(place)

    # Relationship words (English + Hindi) count as people
    for relation in _RELATION_KEYWORDS:
        if re.search(rf"\b{re.escape(relation)}\b", lower_text):
            entities["people"].append(relation)

    # De-duplicate while keeping first-seen order (set() order was arbitrary)
    return {kind: list(dict.fromkeys(values)) for kind, values in entities.items()}

# Function to calculate urgency score based on parsed dates
def get_urgency_score(text, parsed_dates):
    """Score how urgent an entry is, from 0.0 to 1.0.

    Args:
        text: Raw user text, searched for urgency phrases.
        parsed_dates: Iterable of date strings, normally the ISO-8601 strings
            produced by ``extract_dates_with_accuracy``.

    Returns:
        The highest applicable score, rounded to 2 decimals: 0.7 for an
        urgency phrase; 1.0, 0.8 or 0.5 for a date within the next 24, 72 or
        168 hours; 0.0 otherwise. Dates in the past add nothing.
    """
    urgency_keywords = [
        "urgent", "asap", "immediate", "must", "need to", "important",
        # Both apostrophe forms: only the curly one used to be recognised.
        "don’t forget", "don't forget",
        "right away",
    ]
    text_lower = text.lower()

    # 1. Keyword-based boost
    score = 0.7 if any(word in text_lower for word in urgency_keywords) else 0.0

    # 2. Time-based boost
    for raw in parsed_dates:
        try:
            # ISO strings round-trip exactly through the standard library.
            dt = datetime.fromisoformat(raw)
        except (TypeError, ValueError):
            # Anything else still goes through dateparser, as before.
            dt = dateparser.parse(raw)
        if not dt:
            continue
        # Take "now" in the value's own timezone so that naive and aware
        # datetimes are never subtracted from each other.
        hours = (dt - datetime.now(dt.tzinfo)).total_seconds() / 3600
        if 0 <= hours <= 24:
            score = max(score, 1.0)
        elif 24 < hours <= 72:
            score = max(score, 0.8)
        elif 72 < hours <= 168:
            score = max(score, 0.5)

    return round(score, 2)

# Function to get meta information about the text
def get_meta_info(text: str):
    """Return the word count of ``text`` plus details of the current time.

    Returns:
        A dict with ``word_count`` (whitespace-separated tokens),
        ``day_of_week`` and ``month`` (``strftime`` names), ``hour_of_day``
        and ``year``, all taken from ``datetime.now()``.
    """
    moment = datetime.now()
    words = text.strip().split()

    meta = {}
    meta["word_count"] = len(words)
    meta["day_of_week"] = moment.strftime('%A')  # e.g., "Thursday"
    meta["hour_of_day"] = moment.hour            # 0 to 23
    meta["month"] = moment.strftime('%B')        # e.g., "July"
    meta["year"] = moment.year
    return meta

# Month names (abbreviated or in full) and the words that usually sit next
# to a year ("in 2025", "by 2026", ...).
_YEAR_CONTEXT_RE = re.compile(
    r"\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?"
    r"|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?"
    r"|year|in|on|by|for)\b",
    re.IGNORECASE,
)


def is_year_context(text_snippet):
    """Tell whether a snippet contains a word suggesting a nearby year.

    Full month names are now recognised too: the earlier pattern listed only
    ``jan``, ``feb``, ``aug``, ... so "january 2025" was not seen as a date
    context. Matching is case-insensitive.

    Args:
        text_snippet: Text surrounding a year-like number.

    Returns:
        ``True`` when a month name or one of ``year``, ``in``, ``on``,
        ``by``, ``for`` occurs as a whole word.
    """
    return bool(_YEAR_CONTEXT_RE.search(text_snippet))

# A number with optional thousands separators and decimals: 1,200.50
_AMOUNT_NUM = r"(\d[\d,]*(?:\.\d+)?)"

# (pattern, currency code, unit). Group 1 is always the number; ``unit``
# names the scaling to apply ("lakh", "crore", "cent") or is None.
# Alphabetic currency markers are bounded so that they are not picked out
# of longer words ("cars 500", "10 credits", "5 centimeters").
_CURRENCY_PATTERNS = [
    # INR variants
    (re.compile(r"(?:₹|(?<![a-z])(?:rs\.?|inr))\s?" + _AMOUNT_NUM), "INR", None),
    (re.compile(_AMOUNT_NUM + r"\s?(?:₹|(?:rs\.?|inr)(?![a-z]))"), "INR", None),
    (re.compile(_AMOUNT_NUM + r"\s?(?:rupees?|rupaye|rupiye)\b"), "INR", None),
    # USD variants
    (re.compile(r"\$\s?" + _AMOUNT_NUM), "USD", None),
    (re.compile(_AMOUNT_NUM + r"\s?\$"), "USD", None),
    (re.compile(_AMOUNT_NUM + r"\s?dollars?\b"), "USD", None),
    (re.compile(_AMOUNT_NUM + r"\s?cents?\b"), "USD", "cent"),
    # EUR variants
    (re.compile(r"(?:€|(?<![a-z])eur)\s?" + _AMOUNT_NUM), "EUR", None),
    (re.compile(_AMOUNT_NUM + r"\s?€"), "EUR", None),
    (re.compile(_AMOUNT_NUM + r"\s?euros?\b"), "EUR", None),
    # GBP variants
    (re.compile(r"(?:£|(?<![a-z])gbp)\s?" + _AMOUNT_NUM), "GBP", None),
    (re.compile(_AMOUNT_NUM + r"\s?£"), "GBP", None),
    (re.compile(_AMOUNT_NUM + r"\s?pounds?\b"), "GBP", None),
    # INR large units
    (re.compile(_AMOUNT_NUM + r"\s?(?:lacs?|lakhs?)\b"), "INR", "lakh"),
    (re.compile(_AMOUNT_NUM + r"\s?(?:crores?|crs?)\b"), "INR", "crore"),
]

# Used only when no explicit currency was found: a number shortly after a
# spending verb, then any 3-9 character number.
_FALLBACK_AMOUNT_PATTERNS = [
    re.compile(
        r"\b(?:paid|spent|buy|purchase|cost|price|add(?:ed)?|gift(?:ed)?|bill(?: of)?|recharge(?:d)?|charged|transfer(?:red)?)\b[^0-9]{0,10}(\d[\d,]*(?:\.\d+)?)"
    ),
    re.compile(r"\b(\d[\d,]{2,8})\b\s?(?:rs|inr)?"),
]


# Function to extract amounts in various currencies from text
def extract_amounts(text: str):
    """Extract monetary amounts from free-form text.

    Explicit currency mentions (INR, USD, EUR, GBP, plus lakh/crore/cent
    units) are collected first. Only when none is found, a single best-guess
    amount is taken from a number near a spending verb or from the first
    3-9 character number, and reported as INR.

    Differences from the earlier implementation:
      * "5 lac" and "1 cent" are scaled (only "lacs"/"lakh(s)" and "cents"
        were handled, although the patterns also matched the singular).
      * Unit words and currency codes must be whole tokens.
      * Word-unit patterns accept thousands separators, so "1,200 rupees"
        is 1200 rather than 200.

    Args:
        text: Raw user text; matching is done on a lowercased copy.

    Returns:
        A list of ``{"value": float, "currency": str}`` dicts, unique per
        (value, currency), ordered by pattern and then by position.
    """
    results = []
    seen = set()
    text_lower = text.lower()

    for pattern, currency_code, unit in _CURRENCY_PATTERNS:
        for match in pattern.finditer(text_lower):
            digits = match.group(1).replace(",", "")
            # Ignore phone numbers and IDs (10+ characters)
            if len(digits) >= 10:
                continue
            try:
                number = float(digits)
            except ValueError:
                continue

            if unit == "lakh":
                number *= 100_000
            elif unit == "crore":
                number *= 10_000_000
            elif unit == "cent":
                number /= 100

            key = (number, currency_code)
            if key in seen:
                continue
            seen.add(key)
            results.append({"value": round(number, 2), "currency": currency_code})

    if results:
        return results

    # Fallback matching for generic numeric phrases near expense keywords
    for fallback_pattern in _FALLBACK_AMOUNT_PATTERNS:
        match = fallback_pattern.search(text_lower)
        if not match:
            continue
        digits = match.group(1).replace(",", "")
        # Ignore phone numbers and IDs
        if len(digits) >= 10:
            continue
        try:
            number = float(digits)
        except ValueError:
            continue

        # A number in the 2020-2100 range next to a date word is a year
        if 2020 <= number <= 2100:
            start, end = match.span(1)
            surrounding = text_lower[max(0, start - 30):end + 30]
            if is_year_context(surrounding):
                continue

        results.append({"value": round(number, 2), "currency": "INR"})
        break  # Only the first fallback hit is reported

    return results


# Expense category -> keywords (English plus Hindi/Hinglish).
_CATEGORY_KEYWORDS = {
    "food": [
        "food", "lunch", "dinner", "breakfast", "snacks", "swiggy", "zomato", "dominos", "pizza", "kfc", "mcdonald",
        "restaurant", "hotel", "cafe", "canteen", "meal", "buffet", "thali", "tiffin", "order", "takeaway", "parcel",
        "eat", "eating", "brunch", "supper", "kitchen", "cook", "cooking", "chef", "dish", "dishes", "menu", "serve",
        "served", "serving", "food court", "food delivery", "delivery", "online order", "food app", "food bill",
        "beverage", "juice", "shake", "smoothie", "coffee", "tea", "chai", "cold drink", "soft drink", "soda", "water bottle",
        "ice cream", "dessert", "sweet", "sweets", "chocolate", "candy", "bakery", "bread", "cake", "pastry", "cookie",
        "biscuit", "chips", "fries", "burger", "sandwich", "roll", "wrap", "noodles", "pasta", "rice", "biryani", "curry",
        "gravy", "dal", "sabzi", "roti", "naan", "paratha", "chapati", "idli", "dosa", "vada", "sambar", "chutney", "samosa",
        "pakora", "chaat", "pani puri", "golgappa", "sev", "poha", "upma", "maggi", "maggie", "momos", "spring roll",
        "manchurian", "paneer", "butter chicken", "tandoori", "kebab", "shawarma", "pizza hut", "subway", "starbucks",
        # Hindi/Hinglish
        "khana", "nashta", "bhojan", "rasoi", "thali", "dabba", "tiffin", "chai", "paani", "jal", "kharcha khana",
        "khane ka bill", "khane ka paisa", "khane ki cheez", "khana order", "khana mangwaya", "khana khaya", "khana khud banaya",
        "khana kharch", "khana kharida", "khana diya", "khana laya", "khana banaya"
    ],
    "transport": [
        "uber", "ola", "taxi", "cab", "bus", "train", "metro", "flight", "auto", "rickshaw", "car", "gaadi", "yatra", "safar", "travel", "ticket", "plane", "udaan", "station", "airport", "rapido",
    ],
    "shopping": [
        "amazon", "flipkart", "myntra", "shopping", "clothes", "kapde", "apparel", "shoes", "jeans", "tshirt", "store", "fashion", "dukaan", "mall", "bazaar", "market", "kharida", "order diya", "le liya"
    ],
    "housing": [
        "rent", "apartment", "house", "ghar", "flat", "maintenance", "landlord", "kiraya", "makaan", "room", "hostel", "pg", "society"
    ],
    "utilities": [
        "electricity", "power", "bijli", "water", "pani", "gas", "bill", "recharge", "broadband", "wifi", "airtel", "jio", "phone", "mobile", "internet", "light", "cylinder", "connection"
    ],
    "entertainment": [
        "movie", "netflix", "hotstar", "bookmyshow", "spotify", "gaming", "youtube premium", "cinema", "film", "picture", "game", "khel", "manoranjan", "show", "concert"
    ],
    "health": [
        "medicine", "hospital", "doctor", "clinic", "pharmacy", "tablet", "surgery", "checkup", "dawai", "aspatal", "ilaaj", "health", "bimari", "test", "medical", "pathology", "chemist"
    ],
    "travel": [
        "trip", "travel", "tour", "vacation", "hotel", "airbnb", "booking.com", "goibibo", "makemytrip", "yatra", "safar", "holiday", "journey", "musafir", "booking", "trip kiya"
    ],
    "education": [
        "course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill", "padhai", "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
    ],
    "digital_services": [
        "domain", "membership", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas", "subscription", "digital", "online", "app", "service", "renewal"
    ],
    "gifts_donations": [
        "gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift", "uphaar", "daan", "tohfa", "chanda", "puja", "mandir", "gurudwara"
    ],
    "finance": [
        "insurance", "sip", "mutual fund", "stock", "demat", "zerodha", "investment", "trading", "upstox", "crypto", "policy", "premium", "loan", "emi", "fd", "rd", "paisa", "bank", "account"
    ],
    "family_kids": [
        "kid", "baby", "school", "daycare", "tuition", "books", "uniform", "toys", "creche", "baccha", "bachche", "parivar", "family", "beti", "beta", "child", "children"
    ],
    "stationery": [
        "pen", "pencil", "notebook", "diary", "eraser", "sharpener", "paper", "stationery", "register", "files", "file", "markers", "highlighter", "sticky notes", "geometry box",
        "stapler", "ink", "printer paper", "stationary shop", "stationary", "copy", "kagaz", "likhne ka saman"
    ],
}

# One whole-word pattern per distinct keyword (an optional plural "s" is
# accepted), compiled once. dict.fromkeys drops keywords repeated within a
# category so they are not counted twice.
_CATEGORY_PATTERNS = {
    category: [
        re.compile(rf"(?<!\w){re.escape(keyword)}s?(?!\w)")
        for keyword in dict.fromkeys(keywords)
    ]
    for category, keywords in _CATEGORY_KEYWORDS.items()
}


def predict_expense_category(text, detected_stores):
    """Pick the most likely expense category for ``text``.

    A detected store decides the category outright. Otherwise each category
    is scored by how many of its keywords occur in the lowercased text as
    whole words. The earlier substring test let "pen" score on "spent",
    "car" and "rd" on "card", and "eat"/"tea" on "great team".

    Args:
        text: Raw user text.
        detected_stores: List of ``{"store", "category"}`` dicts (an optional
            ``"confidence"`` key is honoured); may be empty.

    Returns:
        The category of the best store when any store was detected,
        otherwise the highest-scoring category (ties go to the category
        defined first), or ``"miscellaneous"`` when nothing matches.
    """
    # 1. Use detected store category if available
    if detected_stores:
        best_store = max(detected_stores, key=lambda s: s.get("confidence", 1.0))
        return best_store["category"]

    # 2. Match using keyword scores
    text_lower = text.lower()
    scores = {
        category: sum(1 for pattern in patterns if pattern.search(text_lower))
        for category, patterns in _CATEGORY_PATTERNS.items()
    }
    best_category, best_score = max(scores.items(), key=lambda item: item[1])

    if best_score > 0:
        return best_category
    return "miscellaneous"



def insert_text_entry(data):
    """Insert one analysed entry into ``user_entries``.

    The insert is best-effort: any failure is printed and swallowed, as
    before. The cursor and connection are now always released; they used to
    leak whenever ``execute`` or ``commit`` raised.

    Args:
        data: Mapping with one key per inserted column (see the column list
            in the query). The values under ``confidence_scores``,
            ``language``, ``stores``, ``entities``, ``amounts`` and
            ``raw_json`` are wrapped in ``Json`` for the JSONB columns.
    """
    insert_query = """
            INSERT INTO user_entries (
                uuid, user_id, raw_text, word_count, day_of_week, hour_of_day, month, year,
                type, expense_type, intent, confidence_scores, urgency_score,
                time_mentions, parsed_dates, tense, summary,
                people, mood, language, sentiment_score, tags,
                action_required, entities, amounts, stores, processing_time_ms, raw_json
            ) VALUES (
                %(uuid)s, %(user_id)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
                %(type)s, %(expense_type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
                %(time_mentions)s, %(parsed_dates)s, %(tense)s, %(summary)s,
                %(people)s, %(mood)s, %(language)s, %(sentiment_score)s, %(tags)s,
                %(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s, %(raw_json)s
            )
            ON CONFLICT (uuid) DO NOTHING;
        """

    conn = None
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        try:
            cur.execute(insert_query, {
                **data,
                "confidence_scores": Json(data["confidence_scores"]),
                "language": Json(data["language"]),
                "stores": Json(data["stores"]),
                "entities": Json(data["entities"]),
                "amounts": Json(data["amounts"]),
                "raw_json": Json(data["raw_json"])
            })
            conn.commit()
        finally:
            cur.close()
        print("✅ Data inserted successfully")

    except Exception as e:
        print("❌ Failed to insert data:", e)

    finally:
        # Closing without a commit discards the unfinished transaction.
        if conn is not None:
            conn.close()
    


@app.get("/health")
def health_check():
    """Liveness probe: return a fixed greeting payload."""
    payload = {"message": "✅ Hello from yourpartner/demospace — API is running!"}
    return payload

@app.exception_handler(404)
async def not_found_handler(request: Request, exc):
    """Answer 404 errors with a small JSON error body."""
    body = {"error": "Route not found"}
    return ORJSONResponse(content=body, status_code=404)

@app.exception_handler(500)
async def internal_error_handler(request: Request, exc):
    """Answer 500 errors with a JSON body that includes the exception text.

    NOTE(review): the exception text is sent to the client as-is; confirm
    that exposing internal error details is acceptable here.
    """
    detail = str(exc)
    body = {"error": "Internal server error: " + detail}
    return ORJSONResponse(content=body, status_code=500)

# Search endpoint to filter user entries based on various criteria
@app.get("/search", response_class=ORJSONResponse)
async def search_entries(
    userid: str = Header(..., description="User ID"),
    tags: str = "",
    query: str = "",
    startDate: str = "",
    endDate: str = "",
    type: str = ""
):
    """Return up to 50 of a user's entries, newest first, after filtering.

    Args:
        userid: Required ``userid`` header; only this user's rows are read.
        tags: Comma-separated tags; rows sharing at least one tag match.
        query: Case-insensitive substring looked up in ``raw_text`` and
            ``summary``.
        startDate: Inclusive lower bound on ``created_at``, ``DD-MM-YYYY``.
        endDate: Inclusive upper bound on ``created_at``, ``DD-MM-YYYY``.
            The whole end day is included; it used to stop at 00:00.
        type: Exact entry type.

    Returns:
        ``{"results": [...]}`` without the ``raw_json`` column, a 400
        response for a missing ``userid`` or a malformed date, or a 500
        response carrying the database error text.

    NOTE(review): psycopg2 calls are blocking, so this coroutine holds the
    event loop for the duration of the query.
    """
    # Imported here so the file's import block stays untouched.
    from datetime import timedelta

    # Validate user_id from header
    if not userid or not userid.strip():
        return ORJSONResponse(status_code=400, content={"error": "Missing or empty userid header."})

    # Build SQL filters; every value is passed as a bound parameter
    filters = ["user_id = %s"]
    params = [userid]

    if type:
        filters.append("type = %s")
        params.append(type)

    # Input made only of separators (",") carries no tags: add no filter
    # rather than comparing against an empty array.
    tag_list = [t.strip() for t in tags.split(",") if t.strip()]
    if tag_list:
        filters.append("tags && %s")
        params.append(tag_list)

    if query:
        filters.append("(raw_text ILIKE %s OR summary ILIKE %s)")
        params.extend([f"%{query}%", f"%{query}%"])

    if startDate:
        try:
            start_dt = datetime.strptime(startDate, "%d-%m-%Y")
        except ValueError:
            return ORJSONResponse(status_code=400, content={"error": "Invalid startDate format. Use DD-MM-YYYY."})
        filters.append("created_at >= %s")
        params.append(start_dt)

    if endDate:
        try:
            end_dt = datetime.strptime(endDate, "%d-%m-%Y")
        except ValueError:
            return ORJSONResponse(status_code=400, content={"error": "Invalid endDate format. Use DD-MM-YYYY."})
        # The parsed value is midnight at the START of the end date; compare
        # against the next midnight so the end day itself is included.
        filters.append("created_at < %s")
        params.append(end_dt + timedelta(days=1))

    where_clause = " AND ".join(filters)
    query_sql = f"SELECT * FROM user_entries WHERE {where_clause} ORDER BY created_at DESC LIMIT 50"

    conn = None
    try:
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        try:
            cur.execute(query_sql, tuple(params))
            rows = cur.fetchall()
            columns = [desc[0] for desc in cur.description]
        finally:
            cur.close()
    except Exception as e:
        return ORJSONResponse(status_code=500, content={"error": str(e)})
    finally:
        # Runs on success and on failure; the connection used to leak when
        # the query raised.
        if conn is not None:
            conn.close()

    entries = [dict(zip(columns, row)) for row in rows]
    # Remove raw_json from each entry in results
    for entry in entries:
        entry.pop("raw_json", None)

    return ORJSONResponse(content={"results": entries})

def _dashboard_field(entry, key, default):
    """Return ``entry[key]``, or *default* when it is missing, NULL or empty.

    Rows built from ``SELECT *`` always contain every column, so a plain
    ``dict.get(key, default)`` never falls back for NULL values; this does.
    """
    value = entry.get(key)
    return default if value is None or value == "" else value


def _dashboard_entry_amount(entry):
    """Sum the ``value`` fields found in one entry's ``amounts`` list.

    NOTE(review): assumes ``amounts`` is a list of dicts carrying a numeric
    ``value``; items without one are counted as 0 rather than raising.
    """
    total = 0
    for amount in entry.get("amounts") or []:
        if isinstance(amount, dict):
            total += amount.get("value") or 0
    return total


def _dashboard_fetch_entries(user_id):
    """Load every ``user_entries`` row for *user_id* as a list of dicts.

    The connection is always closed, even when the query fails.
    """
    conn = psycopg2.connect(DATABASE_URL)
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT * FROM user_entries WHERE user_id = %s", (user_id,))
            rows = cur.fetchall()
            columns = [desc[0] for desc in cur.description]
    finally:
        conn.close()
    return [dict(zip(columns, row)) for row in rows]


def _dashboard_build(entries):
    """Aggregate a user's entries into the dashboard payload."""
    expenses = [e for e in entries if e["type"] == "expense"]
    # Compute each expense's amount once; every expense section reuses it.
    expense_amounts = [_dashboard_entry_amount(e) for e in expenses]

    expense_by_category = {}
    monthly_trends = {}
    store_stats = {}
    day_stats = {}
    hour_stats = {}
    for e, amt in zip(expenses, expense_amounts):
        # Section 1: Expense Overview (per category and per month)
        cat = _dashboard_field(e, "expense_type", "miscellaneous")
        expense_by_category[cat] = expense_by_category.get(cat, 0) + amt
        month_key = f"{e['month']}-{e['year']}"
        monthly_trends[month_key] = monthly_trends.get(month_key, 0) + amt

        # Section 2: Top Stores. Each store mentioned in an entry is credited
        # with that entry's full amount.
        for s in (e["stores"] or []):
            store = s.get("store", "unknown")
            stats = store_stats.setdefault(store, {"count": 0, "total": 0})
            stats["count"] += 1
            stats["total"] += amt

        # Section 6: Time Analysis
        day = _dashboard_field(e, "day_of_week", "unknown")
        hour = _dashboard_field(e, "hour_of_day", 0)
        day_stats[day] = day_stats.get(day, 0) + amt
        hour_stats[hour] = hour_stats.get(hour, 0) + amt

    top_categories = sorted(expense_by_category.items(), key=lambda x: x[1], reverse=True)

    # Section 3: Recent Expenses. Newest first; rows without a timestamp sort
    # last instead of breaking the comparison.
    newest_first = sorted(
        expenses,
        key=lambda e: (e.get("created_at") is not None, e.get("created_at")),
        reverse=True,
    )
    # Leave out raw_json, as the search and analyze responses do.
    recent_expenses = [
        {k: v for k, v in e.items() if k != "raw_json"} for e in newest_first[:7]
    ]

    # Sections 4, 5 and 7: mood, tag and type distributions over all entries
    mood_dist = {}
    tag_freq = {}
    type_dist = {}
    for e in entries:
        mood = _dashboard_field(e, "mood", "neutral")
        mood_dist[mood] = mood_dist.get(mood, 0) + 1
        for tag in (e["tags"] or []):
            tag_freq[tag] = tag_freq.get(tag, 0) + 1
        t = _dashboard_field(e, "type", "other")
        type_dist[t] = type_dist.get(t, 0) + 1
    top_tags = sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)[:15]

    return {
        "expense_overview": {
            "total_expense": sum(expense_amounts),
            "expense_count": len(expenses),
            "expense_by_category": expense_by_category,
            "monthly_trends": monthly_trends,
        },
        "top_stores": store_stats,
        "top_categories": top_categories,
        "recent_expenses": recent_expenses,
        "mood_distribution": mood_dist,
        "top_tags": top_tags,
        "time_analysis": {
            "by_day": day_stats,
            "by_hour": hour_stats,
        },
        "meta_info": {
            "entry_count": len(entries),
            "type_distribution": type_dist,
        },
    }


@app.get("/visualyse/{user_id}", response_class=ORJSONResponse)
async def visualyse_dashboard(user_id: str):
    """Return dashboard statistics computed from all entries of *user_id*.

    The payload contains expense totals (overall, per category, per month,
    per store, per weekday and per hour), the seven most recent expenses,
    mood / tag / type distributions and the total entry count.

    Responds with HTTP 500 and ``{"error": ...}`` when the database query
    fails.
    """
    try:
        # psycopg2 is blocking; run it off the event loop.
        entries = await asyncio.to_thread(_dashboard_fetch_entries, user_id)
    except Exception as e:
        return ORJSONResponse(status_code=500, content={"error": str(e)})

    return ORJSONResponse(content=_dashboard_build(entries))

@app.post("/analyze", response_class=ORJSONResponse)
async def analyze(input: TextInput):
    """Classify and enrich a free-text entry, store it, and return the analysis.

    The text is classified against the module-level ``labels`` and the winning
    label is adjusted by a few keyword heuristics. Amounts, dates, summary,
    entities, mood, tags, language, sentiment, intent, urgency and stores are
    then extracted by the module's helper functions. The assembled record is
    inserted via ``insert_text_entry`` with a snapshot of itself under
    ``raw_json``; the response is the same record without ``raw_json``.
    """
    start_time = time.perf_counter()  # monotonic clock: safe for measuring durations

    text = input.text

    # Maps the descriptive labels the classifier returns to the short type names.
    label_map = {
        "task (something to be done or completed)": "task",
        "event (an activity that is happening or has happened)": "event",
        "reminder (a message to remember something in the future)": "reminder",
        "meeting (a planned gathering between people to discuss something)": "meeting",
        "relationship (message about personal or emotional connection with someone)": "relationship",
        "note (general note or quick thought not related to any specific category)": "note",
        "journal (personal reflection or emotional writing about one's day or thoughts)": "journal",
        "memory (recollection or recording of a past moment or experience)": "memory",
        "status_update (current condition, feeling, or situation being shared)": "status_update",
        "sick_notice (informing about illness or not feeling well)": "sick_notice",
        "out_of_office (message about being unavailable for work or responsibilities)": "out_of_office",
        "travel_plan (planning or mentioning a trip or journey)": "travel_plan",
        "celebration (message about a festive occasion, party or achievement)": "celebration",
        "expense (money spent on something, either small or large)": "expense",
        "news (update about public events, announcements, or current affairs)": "news",
        "information (factual content or informative message not tied to user activity)": "information",
        "purchase (buying or ordering something, like a product or service)": "purchase",
        "other (does not clearly fall into any specific category)": "other"
    }

    # The classifier is blocking, so it runs in a worker thread.
    classification = await asyncio.to_thread(classifier, text, labels, hypothesis_template="This entry is about {}.")
    best_label = classification['labels'][0]

    best_label = label_map.get(best_label, best_label)
    amounts = await asyncio.to_thread(extract_amounts, text)

    # Lower-case once; every keyword heuristic below matches case-insensitively.
    lowered_text = text.lower()

    # A "task" that mentions money or an expense keyword is really an expense.
    if (
        best_label == "task"
        and (any(word in lowered_text for word in expense_keywords) or amounts)
    ):
        best_label = "expense"

    # Purchases are folded into expenses.
    if best_label == "purchase":
        best_label = "expense"

    # Reporting verbs suggest news rather than something the user must act on.
    if any(word in lowered_text for word in ("reported", "announced", "collapsed")):
        if best_label in ["task", "reminder", "event"]:
            best_label = "news"

    scores = dict(zip(classification['labels'], classification['scores']))
    # Convert to short labels
    confidence_scores_full = {
        label_map.get(label, label): score
        for label, score in scores.items()
    }
    # Only keep top 2
    confidence_scores = dict(sorted(confidence_scores_full.items(), key=lambda x: x[1], reverse=True)[:2])

    parsed_dates, time_mentions = await asyncio.to_thread(extract_dates_with_accuracy, text, amounts)
    tenses = detect_tense(parsed_dates)
    summary = await asyncio.to_thread(generate_summary, text)
    mood = estimate_mood(text)
    tags = generate_tags(best_label, text)
    language_detected = detect_language(text)
    sentiment_score = get_sentiment_score(text)
    if sentiment_score is None or sentiment_score == "":
        sentiment_score = 0.0

    entities = await asyncio.to_thread(extract_entities, text)
    people = entities["people"]  # Extracted people entities
    intent = infer_intent(best_label, text)
    urgency_score = get_urgency_score(text, parsed_dates)
    detected_stores = detect_store_category(text)

    # "purchase" was already remapped to "expense" above, so one check suffices.
    expense_category = ""
    if best_label == "expense":
        expense_category = predict_expense_category(text, detected_stores)

    # Define action triggers
    ACTION_TRIGGERS = ["plan", "organize", "schedule", "remember", "book", "call", "follow up", "need to"]
    mentions_action = any(word in lowered_text for word in ACTION_TRIGGERS)
    action_required = urgency_score >= 0.6 or mentions_action
    meta = get_meta_info(text)

    processing_time_ms = round((time.perf_counter() - start_time) * 1000)

    result = {
        "uuid": str(uuid.uuid4()),  # Unique identifier for this entry
        "user_id": input.user_id,  # Identifier of the user who submitted the text
        "raw_text": text,
        "word_count": meta["word_count"],
        "day_of_week": meta["day_of_week"],
        "hour_of_day": meta["hour_of_day"],
        "month": meta["month"],
        "year": meta["year"],
        "type": best_label,
        "expense_type": expense_category,
        "intent": intent,
        "confidence_scores": confidence_scores,
        "urgency_score": urgency_score,
        "time_mentions": time_mentions,
        "parsed_dates": parsed_dates,
        "tense": tenses,
        "summary": summary.removeprefix("summary:").strip(),
        "people": people,
        "mood": mood,
        "language": language_detected,
        "sentiment_score": sentiment_score,
        "tags": tags,
        "action_required": action_required,
        "entities": entities,
        "amounts": amounts,
        "stores": detected_stores,
        "processing_time_ms": processing_time_ms
    }

    # Attach a snapshot of the record for storage. The copy is taken before
    # the key is assigned, so the snapshot never contains itself.
    result["raw_json"] = dict(result)

    # Insert into database
    await asyncio.to_thread(insert_text_entry, result)

    # Log the result
    print("✅ Analysis complete")

    # Remove raw_json from response
    result.pop("raw_json", None)

    # Return the result as JSON response
    return ORJSONResponse(content=result)