VladGeekPro Copilot committed on
Commit
144bf42
·
1 Parent(s): 7b59e3d

DeletedDBAgentAndCreatedExpensePredictor

Browse files

Co-authored-by: Copilot <copilot@github.com>

Files changed (4) hide show
  1. app.py +16 -11
  2. expense_predictor.py +102 -0
  3. requirements.txt +0 -2
  4. sql_generator.py +0 -111
app.py CHANGED
@@ -25,7 +25,7 @@ from extractors import (
25
  ExpenseUserExtractor,
26
  ExpenseAmountExtractor,
27
  )
28
- from sql_generator import generate_sql
29
 
30
 
31
  # HuggingFace Token (если нужен для моделей)
@@ -576,10 +576,10 @@ def index():
576
  """Главная страница API."""
577
  return jsonify({
578
  "status": "ok",
579
- "message": "Voice processing API is running",
580
  "endpoints": {
581
  "POST /process-audio": "Process audio file",
582
- "POST /generate-sql": "Generate SQLite SELECT query from natural language",
583
  "GET /health": "Health check",
584
  "GET /test-data": "Run text-only extraction tests"
585
  }
@@ -674,16 +674,21 @@ def process_audio():
674
  os.unlink(temp_path)
675
 
676
 
677
- @app.post("/generate-sql")
678
- def generate_sql_endpoint():
679
- """Генерирует SQL по текстовому запросу и схеме БД."""
680
  payload = parse_json_payload()
681
- query = payload.get("query") or payload.get("text") or ""
682
- limit = payload.get("limit") or 200
683
-
 
 
684
  try:
685
- sql = generate_sql(question=query, limit=int(limit))
686
- return jsonify({"sql": sql})
 
 
 
687
  except Exception as exception:
688
  return jsonify({"status": "error", "message": str(exception)}), 422
689
 
 
25
  ExpenseUserExtractor,
26
  ExpenseAmountExtractor,
27
  )
28
+ from expense_predictor import predict_expenses
29
 
30
 
31
  # HuggingFace Token (если нужен для моделей)
 
576
  """Главная страница API."""
577
  return jsonify({
578
  "status": "ok",
579
+ "message": "Expense Processing API is running",
580
  "endpoints": {
581
  "POST /process-audio": "Process audio file",
582
+ "POST /predict-expenses": "Predict next 3 expenses based on history",
583
  "GET /health": "Health check",
584
  "GET /test-data": "Run text-only extraction tests"
585
  }
 
674
  os.unlink(temp_path)
675
 
676
 
677
@app.post("/predict-expenses")
def predict_expenses_endpoint():
    """Suggest the top 3 expenses to add, based on the posted 6-month history.

    Expects a JSON body with an "expenses" list; responds with
    {"status": "ok", "predictions": [...]} or a 422 error payload.
    """
    payload = parse_json_payload()
    history = payload.get("expenses") or []

    # Reject non-list payloads up front with a clear message.
    if not isinstance(history, list):
        return jsonify({"status": "error", "message": "expenses must be a list"}), 422

    try:
        suggestions = predict_expenses(history)
        return jsonify({
            "status": "ok",
            "predictions": suggestions,
        })
    except Exception as exception:
        # API boundary: surface any prediction failure as a 422 response.
        return jsonify({"status": "error", "message": str(exception)}), 422
694
 
expense_predictor.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Expense prediction model: suggests next expenses based on 6-month history.

- Input: a list of expense records (dicts with "date", "sum",
  "supplier_id" and "user_id" keys).
- Output: up to 3 predicted expenses (date, sum, supplier, user),
  ordered by descending confidence.
"""

import logging
import statistics
from collections import defaultdict
from datetime import datetime, timedelta

logger = logging.getLogger(__name__)

# A (supplier_id, user_id) pair must account for at least this share of
# all records to be treated as a recurring expense worth predicting.
_MIN_FREQUENCY = 0.5

# Fallback interval (days) when a candidate has only one dated record:
# assume a monthly cadence.
_DEFAULT_INTERVAL_DAYS = 30


def predict_expenses(expenses: list[dict]) -> list[dict]:
    """Predict up to 3 expenses the user is likely to add next.

    Args:
        expenses: expense history, e.g.
            [{"date": "2026-01-15", "sum": 150.50,
              "supplier_id": 5, "user_id": 1, ...}, ...]

    Returns:
        Up to three predictions sorted by descending confidence:
        [{"date": str, "sum": float, "supplier_id": int,
          "user_id": int, "confidence": float}, ...]
        Returns [] when there are fewer than two records or no
        (supplier, user) pair is frequent enough.

    Raises:
        KeyError: if a record is missing "date", "sum", "supplier_id"
            or "user_id".
        ValueError: if a "date" value is not ISO-8601 formatted.
    """
    if not expenses or len(expenses) < 2:
        logger.info("[PREDICT] Not enough records: %d",
                    len(expenses) if expenses else 0)
        return []

    total_records = len(expenses)
    logger.info("[PREDICT] Total records received: %d", total_records)

    # Group the history by the (supplier_id, user_id) pair.
    supplier_user_history: defaultdict[tuple, list[dict]] = defaultdict(list)
    for exp in expenses:
        supplier_user_history[(exp["supplier_id"], exp["user_id"])].append(exp)

    logger.info("[PREDICT] Unique (supplier, user) pairs: %d",
                len(supplier_user_history))
    for (supplier_id, user_id), records in supplier_user_history.items():
        logger.debug("[PREDICT] supplier_id=%s, user_id=%s -> %d records (%.1f%%)",
                     supplier_id, user_id, len(records),
                     len(records) / total_records * 100)

    # Keep only pairs that appear in at least 50% of all records.
    candidates = {
        key: records
        for key, records in supplier_user_history.items()
        if len(records) / total_records >= _MIN_FREQUENCY
    }
    logger.info("[PREDICT] Candidates after frequency filter: %d",
                len(candidates))
    if not candidates:
        logger.info("[PREDICT] No candidates passed the frequency filter.")
        return []

    # Analyze each candidate: average amount, cadence, last occurrence.
    predictions = []
    for (supplier_id, user_id), records in candidates.items():
        amounts = [float(r["sum"]) for r in records]
        avg_amount = statistics.mean(amounts)

        # Average interval (days) between consecutive transactions.
        dates = sorted(datetime.fromisoformat(r["date"]) for r in records)
        if len(dates) >= 2:
            intervals = [(b - a).days for a, b in zip(dates, dates[1:])]
            avg_interval = statistics.mean(intervals)
        else:
            avg_interval = _DEFAULT_INTERVAL_DAYS

        last_date = dates[-1]
        next_predicted_date = (
            last_date + timedelta(days=avg_interval)
        ).strftime("%Y-%m-%d")

        # Confidence blends amount consistency (low std dev relative to
        # the mean) with how often the pair occurs in the history.
        amount_std = statistics.stdev(amounts) if len(amounts) > 1 else 0
        consistency = (max(0, 1 - (amount_std / avg_amount))
                       if avg_amount > 0 else 0.5)
        frequency_score = min(len(records) / total_records, 1.0)
        confidence = (consistency + frequency_score) / 2

        logger.debug(
            "[PREDICT] supplier_id=%s, user_id=%s | avg_amount=%.2f, "
            "avg_interval=%.1fd, last_date=%s, next_date=%s, confidence=%.2f",
            supplier_id, user_id, avg_amount, avg_interval,
            last_date.date(), next_predicted_date, confidence,
        )

        predictions.append({
            "date": next_predicted_date,
            "sum": round(avg_amount, 2),
            "supplier_id": supplier_id,
            "user_id": user_id,
            "confidence": round(confidence, 2),
        })

    # Highest-confidence predictions first; cap at three.
    result = sorted(predictions, key=lambda p: p["confidence"], reverse=True)[:3]
    logger.info("[PREDICT] Final top %d predictions", len(result))
    return result
requirements.txt CHANGED
@@ -9,5 +9,3 @@ python-dateutil
9
  iuliia
10
  scikit-learn
11
  sentencepiece
12
- transformers==4.41.2
13
- torch==2.3.1
 
9
  iuliia
10
  scikit-learn
11
  sentencepiece
 
 
sql_generator.py DELETED
@@ -1,111 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- from dataclasses import dataclass
5
- from typing import Any
6
-
7
-
8
- # Compact Spider-style schema: only business tables, no Laravel internals.
9
- # Format: table col type , col type | table2 col type , col type
10
- # Foreign keys annotated inline for model guidance.
11
- DEFAULT_DB_SCHEMA = (
12
- "users : id int , name varchar , email varchar , created_at datetime , updated_at datetime | "
13
- "categories : id int , name varchar , slug varchar , notes text , created_at datetime , updated_at datetime | "
14
- "suppliers : id int , name varchar , slug varchar , category_id int , created_at datetime , updated_at datetime | "
15
- "expenses : id int , user_id int , date date , category_id int , supplier_id int , sum numeric , notes text , created_at datetime , updated_at datetime | "
16
- "debts : id int , date date , user_id int , debt_sum numeric , overpayment_id int , notes text , payment_status varchar , partial_sum numeric , date_paid date , created_at datetime , updated_at datetime | "
17
- "overpayments : id int , user_id int , sum numeric , notes text , created_at datetime , updated_at datetime | "
18
- "paid_debts : id int , debt_id int , changed_debt_date date , paid_by_user_id int , payment_status varchar , paid_sum numeric , created_at datetime , updated_at datetime | "
19
- "expense_change_requests : id int , expense_id int , user_id int , action_type varchar , current_date date , current_user_id int , current_category_id int , current_supplier_id int , current_sum numeric , requested_date date , requested_user_id int , requested_category_id int , requested_supplier_id int , requested_sum numeric , notes text , status varchar , applied_at datetime , created_at datetime , updated_at datetime | "
20
- "expense_change_request_votes : id int , expense_change_request_id int , user_id int , vote varchar , notes text , created_at datetime , updated_at datetime"
21
- )
22
-
23
- _SQL_GENERATOR: Any | None = None
24
-
25
-
26
- @dataclass(frozen=True)
27
- class SqlGenerationRequest:
28
- question: str
29
- limit: int = 200
30
-
31
-
32
- def _get_sql_generator() -> Any:
33
- global _SQL_GENERATOR
34
-
35
- if _SQL_GENERATOR is None:
36
- from transformers import pipeline
37
-
38
- model_id = os.getenv("SQL_MODEL", "gaussalgo/T5-LM-Large-text2sql-spider")
39
- _SQL_GENERATOR = pipeline(
40
- task="text2text-generation",
41
- model=model_id,
42
- tokenizer=model_id,
43
- device=-1,
44
- )
45
-
46
- return _SQL_GENERATOR
47
-
48
-
49
- def _build_prompt(payload: SqlGenerationRequest) -> str:
50
- # gaussalgo/T5-LM-Large-text2sql-spider is trained on Spider benchmark.
51
- # Expected format: "Question: {q} | {compact_schema}"
52
- # where schema uses pipe-separator between tables and " : " between table name and columns.
53
- return f"Question: {payload.question} | {DEFAULT_DB_SCHEMA}"
54
-
55
-
56
- def _normalize_sql(raw_sql: str, limit: int) -> str:
57
- sql = (raw_sql or "").strip()
58
- if not sql:
59
- raise ValueError("SQL model returned an empty result.")
60
-
61
- if "```" in sql:
62
- parts = [part.strip() for part in sql.split("```") if part.strip()]
63
- sql = parts[-1]
64
-
65
- upper_sql = sql.upper()
66
- sql_start = upper_sql.find("SELECT")
67
- if sql_start == -1:
68
- raise ValueError("Generated SQL is not a SELECT query.")
69
-
70
- sql = sql[sql_start:]
71
- if ";" in sql:
72
- sql = sql.split(";", 1)[0].strip()
73
-
74
- upper_sql = sql.upper()
75
- forbidden = ("INSERT ", "UPDATE ", "DELETE ", "DROP ", "ALTER ", "PRAGMA ", "ATTACH ", "CREATE ", "REPLACE ")
76
- if any(keyword in upper_sql for keyword in forbidden):
77
- raise ValueError("Generated SQL contains forbidden statements.")
78
-
79
- if not upper_sql.startswith("SELECT "):
80
- raise ValueError("Only SELECT queries are allowed.")
81
-
82
- aggregate_markers = ("COUNT(", "SUM(", "AVG(", "MIN(", "MAX(")
83
- has_limit = " LIMIT " in upper_sql
84
- if not has_limit and not any(marker in upper_sql for marker in aggregate_markers):
85
- sql = f"{sql} LIMIT {limit}"
86
-
87
- return sql
88
-
89
-
90
- def generate_sql(question: str, limit: int = 200) -> str:
91
- clean_question = (question or "").strip()
92
- if not clean_question:
93
- raise ValueError("Field 'query' is required.")
94
-
95
- payload = SqlGenerationRequest(
96
- question=clean_question,
97
- limit=limit,
98
- )
99
-
100
- generator = _get_sql_generator()
101
- prompt = _build_prompt(payload)
102
- result = generator(
103
- prompt,
104
- max_new_tokens=512,
105
- do_sample=False,
106
- num_beams=4,
107
- truncation=True,
108
- )
109
-
110
- generated_text = result[0].get("generated_text", "") if result else ""
111
- return _normalize_sql(generated_text, limit=payload.limit)