Spaces:

brandonmusic
/

VerdictAI

Runtime error

App Files Community

brandonmusic committed on Aug 2

Commit

8137280

verified ·

1 Parent(s): 9a07481

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -187

app.py CHANGED Viewed

@@ -1,71 +1,39 @@
 # app.py
 # This is the updated main script. Copy-paste this over your existing app.py.
 # Changes:
-# - Switched from Gradio to Flask for serving the custom HTML+CSS+JS frontend.
-# - Added API endpoint /api/chat for handling user inputs (prompt, jurisdiction, IRAC mode, web search toggle, file).
-# - Serves index.html as the root page (you'll need to add index.html to your repo with the provided HTML code).
-# - Integrated file handling in API (extracts text and appends to prompt if needed).
-# - Forced task_type to "irac" if IRAC mode is enabled; otherwise, uses classify_prompt.
-# - Added web_search toggle handling.
-# - Updated ask_gpt41_mini to use the fine-tuned model ft:gpt-4.1-mini-2025-04-14:w-jeffrey-scott-psc:verdictaitrain:BysFkyX4.
-# - If the task is document_creation, routes directly to the fine-tuned GPT model.
-# - Retained all other logic, including RAG (semantic_search for CAP + municipal_search for municipal; now hybrid with BM25 for municipal).
-# - Note: Add 'bm25s' to your requirements.txt for hybrid search (pip install bm25s).
-# - Note: The SaulLM endpoint is kept as-is (likely 7B; if you want 141B, update SAUL_ENDPOINT to a new HF cloud endpoint for SaulLM-141B).
-# - Note: For full chat history, the frontend JS handles appending messages client-side (stateless backend).
-# - Updated route_model to use retrieve_context(prompt, task_type) instead of separate semantic_search/municipal_search.
-# - For document_creation/summaries, skip RAG (no retrieve_context call) to avoid slowdown.
-import gradio as gr  # Retained if needed, but not used for UI anymore
-from openai import OpenAI
-import requests
 import os
 import logging
 from datetime import datetime
 import pdfplumber
-from docx import Document  # Added for .docx support
 from googleapiclient.discovery import build
 import re
-from datasets import load_dataset, Dataset, load_from_disk
-from sentence_transformers import SentenceTransformer
-import torch
-import numpy as np
-import shutil
-import pyarrow.parquet as pq
-from huggingface_hub import hf_hub_download
-import pickle
-import faiss
-import threading
-import subprocess
 from task_processing import process_task_response
 from gpt_helpers import ask_gpt41_mini
-# New imports for split modules
-from retrieval import *
-from prompt_builder import *
-from post_processing import *
-# Flask imports
-from flask import Flask, request, jsonify, send_from_directory
-from werkzeug.utils import secure_filename
-# BM25 for hybrid search (add 'bm25s' to requirements.txt)
-from bm25s import BM25
-app_flask = Flask(__name__)  # Renamed to avoid conflict with 'app' variable
 os.environ["HF_HOME"] = "/data/.huggingface"
-# Add or update this section in script.py
-# Ensure this is placed after imports but before any dataset loading or function definitions
-from huggingface_hub import login
-# Load HF token for SaulLM endpoint and gated repos
 hf_token = os.environ.get("HF_TOKEN", "")
-if not hf_token:
-    logger.warning("HF_TOKEN not set; SaulLM endpoint may require authentication and gated repos may not be accessible.")
-# Authenticate for gated Hugging Face repos (e.g., for centroids download)
 if hf_token:
     login(hf_token)
     logger.info("Authenticated with Hugging Face token for gated repos.")
@@ -75,158 +43,75 @@ else:
 # Check environment variables
 try:
     OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "Missing")
-    GOOGLE_SEARCH_API = os.environ.get("GOOGLE_SEARCH_API", "Missing")  # This is now treated as CSE ID (cx)
-    GOOGLE_CUSTOM_SEARCH_API_KEY = os.environ.get("GOOGLE_CUSTOM_SEARCH_API_KEY", "Missing")  # New: API key (developerKey)
     if OPENAI_API_KEY == "Missing" or GOOGLE_CUSTOM_SEARCH_API_KEY == "Missing" or GOOGLE_SEARCH_API == "Missing":
         raise KeyError("API keys not set")
     logger.info(f"OpenAI API Key starts with: {OPENAI_API_KEY[:10]}...")
-    logger.info("API keys loaded successfully")
 except KeyError as e:
     logger.error(f"Missing environment variable: {str(e)}")
     raise EnvironmentError(f"Required secrets OPENAI_API_KEY, GOOGLE_CUSTOM_SEARCH_API_KEY, and GOOGLE_SEARCH_API must be set in Hugging Face Space Secrets")
-# Load HF token for SaulLM endpoint
-hf_token = os.environ.get("HF_TOKEN", "")
-if not hf_token:
-    logger.warning("HF_TOKEN not set; SaulLM endpoint may require authentication")
-import requests
 # Initialize OpenAI client
 openai_client = OpenAI(api_key=OPENAI_API_KEY)
 # SaulLM endpoint
 SAUL_ENDPOINT = "https://l4tuv4j9bu616t5x.us-east-1.aws.endpoints.huggingface.cloud"
-# Persistent storage path for dataset
-LOCAL_PATH = "/data/cap_dataset"
-dataset_info_path = os.path.join(LOCAL_PATH, 'dataset_info.json')
-if os.path.exists(dataset_info_path):
-    cap_dataset = load_from_disk(LOCAL_PATH)
-else:
-    try:
-        cap_dataset = load_dataset("TeraflopAI/Caselaw-Access-Project", split="train")
-        cap_dataset.save_to_disk(LOCAL_PATH)
-    except Exception as e:
-        logger.error(f"Dataset download/save failed: {str(e)}")
-        if os.path.exists(LOCAL_PATH):
-            shutil.rmtree(LOCAL_PATH)  # Clean up partial save
-        raise
-# Precompute CID to index mapping for CAP dataset
-cap_id_to_index = {doc['cid']: i for i, doc in enumerate(cap_dataset) if 'cid' in doc}
-# Preload some clusters in background (e.g., clusters 0-9)
-def preload_clusters():
-    for cluster_id in range(10):  # Adjust range as needed
-        try:
-            load_cluster_vectors(cluster_id, model="gte-Qwen2-1.5B-instruct")
-            logger.info(f"Preloaded cluster {cluster_id}")
-        except Exception as e:
-            logger.error(f"Preload failed for cluster {cluster_id}: {e}")
-threading.Thread(target=preload_clusters).start()
 # State dictionary for jurisdiction
 STATES = {
-    "AL": "Alabama",
-    "AK": "Alaska",
-    "AZ": "Arizona",
-    "AR": "Arkansas",
-    "CA": "California",
-    "CO": "Colorado",
-    "CT": "Connecticut",
-    "DE": "Delaware",
-    "FL": "Florida",
-    "GA": "Georgia",
-    "HI": "Hawaii",
-    "ID": "Idaho",
-    "IL": "Illinois",
-    "IN": "Indiana",
-    "IA": "Iowa",
-    "KS": "Kansas",
-    "KY": "Kentucky",
-    "LA": "Louisiana",
-    "ME": "Maine",
-    "MD": "Maryland",
-    "MA": "Massachusetts",
-    "MI": "Michigan",
-    "MN": "Minnesota",
-    "MS": "Mississippi",
-    "MO": "Missouri",
-    "MT": "Montana",
-    "NE": "Nebraska",
-    "NV": "Nevada",
-    "NH": "New Hampshire",
-    "NJ": "New Jersey",
-    "NM": "New Mexico",
-    "NY": "New York",
-    "NC": "North Carolina",
-    "ND": "North Dakota",
-    "OH": "Ohio",
-    "OK": "Oklahoma",
-    "OR": "Oregon",
-    "PA": "Pennsylvania",
-    "RI": "Rhode Island",
-    "SC": "South Carolina",
-    "SD": "South Dakota",
-    "TN": "Tennessee",
-    "TX": "Texas",
-    "UT": "Utah",
-    "VT": "Vermont",
-    "VA": "Virginia",
-    "WA": "Washington",
-    "WV": "West Virginia",
-    "WI": "Wisconsin",
-    "WY": "Wyoming",
-    "Federal": "Federal",
-    "All States": "All States",
-    "Other": "Other States"
 }
 def route_model(prompt, task_type, files=None, search_web=False, jurisdiction="KY"):
     logger.info(f"Routing prompt: {prompt}, Task: {task_type}, Web Search: {search_web}, Jurisdiction: {jurisdiction}")
     rag_context = ""
-    if task_type in ["case_law", "irac", "statute"]:  # Skip RAG for pure document_creation/summaries
-        combined_results = retrieve_context(prompt, task_type)
-        # Filter by jurisdiction if specified (e.g., "KY" for Kentucky)
-        if jurisdiction and jurisdiction != "All States":
-            state_name = STATES.get(jurisdiction, "")
-            state_code = jurisdiction  # e.g., "KY"
-            combined_results = [r for r in combined_results if state_code in r['citation'] or state_name in r['citation'] or state_code in r['name'] or state_name in r['name']]
-        if combined_results:
-            rag_context = "Retrieved legal authorities (case law and statutes):\n" + "\n".join([f"{i+1}. [{auth.get('source', 'Unknown')}] {auth['name']}, {auth['citation']}: \"{auth['snippet']}\"" for i, auth in enumerate(combined_results)])
-    prompt = f"User prompt: {prompt}\n\n{rag_context}"
     if task_type == "document_creation":
         # Route directly to fine-tuned GPT for document creation
         saul_response = ask_gpt41_mini(prompt, jurisdiction)
     else:
         try:
-            messages = build_saul_prompt(prompt, task_type, jurisdiction, rag_context)  # From prompt_builder
             saul_response = ask_saul(messages, task_type, jurisdiction)
         except Exception as e:
             logger.error(f"SaulLM failed: {e}. Falling back to GPT-4o.")
-            saul_response = ask_gpt4o(prompt)  # Fallback
-    # Task-specific processing (existing code)
     saul_response = process_task_response(task_type, saul_response, prompt, jurisdiction)
     if search_web:
         web_data = google_search(prompt, GOOGLE_CUSTOM_SEARCH_API_KEY, GOOGLE_SEARCH_API)
         saul_response = f"Google Search results: {web_data}\n{saul_response}"
     editor_prompt = build_editor_prompt(prompt, task_type, jurisdiction, saul_response, rag_context)
     final_response = ask_gpt4o(editor_prompt)
     final_response = ground_statutes(final_response, jurisdiction, GOOGLE_CUSTOM_SEARCH_API_KEY, GOOGLE_SEARCH_API, ask_gpt4o)
     return final_response
 def ask_saul(messages, task_type, jurisdiction):
@@ -234,10 +119,7 @@ def ask_saul(messages, task_type, jurisdiction):
         headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
         payload = {
             "messages": messages,
-            "parameters": {
-                "max_length": 32768,
-                "temperature": 0.3
-            }
         }
         logger.info(f"SaulLM payload: messages length={len(messages)}, max_length={payload['parameters']['max_length']}")
         response = requests.post(SAUL_ENDPOINT, headers=headers, json=payload)
@@ -249,10 +131,9 @@ def ask_saul(messages, task_type, jurisdiction):
             return result[0].get("generated_text", "[No response from SaulLM]")
         else:
             return result.get("generated_text", "[No response from SaulLM]")
     except Exception as e:
         logger.error(f"SaulLM error: {str(e)}")
-        raise  # Raise to catch in route_model for fallback
 def ask_gpt4o(prompt):
     try:
@@ -260,11 +141,9 @@ def ask_gpt4o(prompt):
         response = openai_client.chat.completions.create(
             model="gpt-4o",
             messages=[
-                {"role": "system", "content": (
-                    f"You are the final editor for a legal research assistant. {irac_system} "
-                    "Ensure high quote density from retrieved authorities and include relevant facts from the cited cases. "
-                    "Maintain accurate citations. Do not paraphrase legal holdings when direct quotes are available."
-                )},
                 {"role": "user", "content": prompt}
             ],
             temperature=0.3,
@@ -297,7 +176,7 @@ def extract_text_from_file(file_path):
 def classify_prompt(prompt):
     prompt_lower = prompt.lower()
     if "summarize" in prompt_lower:
-        return "document_analysis"  # Treat summarize as analysis for routing
     if any(k in prompt_lower for k in ["irac", "issue", "rule", "analysis", "conclusion", "brief", "memorandum", "memo"]):
         return "irac"
     elif any(k in prompt_lower for k in ["case", "precedent", "law"]):
@@ -335,10 +214,9 @@ def summarize_document(files):
         file = files[0]
         text = extract_text_from_file(file)
         if text:
-            summary = ask_gpt4o(f"Summarize the following document: {text[:10000]}")  # Limit to avoid token limits
             return f"Summary: {summary}"
-        return "No text extracted from file."
-    return "Please upload a file to summarize."
 def analyze_document(files):
     if files:
@@ -346,8 +224,7 @@ def analyze_document(files):
         if text:
             analysis = ask_gpt4o(f"Analyze the following document for legal issues, risks, or key clauses: {text[:10000]}")
             return f"Analysis: {analysis}"
-        return "No text extracted from file."
-    return "No file uploaded for analysis."
 def check_issues(files):
     if files:
@@ -355,8 +232,7 @@ def check_issues(files):
         if text:
             issues = ask_gpt4o(f"Check for red flags, unusual clauses, or potential issues in this legal document and highlight them: {text[:10000]}")
             return f"Highlighted Issues: {issues}"
-        return "No text extracted from file."
-    return "No file uploaded to check."
 # Flask routes
 @app_flask.route('/')
@@ -370,7 +246,6 @@ def api_chat():
     irac_mode = request.form.get('irac_mode', 'false') == 'true'
     search_web = request.form.get('web_search', 'false') == 'true'
     file = request.files.get('file')
     file_text = ""
     files = None
     if file:
@@ -378,7 +253,7 @@ def api_chat():
         temp_path = os.path.join('/tmp', filename)
         file.save(temp_path)
         file_text = extract_text_from_file(temp_path)
-        files = [temp_path]  # Pass as list for route_model
         os.remove(temp_path)
     task_type = classify_prompt(prompt)

 # app.py
 # This is the updated main script. Copy-paste this over your existing app.py.
 # Changes:
+# - Removed unused 'bm25s' import (replaced with rank_bm25 in retrieval.py).
+# - Integrated retrieve_context from retrieval.py, which now uses hybrid_cap_search with lazy loading.
+# - Added handling for missing CAP components, logging warnings and skipping RAG if caches are absent.
+# - Retained Flask for serving the custom HTML+CSS+JS frontend and API endpoint /api/chat.
+# - Kept file handling, IRAC mode, web search toggle, and task classification logic.
+# - Updated route_model to call retrieve_context only for case_law/irac/statute tasks, and only when SKIP_CAP_INIT is not "true" and the CAP components are available.
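+# - Note: Precompute CAP components with precompute_cap_embeddings.py before deployment; route_model skips RAG unless /data/cap_tfidf.pkl, /data/cap_tfidf_matrix.npz, /data/cap_gte.npy, and /data/cap_openai.npy all exist.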
+from flask import Flask, request, jsonify, send_from_directory
+from werkzeug.utils import secure_filename
 import os
 import logging
 from datetime import datetime
 import pdfplumber
+from docx import Document
 from googleapiclient.discovery import build
 import re
+from retrieval import retrieve_context, municipal_search  # Updated import
 from task_processing import process_task_response
 from gpt_helpers import ask_gpt41_mini
+from prompt_builder import build_saul_prompt, build_editor_prompt
+from post_processing import ground_statutes
+app_flask = Flask(__name__)
 os.environ["HF_HOME"] = "/data/.huggingface"
+# Logging setup
+logger = logging.getLogger("app")
+logging.basicConfig(level=logging.INFO)
+# Hugging Face authentication
+from huggingface_hub import login
 hf_token = os.environ.get("HF_TOKEN", "")
 if hf_token:
     login(hf_token)
     logger.info("Authenticated with Hugging Face token for gated repos.")
 # Check environment variables
 try:
     OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "Missing")
+    GOOGLE_SEARCH_API = os.environ.get("GOOGLE_SEARCH_API", "Missing")
+    GOOGLE_CUSTOM_SEARCH_API_KEY = os.environ.get("GOOGLE_CUSTOM_SEARCH_API_KEY", "Missing")
     if OPENAI_API_KEY == "Missing" or GOOGLE_CUSTOM_SEARCH_API_KEY == "Missing" or GOOGLE_SEARCH_API == "Missing":
         raise KeyError("API keys not set")
     logger.info(f"OpenAI API Key starts with: {OPENAI_API_KEY[:10]}...")
 except KeyError as e:
     logger.error(f"Missing environment variable: {str(e)}")
     raise EnvironmentError(f"Required secrets OPENAI_API_KEY, GOOGLE_CUSTOM_SEARCH_API_KEY, and GOOGLE_SEARCH_API must be set in Hugging Face Space Secrets")
 # Initialize OpenAI client
 openai_client = OpenAI(api_key=OPENAI_API_KEY)
 # SaulLM endpoint
 SAUL_ENDPOINT = "https://l4tuv4j9bu616t5x.us-east-1.aws.endpoints.huggingface.cloud"
 # State dictionary for jurisdiction
 STATES = {
+    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas", "CA": "California",
+    "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware", "FL": "Florida", "GA": "Georgia",
+    "HI": "Hawaii", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa",
+    "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland",
+    "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri",
+    "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey",
+    "NM": "New Mexico", "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio",
+    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina",
+    "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont",
+    "VA": "Virginia", "WA": "Washington", "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming",
+    "Federal": "Federal", "All States": "All States", "Other": "Other States"
 }
 def route_model(prompt, task_type, files=None, search_web=False, jurisdiction="KY"):
     logger.info(f"Routing prompt: {prompt}, Task: {task_type}, Web Search: {search_web}, Jurisdiction: {jurisdiction}")
     rag_context = ""
+    if task_type in ["case_law", "irac", "statute"] and not os.getenv("SKIP_CAP_INIT", "false").lower() == "true":
+        # Check if CAP components are available
+        if all(os.path.exists(f"/data/cap_{ext}") for ext in ["tfidf.pkl", "tfidf_matrix.npz", "gte.npy", "openai.npy"]):
+            combined_results = retrieve_context(prompt, task_type)
+            # Filter by jurisdiction if specified
+            if jurisdiction and jurisdiction != "All States":
+                state_name = STATES.get(jurisdiction, "")
+                state_code = jurisdiction
+                combined_results = [r for r in combined_results if any(s in (r.get('citation', '') + r.get('name', '')) for s in [state_code, state_name])]
+            if combined_results:
+                rag_context = "Retrieved legal authorities (case law and statutes):\n" + "\n".join(
+                    [f"{i+1}. [{auth.get('source', 'Unknown')}] {auth['name']}, {auth['citation']}: \"{auth['snippet']}\"" for i, auth in enumerate(combined_results)])
+                prompt = f"User prompt: {prompt}\n\n{rag_context}"
+        else:
+            logger.warning("CAP hybrid components missing. Precompute them with precompute_cap_embeddings.py. Skipping RAG.")
     if task_type == "document_creation":
         # Route directly to fine-tuned GPT for document creation
         saul_response = ask_gpt41_mini(prompt, jurisdiction)
     else:
         try:
+            messages = build_saul_prompt(prompt, task_type, jurisdiction, rag_context)
             saul_response = ask_saul(messages, task_type, jurisdiction)
         except Exception as e:
             logger.error(f"SaulLM failed: {e}. Falling back to GPT-4o.")
+            saul_response = ask_gpt4o(prompt)
+    # Task-specific processing
     saul_response = process_task_response(task_type, saul_response, prompt, jurisdiction)
     if search_web:
         web_data = google_search(prompt, GOOGLE_CUSTOM_SEARCH_API_KEY, GOOGLE_SEARCH_API)
         saul_response = f"Google Search results: {web_data}\n{saul_response}"
     editor_prompt = build_editor_prompt(prompt, task_type, jurisdiction, saul_response, rag_context)
     final_response = ask_gpt4o(editor_prompt)
     final_response = ground_statutes(final_response, jurisdiction, GOOGLE_CUSTOM_SEARCH_API_KEY, GOOGLE_SEARCH_API, ask_gpt4o)
     return final_response
 def ask_saul(messages, task_type, jurisdiction):
         headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
         payload = {
             "messages": messages,
+            "parameters": {"max_length": 32768, "temperature": 0.3}
         }
         logger.info(f"SaulLM payload: messages length={len(messages)}, max_length={payload['parameters']['max_length']}")
         response = requests.post(SAUL_ENDPOINT, headers=headers, json=payload)
             return result[0].get("generated_text", "[No response from SaulLM]")
         else:
             return result.get("generated_text", "[No response from SaulLM]")
     except Exception as e:
         logger.error(f"SaulLM error: {str(e)}")
+        raise
 def ask_gpt4o(prompt):
     try:
         response = openai_client.chat.completions.create(
             model="gpt-4o",
             messages=[
+                {"role": "system", "content": f"You are the final editor for a legal research assistant. {irac_system} "
+                                              "Ensure high quote density from retrieved authorities and include relevant facts from the cited cases. "
+                                              "Maintain accurate citations. Do not paraphrase legal holdings when direct quotes are available."},
                 {"role": "user", "content": prompt}
             ],
             temperature=0.3,
 def classify_prompt(prompt):
     prompt_lower = prompt.lower()
     if "summarize" in prompt_lower:
+        return "document_analysis"
     if any(k in prompt_lower for k in ["irac", "issue", "rule", "analysis", "conclusion", "brief", "memorandum", "memo"]):
         return "irac"
     elif any(k in prompt_lower for k in ["case", "precedent", "law"]):
         file = files[0]
         text = extract_text_from_file(file)
         if text:
+            summary = ask_gpt4o(f"Summarize the following document: {text[:10000]}")
             return f"Summary: {summary}"
+    return "No text extracted from file." if files else "Please upload a file to summarize."
 def analyze_document(files):
     if files:
         if text:
             analysis = ask_gpt4o(f"Analyze the following document for legal issues, risks, or key clauses: {text[:10000]}")
             return f"Analysis: {analysis}"
+    return "No text extracted from file." if files else "No file uploaded for analysis."
 def check_issues(files):
     if files:
         if text:
             issues = ask_gpt4o(f"Check for red flags, unusual clauses, or potential issues in this legal document and highlight them: {text[:10000]}")
             return f"Highlighted Issues: {issues}"
+    return "No text extracted from file." if files else "No file uploaded to check."
 # Flask routes
 @app_flask.route('/')
     irac_mode = request.form.get('irac_mode', 'false') == 'true'
     search_web = request.form.get('web_search', 'false') == 'true'
     file = request.files.get('file')
     file_text = ""
     files = None
     if file:
         temp_path = os.path.join('/tmp', filename)
         file.save(temp_path)
         file_text = extract_text_from_file(temp_path)
+        files = [temp_path]
         os.remove(temp_path)
     task_type = classify_prompt(prompt)