brandonmusic committed (verified)
Commit a4e3c6d · Parent(s): 34b551a

Update retrieval.py

Files changed (1)
  1. retrieval.py +35 -39
retrieval.py CHANGED
@@ -1,9 +1,3 @@
-
-This updated `app.py` script includes the jurisdiction mapping in `route_model` (e.g., defaults to "KY" if not specified, maps to court codes like 'ky kyctapp'). It's fully copy-pastable—replace your existing file.
-
-### Updated retrieval.py Script
-
-```python
 # retrieval.py
 # Uncommented all sections for full functionality.
 # Removed duplicated Flask code at the end (copy-paste error).
@@ -13,12 +7,14 @@ This updated `app.py` script includes the jurisdiction mapping in `route_model`
 # Integrated google_search.
 import os
 import logging
-import requests # Lightweight, keep at top
-import pickle # Lightweight
+import requests # Lightweight, keep at top
+import pickle # Lightweight
 import shutil
 from huggingface_hub import hf_hub_download, snapshot_download
-from openai import OpenAI # Client init is fast, but usage in functions
-import time # For sleep after download
+from openai import OpenAI # Client init is fast, but usage in functions
+import time # For sleep after download
+import re
+import datetime
 # Logging setup (lightweight)
 logger = logging.getLogger("retrieval")
 logging.basicConfig(level=logging.INFO)
@@ -26,23 +22,23 @@ logging.basicConfig(level=logging.INFO)
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 openai_client = OpenAI(api_key=OPENAI_API_KEY)
 GOOGLE_CUSTOM_SEARCH_API_KEY = os.environ.get("GOOGLE_CUSTOM_SEARCH_API_KEY", "Missing")
-GOOGLE_SEARCH_API = os.environ.get("GOOGLE_SEARCH_API", "Missing") # CSE ID
+GOOGLE_SEARCH_API = os.environ.get("GOOGLE_SEARCH_API", "Missing") # CSE ID
 hf_token = os.environ.get("HF_TOKEN", "")
 COURT_LISTENER_API_KEY = os.environ.get("Court_Listener_API", "Missing") # Updated to match HF secret name
 # Lazy placeholders
 centroid_vectors = None
 encoder = None
-municipal_encoder = None # Separate for potential dim mismatch
+municipal_encoder = None # Separate for potential dim mismatch
 _cluster_cache = {}
 municipal_faiss_index = None
-cap_faiss_index = None # New for CAP FAISS
+cap_faiss_index = None # New for CAP FAISS
 municipal_metadata = None
 municipal_texts = None
 bm25_municipal = None
 # Lazy-load CAP dataset
 def get_cap_dataset():
     if not hasattr(get_cap_dataset, 'dataset') or get_cap_dataset.dataset is None:
-        from datasets import load_from_disk # Lazy import
+        from datasets import load_from_disk # Lazy import
         LOCAL_PATH = "/data/cap_dataset"
         if os.path.exists(os.path.join(LOCAL_PATH, 'dataset_info.json')):
             try:
@@ -72,7 +68,7 @@ def load_encoder():
     global encoder
     if encoder is not None:
         return
-    from sentence_transformers import SentenceTransformer # Lazy import
+    from sentence_transformers import SentenceTransformer # Lazy import
     encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
     logger.info("🚀 Lazy-loaded CAP Encoder: SentenceTransformer (all-mpnet-base-v2 for 768 dim match)")
     logger.info(f"CAP encoder dimension: {encoder.get_sentence_embedding_dimension()}")
@@ -80,7 +76,7 @@ def load_municipal_encoder():
     global municipal_encoder
     if municipal_encoder is not None:
         return
-    from sentence_transformers import SentenceTransformer # Lazy import
+    from sentence_transformers import SentenceTransformer # Lazy import
     municipal_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
     logger.info("🚀 Lazy-loaded Municipal Encoder: SentenceTransformer (all-MiniLM-L6-v2 for 384 dim match)")
     logger.info(f"Municipal encoder dimension: {municipal_encoder.get_sentence_embedding_dimension()}")
@@ -88,7 +84,7 @@ def load_cap_faiss_index():
     global cap_faiss_index
     if cap_faiss_index is not None:
         return
-    import faiss # Lazy import
+    import faiss # Lazy import
     cap_index_path = "/data/knn.index"
     if not os.path.exists(cap_index_path):
         try:
@@ -96,7 +92,7 @@ def load_cap_faiss_index():
             logger.info("✅ Downloaded missing CAP FAISS index from HF.")
         except Exception as e:
             logger.error(f"❌ Failed to download CAP FAISS index: {str(e)}. CAP semantic search disabled.")
-            cap_faiss_index = "loaded" # Marker to avoid reload
+            cap_faiss_index = "loaded" # Marker to avoid reload
             return
     try:
         cap_faiss_index = faiss.read_index(cap_index_path)
@@ -104,12 +100,12 @@ def load_cap_faiss_index():
         logger.info(f"CAP FAISS index dimension: {cap_faiss_index.d}")
     except Exception as e:
         logger.error(f"❌ Failed to load CAP FAISS index: {str(e)}. CAP semantic search disabled.")
-        cap_faiss_index = "loaded" # Marker
+        cap_faiss_index = "loaded" # Marker
 def load_municipal_faiss_index():
     global municipal_faiss_index
     if municipal_faiss_index is not None:
         return
-    import faiss # Lazy import
+    import faiss # Lazy import
     municipal_index_path = "/data/municipal.index"
     if os.path.exists(municipal_index_path):
         municipal_faiss_index = faiss.read_index(municipal_index_path)
@@ -117,7 +113,7 @@ def load_municipal_faiss_index():
         logger.info(f"Municipal FAISS index dimension: {municipal_faiss_index.d}")
     else:
         logger.error("municipal.index not found in /data. Hybrid search for municipal data disabled.")
-        municipal_faiss_index = "loaded" # Marker to avoid reload
+        municipal_faiss_index = "loaded" # Marker to avoid reload
 def load_municipal_metadata():
     global municipal_metadata
     if municipal_metadata is not None:
@@ -146,14 +142,14 @@ def load_bm25_municipal():
     global bm25_municipal
     if bm25_municipal is not None:
         return
-    from rank_bm25 import BM25Okapi # Lazy import
+    from rank_bm25 import BM25Okapi # Lazy import
     bm25_municipal_path = "/data/bm25_municipal.pkl"
     if os.path.exists(bm25_municipal_path):
         with open(bm25_municipal_path, 'rb') as f:
             bm25_municipal = pickle.load(f)
         logger.info("✅ Lazy-loaded cached BM25 for municipal hybrid search.")
     else:
-        load_municipal_texts() # Ensure texts loaded
+        load_municipal_texts() # Ensure texts loaded
         if not municipal_texts:
             logger.error("Cannot build BM25 index because municipal texts are not loaded.")
             bm25_municipal = "build_failed"
@@ -164,18 +160,18 @@ def load_bm25_municipal():
             pickle.dump(bm25_municipal, f)
         logger.info("✅ Built and cached BM25 for municipal hybrid search.")
 def semantic_search(query, top_k=5, min_score=0.1):
-    import numpy as np # Lazy import
-    from sklearn.feature_extraction.text import TfidfVectorizer # Lazy import
-    from sklearn.metrics.pairwise import cosine_similarity # Lazy import
+    import numpy as np # Lazy import
+    from sklearn.feature_extraction.text import TfidfVectorizer # Lazy import
+    from sklearn.metrics.pairwise import cosine_similarity # Lazy import
     logger.info(f"Search query sent to FAISS (CAP): {query}")
     load_cap_faiss_index()
-    if cap_faiss_index == "loaded": # Marker for failed load
+    if cap_faiss_index == "loaded": # Marker for failed load
         logger.warning("CAP FAISS index not available. Returning empty results.")
         return []
     load_encoder()
     query_vec = encoder.encode(query, normalize_embeddings=True)
     query_vec = np.array(query_vec).astype('float32').reshape(1, -1)
-    import faiss # Ensure imported
+    import faiss # Ensure imported
     try:
         if query_vec.shape[1] != cap_faiss_index.d:
             raise AssertionError(f"Dimension mismatch: query {query_vec.shape[1]} != index {cap_faiss_index.d}")
@@ -218,7 +214,7 @@ def semantic_search(query, top_k=5, min_score=0.1):
     logger.info(f"FAISS (CAP) returned {len(results)} docs")
     return [{k: v for k, v in r.items() if k != 'score'} for r in results]
 def municipal_search(query, top_k=5, min_score=0.1):
-    import numpy as np # Lazy import
+    import numpy as np # Lazy import
     load_municipal_faiss_index()
     load_municipal_encoder()
     load_bm25_municipal()
@@ -279,23 +275,23 @@ def municipal_search(query, top_k=5, min_score=0.1):
     return [{k: v for k, v in r.items() if k != 'score'} for r in results[:top_k]]
 def retrieve_context(original_prompt, task_type, jurisdiction="ky"):
     query = query_rewrite(original_prompt, task_type)
-
+
     cap_results = semantic_search(query)
     municipal_results = municipal_search(query)
-
+
     combined_results = cap_results + municipal_results
-
+
     if not combined_results:
         logger.warning(f"No context found for query: {query} (task: {task_type}) — attempting web fallback.")
-        fallback_query = f"{query} site:law.cornell.edu OR site:justia.com OR site:findlaw.com"
+        fallback_query = f"{query} site:law.cornell.edu OR site:justia.com OR site:findlaw.com OR site:findlaw.com"
         web_data = google_search(fallback_query, GOOGLE_CUSTOM_SEARCH_API_KEY, GOOGLE_SEARCH_API)
         if web_data != "No search results found.":
             combined_results = [{"source": "Web", "name": "Web Fallback", "citation": "Various Sources", "snippet": web_data[:700]}]
-
+
     # Added: Call CourtListener for case_law or irac tasks and append to combined_results
     if task_type in ["case_law", "irac"] and COURT_LISTENER_API_KEY != "Missing":
         logger.info("Calling CourtListener API...")
-        courtlistener_results = search_courtlistener(query, jurisdiction.lower(), '2021-01-01', '2025-08-11')
+        courtlistener_results = search_courtlistener(query, jurisdiction.lower(), '2021-01-01', datetime.datetime.today().date().isoformat())
         if courtlistener_results and 'results' in courtlistener_results:
             logger.info(f"CourtListener returned {len(courtlistener_results['results'])} results")
             for result in courtlistener_results['results']:
@@ -323,7 +319,7 @@ def query_rewrite(original_prompt, task_type):
             temperature=0.3,
             max_tokens=50
         )
-        rewritten = response.choices[0].message.content.strip().replace('"', '') # Stripped quotes per Gemini
+        rewritten = response.choices[0].message.content.strip().replace('"', '') # Stripped quotes per Gemini
         logger.info(f"Original prompt: {original_prompt[:100]}... -> Rewritten query: {rewritten}")
         return rewritten
     except Exception as e:
@@ -335,7 +331,7 @@ def google_search(query, GOOGLE_CUSTOM_SEARCH_API_KEY, GOOGLE_SEARCH_API):
         return "Google Custom Search API key not set."
     if GOOGLE_SEARCH_API == "Missing":
         return "Google CSE ID not set."
-    from googleapiclient.discovery import build # Lazy import
+    from googleapiclient.discovery import build # Lazy import
     service = build("customsearch", "v1", developerKey=GOOGLE_CUSTOM_SEARCH_API_KEY, cache_discovery=False)
     res = service.cse().list(q=query, cx=GOOGLE_SEARCH_API).execute()
     if "items" in res:
@@ -357,7 +353,7 @@ def ground_statutes(response, jurisdiction, GOOGLE_CUSTOM_SEARCH_API_KEY, GOOGLE
     # In practice, parse response for statute mentions, search, and replace/inject quotes
     try:
         # Example: Find statute mentions and ground
-        statute_mentions = re.findall(r'KRS \d+\.\d+', response) # Simple regex for KRS
+        statute_mentions = re.findall(r'KRS \d+\.\d+', response) # Simple regex for KRS
         if statute_mentions:
             for stat in statute_mentions:
                 search_result = google_search(f"{stat} {jurisdiction} statute text", GOOGLE_CUSTOM_SEARCH_API_KEY, GOOGLE_SEARCH_API)
@@ -368,7 +364,7 @@ def ground_statutes(response, jurisdiction, GOOGLE_CUSTOM_SEARCH_API_KEY, GOOGLE
         logger.error(f"Grounding error: {str(e)}")
         return response
 # New function for CourtListener search (added)
-def search_courtlistener(query, jurisdiction='ky', date_min='2021-01-01', date_max='2025-08-11'):
+def search_courtlistener(query, jurisdiction='ky', date_min='2021-01-01', date_max=datetime.datetime.today().date().isoformat()):
     """
     Searches CourtListener for cases matching the query.
     Returns JSON data for RAG processing.
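A note on the lazy loaders above: they reuse the string "loaded" as a failure marker, so `semantic_search` tests `cap_faiss_index == "loaded"` to detect a failed load. That works, since a real FAISS index never compares equal to a string, but a dedicated sentinel keeps "not yet loaded" (None) and "load failed" visibly distinct. A minimal sketch of that pattern (a hypothetical refactor, not what this commit does):

```python
import faiss  # imported eagerly here for brevity; the commit imports it lazily

_LOAD_FAILED = object()  # unique sentinel instead of the string "loaded"

cap_faiss_index = None  # None = not yet loaded; _LOAD_FAILED = gave up

def load_cap_faiss_index():
    global cap_faiss_index
    if cap_faiss_index is not None:
        return  # already loaded, or already failed once
    try:
        cap_faiss_index = faiss.read_index("/data/knn.index")
    except Exception:
        cap_faiss_index = _LOAD_FAILED  # cache the failure, skip retries

def cap_index_available():
    return cap_faiss_index is not None and cap_faiss_index is not _LOAD_FAILED
```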
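One caveat with the new `date_max` default in `search_courtlistener` (last hunk): Python evaluates default arguments once, at import time, so `datetime.datetime.today().date().isoformat()` freezes to the day the module was loaded. The call in `retrieve_context` passes the date explicitly and therefore recomputes it per request, but any caller relying on the default would see a stale date in a long-running Space. The usual fix is a `None` default resolved at call time; a minimal sketch, assuming the rest of the function stays unchanged:

```python
import datetime

def search_courtlistener(query, jurisdiction='ky',
                         date_min='2021-01-01', date_max=None):
    if date_max is None:
        # Resolved on every call; an isoformat() expression in the signature
        # would be evaluated only once, when the module is imported.
        date_max = datetime.date.today().isoformat()
    ...  # rest of the function body unchanged
```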
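The body of `search_courtlistener` is cut off in this view. For orientation only, here is a rough sketch of how such a lookup is typically wired to CourtListener's REST search API; the endpoint, the parameter names (`court`, `filed_after`, `filed_before`), and the token auth scheme are assumptions to verify against the real file, not details taken from this commit:

```python
import os
import requests

COURT_LISTENER_API_KEY = os.environ.get("Court_Listener_API", "Missing")

def search_courtlistener_sketch(query, jurisdiction='ky',
                                date_min='2021-01-01', date_max='2025-08-11'):
    # Assumed endpoint and filters; check the actual implementation.
    resp = requests.get(
        "https://www.courtlistener.com/api/rest/v3/search/",
        params={
            "q": query,
            "court": jurisdiction,       # e.g. 'ky' or 'kyctapp'
            "filed_after": date_min,
            "filed_before": date_max,
        },
        headers={"Authorization": f"Token {COURT_LISTENER_API_KEY}"},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()  # retrieve_context expects a dict with a 'results' key
```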
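Finally, a quick usage sketch of the retrieval entry point, mirroring how `app.py` presumably drives it; the prompt text is illustrative and the shape of the return value is not shown in this diff:

```python
from retrieval import retrieve_context

# task_type "case_law" (or "irac") also triggers the CourtListener branch.
context = retrieve_context(
    "Kentucky security deposit return obligations for landlords",
    task_type="case_law",
    jurisdiction="ky",
)
print(context)
```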