Spaces:

Kalpokoch
/

ChatbotDemo

Running

App Files Files

Kalpokoch committed on Jul 28

Commit

a6eea30

verified ·

1 Parent(s): 95eb732

Update app/policy_vector_db.py

Browse files

Files changed (1) hide show

app/policy_vector_db.py +28 -59

app/policy_vector_db.py CHANGED Viewed

@@ -1,113 +1,82 @@
 import os
 import json
-import shutil
-import logging
 from typing import List, Dict
-import chromadb
 from sentence_transformers import SentenceTransformer
-import torch
-logger = logging.getLogger("vector-db")
 class PolicyVectorDB:
-    def __init__(self, persist_directory: str, top_k_default: int = 5, relevance_threshold: float = 0.65):
         self.persist_directory = persist_directory
         self.collection_name = "neepco_dop_policies"
         self.top_k_default = top_k_default
         self.relevance_threshold = relevance_threshold
-        self.client = chromadb.PersistentClient(path=self.persist_directory)
-        self.collection = None
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5", device=device)
-        logger.info(f"[INIT] Embedding model loaded on {device.upper()}.")
     def _get_collection(self):
         if self.collection is None:
             self.collection = self.client.get_or_create_collection(
                 name=self.collection_name,
                 metadata={"description": "NEEPCO Delegation of Powers Policy"}
             )
-            logger.info(f"[COLLECTION] Loaded collection '{self.collection_name}'. Count: {self.collection.count()}")
         return self.collection
     def _flatten_metadata(self, metadata: Dict) -> Dict:
-        return {k: str(v) for k, v in metadata.items()}
     def add_chunks(self, chunks: List[Dict]):
         collection = self._get_collection()
         if not chunks:
-            logger.warning("[ADD] No chunks to add.")
             return
         existing_ids = set(collection.get()['ids'])
-        new_chunks = [c for c in chunks if c['id'] not in existing_ids]
         if not new_chunks:
-            logger.info("[ADD] All chunks already exist in DB.")
             return
-        logger.info(f"[ADD] Adding {len(new_chunks)} new chunks.")
         batch_size = 128
         for i in range(0, len(new_chunks), batch_size):
             batch = new_chunks[i:i + batch_size]
-            texts = [c['text'] for c in batch]
-            ids = [c['id'] for c in batch]
-            metadatas = [self._flatten_metadata(c['metadata']) for c in batch]
             embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
             collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
-        logger.info(f"[ADD] Total docs after insert: {collection.count()}")
     def search(self, query_text: str, top_k: int = None) -> List[Dict]:
         collection = self._get_collection()
-        top_k = top_k or self.top_k_default
         query_embedding = self.embedding_model.encode([query_text]).tolist()
         results = collection.query(
             query_embeddings=query_embedding,
             n_results=top_k,
             include=["documents", "metadatas", "distances"]
         )
         search_results = []
-        if not results.get("documents"):
-            logger.warning("[SEARCH] No documents found.")
-            return []
-        for i, doc in enumerate(results["documents"][0]):
-            score = 1 - results["distances"][0][i]
             search_results.append({
-                "text": doc,
-                "metadata": results["metadatas"][0][i],
-                "relevance_score": round(score, 4)
             })
-        logger.info(f"[SEARCH] Retrieved {len(search_results)} results for query: {query_text}")
         return search_results
-def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str) -> bool:
-    logger.info("[POPULATE] Checking vector DB...")
     try:
         if db_instance._get_collection().count() == 0:
             if not os.path.exists(chunks_file_path):
-                logger.error(f"[ERROR] Chunks file not found at {chunks_file_path}")
                 return False
-            with open(chunks_file_path, "r", encoding="utf-8") as f:
-                chunks = json.load(f)
-            logger.info(f"[POPULATE] Loaded {len(chunks)} chunks. Populating DB...")
-            db_instance.add_chunks(chunks)
-            logger.info("[POPULATE] DB population complete.")
         else:
-            logger.info("[POPULATE] DB already populated.")
-        return True
     except Exception as e:
-        logger.exception(f"[EXCEPTION] During DB population: {str(e)}")
         return False

 import os
 import json
+import torch
 from typing import List, Dict
 from sentence_transformers import SentenceTransformer
+import chromadb
+from chromadb.config import Settings
 class PolicyVectorDB:
     """Persistent Chroma vector store for NEEPCO Delegation of Powers policy chunks.

     Chunks are embedded with the ``BAAI/bge-large-en-v1.5`` sentence-transformer
     model and kept in the ``neepco_dop_policies`` collection stored under
     ``persist_directory``.
     """

     def __init__(self, persist_directory: str, top_k_default: int = 5, relevance_threshold: float = 0.5):
         """Open the persistent client and load the embedding model.

         Args:
             persist_directory: Directory where Chroma keeps its on-disk data.
             top_k_default: Number of results ``search`` returns when the
                 caller does not pass ``top_k``.
             relevance_threshold: Stored on the instance for callers; this
                 class does not apply it when searching.
         """
         self.persist_directory = persist_directory
         self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
         self.collection_name = "neepco_dop_policies"
         # Run the encoder on the GPU when torch reports one, otherwise on CPU.
         self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cuda' if torch.cuda.is_available() else 'cpu')
         # Resolved lazily by _get_collection().
         self.collection = None
         self.top_k_default = top_k_default
         self.relevance_threshold = relevance_threshold

     def _get_collection(self):
         """Return the policy collection, creating or loading it on first use."""
         if self.collection is None:
             self.collection = self.client.get_or_create_collection(
                 name=self.collection_name,
                 metadata={"description": "NEEPCO Delegation of Powers Policy"}
             )
         return self.collection

     def _flatten_metadata(self, metadata: Dict) -> Dict:
         """Return a copy of ``metadata`` with every value converted via ``str``."""
         return {key: str(value) for key, value in metadata.items()}

     def add_chunks(self, chunks: List[Dict]):
         """Embed and insert the chunks whose ids are not yet in the collection.

         Each chunk is a dict with ``'id'``, ``'text'`` and ``'metadata'`` keys.
         Chunks without an ``'id'``, chunks already stored, and repeated ids
         within ``chunks`` (first occurrence wins) are skipped instead of
         aborting the insert part-way through.

         Args:
             chunks: Chunk dicts to add; an empty list is a no-op.
         """
         collection = self._get_collection()
         if not chunks:
             print("No chunks provided to add.")
             return

         # include=[] asks Chroma for the ids alone rather than every stored
         # document and metadata record.
         seen_ids = set(collection.get(include=[])['ids'])
         new_chunks = []
         missing_id_count = 0
         for chunk in chunks:
             chunk_id = chunk.get('id')
             if chunk_id is None:
                 missing_id_count += 1
                 continue
             if chunk_id in seen_ids:
                 continue
             # Track ids as we go so a duplicate later in the input is dropped.
             seen_ids.add(chunk_id)
             new_chunks.append(chunk)

         if missing_id_count:
             print(f"Skipped {missing_id_count} chunk(s) without an 'id'.")
         if not new_chunks:
             print("No new chunks to add.")
             return

         batch_size = 128
         for i in range(0, len(new_chunks), batch_size):
             batch = new_chunks[i:i + batch_size]
             texts = [chunk['text'] for chunk in batch]
             ids = [chunk['id'] for chunk in batch]
             metadatas = [self._flatten_metadata(chunk['metadata']) for chunk in batch]
             embeddings = self.embedding_model.encode(texts, show_progress_bar=False).tolist()
             collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)

     def search(self, query_text: str, top_k: int = None) -> List[Dict]:
         """Return the stored chunks nearest to ``query_text``.

         Args:
             query_text: Natural-language query to embed and look up.
             top_k: Maximum number of hits; falls back to ``top_k_default``
                 when omitted or falsy.

         Returns:
             A list of dicts with ``'text'``, ``'metadata'`` and
             ``'relevance_score'`` (``1 - distance``), in the order Chroma
             returned them. Empty when the query yields no documents.
         """
         collection = self._get_collection()
         query_embedding = self.embedding_model.encode([query_text]).tolist()
         top_k = top_k if top_k else self.top_k_default
         results = collection.query(
             query_embeddings=query_embedding,
             n_results=top_k,
             include=["documents", "metadatas", "distances"]
         )

         documents = results.get('documents')
         if not documents or not documents[0]:
             return []

         # NOTE(review): ``1 - distance`` reads as a similarity only for a
         # cosine-distance collection; the collection is created without an
         # explicit distance setting, so confirm which metric is in effect.
         return [
             {
                 'text': doc,
                 'metadata': metadata,
                 'relevance_score': 1 - distance
             }
             for doc, metadata, distance in zip(
                 documents[0], results['metadatas'][0], results['distances'][0]
             )
         ]
def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str):
    """Seed the vector DB from a JSON chunks file when its collection is empty.

    Args:
        db_instance: Vector DB whose collection is checked and, if empty, filled.
        chunks_file_path: Path to a UTF-8 JSON file holding the chunks to add.

    Returns:
        True when the collection already had entries or was populated from
        the file; False when the file is missing or any step raised.
    """
    try:
        # Nothing to do once the collection holds at least one entry.
        if db_instance._get_collection().count() != 0:
            return True

        if not os.path.exists(chunks_file_path):
            print(f"Chunks file not found at {chunks_file_path}")
            return False

        with open(chunks_file_path, 'r', encoding='utf-8') as handle:
            loaded_chunks = json.load(handle)
        db_instance.add_chunks(loaded_chunks)
    except Exception as err:
        # Any failure is reported and turned into a False result, never raised.
        print(f"DB Population Error: {err}")
        return False
    return True