veerukhannan committed on
Commit
6a1ad16
·
verified ·
1 Parent(s): 6702977

Create test_embeddings.py

Browse files
Files changed (1) hide show
  1. test_embeddings.py +76 -0
test_embeddings.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chromadb
3
+ from sentence_transformers import SentenceTransformer
4
+ from loguru import logger
5
+
6
class SentenceTransformerEmbeddings:
    """Callable wrapper that embeds texts with a SentenceTransformer model.

    Calling an instance with a list of strings returns one embedding per
    string, converted to plain Python lists of floats.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the SentenceTransformer model identified by *model_name*."""
        self.model = SentenceTransformer(model_name)

    # NOTE(review): the parameter is named ``input`` (shadowing the builtin),
    # presumably because ChromaDB's embedding-function interface expects that
    # exact name -- confirm before renaming.
    def __call__(self, input: list[str]) -> list[list[float]]:
        """Encode *input* and return the embeddings as nested lists."""
        embeddings = self.model.encode(input)
        # ``tolist()`` turns the encoder's array result into plain lists.
        return embeddings.tolist()
13
+
14
def test_chromadb_content():
    """Check that the local ChromaDB store holds queryable legal documents.

    Looks for a ``chroma_db`` directory next to this file, opens it with a
    persistent ChromaDB client, and verifies that the ``legal_documents``
    collection exists, is non-empty, and returns at least one document for a
    sample query. A short preview of each returned document is logged.

    Returns:
        bool: True when every check passes; False when a check fails or when
        any exception is raised (the exception is logged, not propagated).
    """
    try:
        # Set up ChromaDB path
        base_path = os.path.dirname(os.path.abspath(__file__))
        chroma_path = os.path.join(base_path, 'chroma_db')

        if not os.path.exists(chroma_path):
            logger.error(f"ChromaDB directory not found at {chroma_path}")
            return False

        # Initialize ChromaDB
        chroma_client = chromadb.PersistentClient(path=chroma_path)

        # Check if collection exists. Depending on the chromadb release,
        # list_collections() yields either collection objects or bare names,
        # so accept both instead of assuming a ``.name`` attribute.
        collection_names = {
            getattr(col, "name", col)
            for col in chroma_client.list_collections()
        }
        if "legal_documents" not in collection_names:
            logger.error("Legal documents collection not found in ChromaDB")
            return False

        # Get collection
        collection = chroma_client.get_collection(
            name="legal_documents",
            embedding_function=SentenceTransformerEmbeddings()
        )

        # Check collection size
        count = collection.count()
        if count == 0:
            logger.error("Collection is empty")
            return False

        logger.info(f"Found {count} documents in ChromaDB")

        # Test query to verify content
        test_results = collection.query(
            query_texts=["What are the general provisions?"],
            n_results=1
        )

        # Results are grouped per query text, so an empty hit list for our
        # single query shows up as [[]] -- look inside the first group
        # rather than testing the outer list for truthiness.
        document_groups = test_results['documents']
        documents = document_groups[0] if document_groups else []
        if not documents:
            logger.error("Test query returned no results")
            return False

        metadata_groups = test_results.get('metadatas')
        metadatas = metadata_groups[0] if metadata_groups else []

        # Print sample content
        logger.info("Sample content from ChromaDB:")
        for i, doc in enumerate(documents):
            # Metadata may be absent, None, or lack a title; that should not
            # turn an otherwise healthy store into a reported failure.
            metadata = metadatas[i] if i < len(metadatas) else None
            title = (metadata or {}).get('title', 'N/A')
            logger.info(f"\nDocument {i+1}:")
            logger.info(f"Title: {title}")
            logger.info(f"Content preview: {(doc or '')[:200]}...")

        return True

    except Exception as e:
        # Top-level boundary for this check: report the problem and signal
        # failure through the return value instead of raising.
        logger.error(f"Error testing ChromaDB: {e}")
        return False
70
+
71
if __name__ == "__main__":
    # Run the verification when executed as a script and report the outcome.
    success = test_chromadb_content()
    if success:
        print("ChromaDB content verification successful")
    else:
        print("ChromaDB content verification failed")
    # Reflect the outcome in the exit status so shells and CI can detect a
    # failed verification (0 on success, 1 on failure).
    raise SystemExit(0 if success else 1)