Spaces:
Sleeping
Sleeping
Create test_embeddings.py
Browse files- test_embeddings.py +76 -0
test_embeddings.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import chromadb
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
from loguru import logger
|
5 |
+
|
6 |
+
class SentenceTransformerEmbeddings:
    """Callable embedding function backed by a sentence-transformers model.

    An instance is called with a list of strings and returns one embedding
    (a list of floats) per string. In this file it is handed to ChromaDB
    through the ``embedding_function=`` argument of ``get_collection``.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence-transformers model identified by *model_name*."""
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: list[str]) -> list[list[float]]:
        """Return one embedding vector per string in *input*.

        Args:
            input: The texts to embed. The name shadows the builtin on
                purpose: it is part of the call signature, so it is kept.
                NOTE(review): ChromaDB appears to check this parameter
                name on embedding functions; confirm before renaming.

        Returns:
            The value of ``self.model.encode(input).tolist()``, or ``[]``
            when *input* is an empty list or tuple.
        """
        # Nothing to embed: answer directly instead of invoking the model.
        # Restricted to list/tuple so any other argument still reaches
        # encode() exactly as before.
        if isinstance(input, (list, tuple)) and not input:
            return []
        embeddings = self.model.encode(input)
        return embeddings.tolist()
|
13 |
+
|
14 |
+
def test_chromadb_content():
    """Check that the local ChromaDB store holds queryable legal documents.

    Looks for a ``chroma_db`` directory next to this file, opens it with a
    persistent client, and verifies that the ``legal_documents`` collection
    exists, is non-empty, and returns at least one hit for a sample query.
    The outcome of every step is logged, and a preview of each hit is
    written to the log.

    Returns:
        bool: True when every check passes; False when a check fails or
        any exception is raised along the way (the exception is logged
        with its traceback, not propagated).
    """
    collection_name = "legal_documents"
    try:
        # Set up ChromaDB path
        base_path = os.path.dirname(os.path.abspath(__file__))
        chroma_path = os.path.join(base_path, 'chroma_db')

        if not os.path.exists(chroma_path):
            logger.error(f"ChromaDB directory not found at {chroma_path}")
            return False

        # Initialize ChromaDB
        chroma_client = chromadb.PersistentClient(path=chroma_path)

        # Check if collection exists. Depending on the chromadb release,
        # list_collections() yields Collection objects or plain name
        # strings, so accept either form.
        available = {
            getattr(col, "name", col)
            for col in chroma_client.list_collections()
        }
        if collection_name not in available:
            logger.error("Legal documents collection not found in ChromaDB")
            return False

        # Get collection
        collection = chroma_client.get_collection(
            name=collection_name,
            embedding_function=SentenceTransformerEmbeddings()
        )

        # Check collection size
        count = collection.count()
        if count == 0:
            logger.error("Collection is empty")
            return False

        logger.info(f"Found {count} documents in ChromaDB")

        # Test query to verify content
        test_results = collection.query(
            query_texts=["What are the general provisions?"],
            n_results=1
        )

        # Query results are nested one list per query text, so "no hits"
        # arrives as [[]], which is truthy. Inspect the inner list for the
        # single query instead of the outer container.
        documents = (test_results.get('documents') or [[]])[0] or []
        if not documents:
            logger.error("Test query returned no results")
            return False

        # Metadata is optional per document; pad so every hit is still
        # previewed even when metadata is missing or shorter than the hits.
        metadatas = list((test_results.get('metadatas') or [[]])[0] or [])
        metadatas += [None] * (len(documents) - len(metadatas))

        # Print sample content
        logger.info("Sample content from ChromaDB:")
        for position, (doc, metadata) in enumerate(zip(documents, metadatas), start=1):
            # A hit without a title must not fail the whole verification.
            title = (metadata or {}).get('title', 'N/A')
            logger.info(f"\nDocument {position}:")
            logger.info(f"Title: {title}")
            logger.info(f"Content preview: {(doc or '')[:200]}...")

        return True

    except Exception as e:
        # Top-level boundary of the check: report failure to the caller,
        # but keep the traceback in the log so the cause is diagnosable.
        logger.exception(f"Error testing ChromaDB: {str(e)}")
        return False
|
70 |
+
|
71 |
+
if __name__ == "__main__":
    # Script entry point: run the verification and report the outcome both
    # on stdout and through the process exit status.
    success = test_chromadb_content()
    if success:
        print("ChromaDB content verification successful")
    else:
        print("ChromaDB content verification failed")
    # Exit non-zero on failure so shells and CI jobs can detect it; the
    # printed message alone left the exit status at 0 in both cases.
    raise SystemExit(0 if success else 1)
|