# VectorDB.py: load PDF and Word documents into ChromaDB and search them for the TestGradio RAG pipeline.
import os

import chromadb
import docx
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

from config import RAG_CONFIG
# Initialize the embeddings model
embeddings_model = SentenceTransformer("intfloat/e5-large-v2")

# Initialize the ChromaDB client with on-disk persistence
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Create or get the collection; cosine distance is used to compare vectors
collection = chroma_client.get_or_create_collection(
    name="RagDocuments",
    metadata={"hnsw:space": "cosine"},
)
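
# initRAG() below only ingests documents when the collection is empty, so
# re-indexing after the source files change requires clearing the store first.
# `reset_collection` is an illustrative sketch, not part of the original
# module; it assumes the collection name and settings used above.
def reset_collection():
    """Drop and recreate the RagDocuments collection for a clean re-index."""
    global collection
    chroma_client.delete_collection(name="RagDocuments")
    collection = chroma_client.get_or_create_collection(
        name="RagDocuments",
        metadata={"hnsw:space": "cosine"},
    )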
def initRAG(device):
    # NOTE: `device` is currently unused; SentenceTransformer chose its own
    # device when constructed above.
    # Ingest documents only if the collection is empty
    if collection.count() == 0:
        print("Loading documents into ChromaDB...")
        pdf_texts = load_pdfs(RAG_CONFIG["path"])
        word_texts = load_word_docs(RAG_CONFIG["path"])

        all_chunks = []
        # Chunk PDFs into overlapping word windows
        for text in pdf_texts:
            all_chunks.extend(chunk_text(text, chunk_size=100, overlap=5))
        # Chunk Word documents by paragraph breaks
        for text in word_texts:
            all_chunks.extend(text.split("\n\n"))
        # Drop empty or whitespace-only chunks
        all_chunks = [chunk for chunk in all_chunks if chunk.strip()]
        print(f"Total number of chunks: {len(all_chunks)}")

        # Generate embeddings and add them to ChromaDB
        embeddings = embeddings_model.encode(all_chunks)
        collection.add(
            embeddings=embeddings.tolist(),
            documents=all_chunks,
            ids=[f"doc_{i}" for i in range(len(all_chunks))],
        )
### Load PDFs
def load_pdfs(directory):
    texts = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            filepath = os.path.join(directory, filename)
            try:
                with open(filepath, "rb") as file:
                    pdf = PdfReader(file)
                    document_text = ""  # Accumulate text for this file
                    for page in pdf.pages:
                        # extract_text() can return None for image-only pages
                        page_text = page.extract_text() or ""
                        # Normalize whitespace
                        page_text = " ".join(page_text.split())
                        document_text += f"{page_text} "
                    # Append the whole document once, after all pages are read
                    if document_text.strip():
                        texts.append(document_text)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return texts
### Load Word Documents
def load_word_docs(directory):
    texts = []
    for filename in os.listdir(directory):
        if filename.endswith(".docx"):
            filepath = os.path.join(directory, filename)
            try:
                doc = docx.Document(filepath)
                # Skip empty paragraphs and join with blank lines so that
                # initRAG's split("\n\n") recovers one chunk per paragraph
                document_text = "\n\n".join(
                    para.text for para in doc.paragraphs if para.text.strip()
                )
                if document_text.strip():
                    texts.append(document_text)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return texts
### Chunk Text for PDF
def chunk_text(text, chunk_size, overlap=0):
    """Split text into overlapping chunks of roughly `chunk_size` words."""
    # Guard against a zero or negative step, which would loop forever
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    # Each chunk starts `chunk_size - overlap` words after the previous one
    step = chunk_size - overlap
    for i in range(0, len(words), step):
        chunk = " ".join(words[i:i + chunk_size])
        if chunk.strip():  # Ensure the chunk is not empty
            chunks.append(chunk)
        # Stop once a chunk reaches the end of the text; a shorter trailing
        # window would only repeat words already covered
        if i + chunk_size >= len(words):
            break
    return chunks
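
# A quick illustration of the chunking behavior (input values chosen for this
# sketch, not taken from the original):
#   chunk_text("a b c d e f g h", chunk_size=4, overlap=1)
#   -> ["a b c d", "d e f g", "g h"]
# Each chunk shares `overlap` words with the previous one, so text cut at a
# chunk boundary still appears with some context on both sides.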
### Search Documents in ChromaDB
def search_docs(query, top_k=3):
    # Note: e5 models are trained with "query: " / "passage: " prefixes;
    # prepending them here and in initRAG may improve retrieval quality.
    query_embedding = embeddings_model.encode(query)
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], n_results=top_k
    )
    formatted_results = []
    for i in range(len(results["documents"][0])):
        doc = results["documents"][0][i]
        distance = results["distances"][0][i] if "distances" in results else 0
        # With cosine distance, similarity = 1 - distance
        similarity = 1 - distance
        formatted_result = {
            "content": doc,
            "similarity_score": f"{similarity:.2f}",
        }
        formatted_results.append(formatted_result)
    return formatted_results
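
# A minimal end-to-end sketch of how this module is used. The query string is
# illustrative; initRAG expects the documents directory configured under
# RAG_CONFIG["path"] to exist and contain .pdf or .docx files.
if __name__ == "__main__":
    initRAG(device=None)  # device is currently unused, see initRAG
    for result in search_docs("What topics do these documents cover?"):
        print(result["similarity_score"], result["content"][:80])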