Refactor VectorDB.py to comment out debug print statement and update requirements.txt to correct docx package name
4bdcb8a
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from config import RAG_CONFIG
import os
from PyPDF2 import PdfReader
import chromadb
import docx
# Initialize the embeddings model
embeddings_model = SentenceTransformer("intfloat/e5-large-v2")

# Initialize the ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Create or get the collection
collection = chroma_client.get_or_create_collection(
    name="RagDocuments",
    metadata={
        "hnsw:space": "cosine"
    },  # cosine similarity will be used to measure the distance between vectors
)
def initRAG(device):
    # Initialize documents if the collection is empty
    if collection.count() == 0:
        print("Loading documents into ChromaDB...")
        pdf_texts = load_pdfs(RAG_CONFIG["path"])
        word_texts = load_word_docs(RAG_CONFIG["path"])
        all_chunks = []
        # Chunk PDF text into overlapping word windows
        for text in pdf_texts:
            all_chunks.extend(chunk_text(text, chunk_size=100, overlap=5))
        # Chunk Word documents by paragraphs
        for text in word_texts:
            all_chunks.extend(text.split("\n\n"))
        # Drop empty chunks
        all_chunks = [chunk for chunk in all_chunks if chunk.strip()]
        print(f"Total number of chunks: {len(all_chunks)}")
        # print(all_chunks)
        # Generate embeddings and add to ChromaDB
        embeddings = embeddings_model.encode(all_chunks)
        collection.add(
            embeddings=embeddings.tolist(),
            documents=all_chunks,
            ids=[f"doc_{i}" for i in range(len(all_chunks))],
        )
### Load PDFs
def load_pdfs(directory):
    texts = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            filepath = os.path.join(directory, filename)
            try:
                with open(filepath, "rb") as file:
                    pdf = PdfReader(file)
                    document_text = ""  # Initialize for each file
                    for page in pdf.pages:
                        page_text = page.extract_text() or ""
                        # Normalize whitespace
                        page_text = " ".join(page_text.split())
                        document_text += f"{page_text} "
                    # Append the document once, and only if some text was extracted
                    if document_text.strip():
                        texts.append(document_text)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return texts
### Load Word Documents
def load_word_docs(directory):
    texts = []
    for filename in os.listdir(directory):
        if filename.endswith(".docx"):
            filepath = os.path.join(directory, filename)
            try:
                doc = docx.Document(filepath)
                document_text = "\n".join([para.text for para in doc.paragraphs])
                # Skip documents that contain only empty paragraphs
                if document_text.strip():
                    texts.append(document_text)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return texts
### Chunk Text for PDFs
def chunk_text(text, chunk_size, overlap=0):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        # Calculate the end index for the current chunk
        end = min(i + chunk_size, len(words))
        # Create the chunk from words
        chunk = " ".join(words[i:end])
        if chunk.strip():  # Ensure the chunk is not empty
            chunks.append(chunk)
        # Move the index forward by chunk_size - overlap
        i += chunk_size - overlap
        # Stop early if the leftover words are fewer than the overlap
        if i < len(words) and len(words) - i < overlap:
            break
    # Add a final chunk if there are remaining words
    if i < len(words):
        chunks.append(" ".join(words[i:]))
    return chunks
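# Illustrative example of the sliding window above (values chosen only for
# demonstration): with chunk_size=5 and overlap=2, a 12-word text yields the
# word windows [0:5], [3:8], [6:11] and [9:12], so consecutive chunks share
# two words. initRAG applies the same scheme with chunk_size=100, overlap=5.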
### Search Documents in ChromaDB
def search_docs(query, top_k=3):
    query_embedding = embeddings_model.encode(query)
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], n_results=top_k
    )
    formatted_results = []
    for i in range(len(results["documents"][0])):
        doc = results["documents"][0][i]
        distance = results["distances"][0][i] if "distances" in results else 0
        similarity = 1 - distance  # Convert distance to similarity score
        formatted_result = {
            "content": doc,
            "similarity_score": f"{similarity:.2f}",
        }
        formatted_results.append(formatted_result)
    return formatted_results
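# Hypothetical usage sketch: assumes RAG_CONFIG["path"] points at a directory
# of .pdf/.docx files and uses a placeholder query string. initRAG only
# populates the collection when it is empty, so repeated runs reuse the
# persisted index in ./chroma_db.
if __name__ == "__main__":
    initRAG(device="cpu")
    for hit in search_docs("What topics do the indexed documents cover?", top_k=3):
        print(hit["similarity_score"], hit["content"][:80])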