from transformers import pipeline
from sentence_transformers import SentenceTransformer
from config import RAG_CONFIG
import os
from PyPDF2 import PdfReader
import chromadb
import docx

# Initialize the embeddings model
embeddings_model = SentenceTransformer("intfloat/e5-large-v2")

# Initialize the ChromaDB client and create (or get) the collection
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(
    name="RagDocuments",
    # Cosine similarity is used to measure the distance between vectors
    metadata={"hnsw:space": "cosine"},
)


def initRAG(device):
    # Load documents only if the collection is empty
    if collection.count() == 0:
        print("Loading documents into ChromaDB...")
        pdf_texts = load_pdfs(RAG_CONFIG["path"])
        word_texts = load_word_docs(RAG_CONFIG["path"])

        all_chunks = []
        # Chunk PDF documents with a sliding word window
        for text in pdf_texts:
            all_chunks.extend(chunk_text(text, chunk_size=100, overlap=5))
        # Chunk Word documents by paragraphs
        for text in word_texts:
            all_chunks.extend(text.split("\n\n"))

        # Drop empty chunks
        all_chunks = [chunk for chunk in all_chunks if chunk.strip()]
        print(f"Total number of chunks: {len(all_chunks)}")

        # Generate embeddings and add them to ChromaDB
        embeddings = embeddings_model.encode(all_chunks)
        collection.add(
            embeddings=embeddings.tolist(),
            documents=all_chunks,
            ids=[f"doc_{i}" for i in range(len(all_chunks))],
        )


### Load PDFs
def load_pdfs(directory):
    texts = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            filepath = os.path.join(directory, filename)
            try:
                with open(filepath, "rb") as file:
                    pdf = PdfReader(file)
                    document_text = ""  # Accumulate text for each file
                    for page in pdf.pages:
                        page_text = page.extract_text() or ""
                        # Normalize whitespace
                        page_text = " ".join(page_text.split())
                        document_text += f"{page_text} "
                    # Keep the document only if it contains extractable text
                    if document_text.strip():
                        texts.append(document_text)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return texts


### Load Word Documents
def load_word_docs(directory):
    texts = []
    for filename in os.listdir(directory):
        if filename.endswith(".docx"):
            filepath = os.path.join(directory, filename)
            try:
                doc = docx.Document(filepath)
                document_text = "\n".join(para.text for para in doc.paragraphs)
                # Skip documents with no text content
                if document_text.strip():
                    texts.append(document_text)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return texts


### Chunk Text for PDF
def chunk_text(text, chunk_size, overlap=0):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        # Calculate the end index for the current chunk
        end = min(i + chunk_size, len(words))
        chunk = " ".join(words[i:end])
        if chunk.strip():  # Ensure the chunk is not empty
            chunks.append(chunk)
        # Stop once the last chunk has reached the end of the text
        if end == len(words):
            break
        # Move the window forward, keeping `overlap` words of shared context
        i += chunk_size - overlap
    return chunks


### Search Documents in ChromaDB
def search_docs(query, top_k=3):
    query_embedding = embeddings_model.encode(query)
    results = collection.query(
        query_embeddings=[query_embedding.tolist()], n_results=top_k
    )

    formatted_results = []
    for i in range(len(results["documents"][0])):
        doc = results["documents"][0][i]
        distance = results["distances"][0][i] if "distances" in results else 0
        similarity = 1 - distance  # Convert cosine distance to a similarity score
        formatted_results.append(
            {
                "content": doc,
                "similarity_score": f"{similarity:.2f}",
            }
        )
    return formatted_results
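

# --- Usage sketch (illustrative only, not part of the module above) ---
# A minimal example of how initRAG and search_docs might be wired together.
# It assumes RAG_CONFIG["path"] in config.py points at a directory of .pdf/.docx
# files and that the example query string is purely hypothetical.
if __name__ == "__main__":
    initRAG(device="cpu")  # Populates ChromaDB on first run; no-op once the collection has documents
    for hit in search_docs("example question about the indexed documents"):
        print(hit["similarity_score"], hit["content"][:80])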