# ask-osho / create_vector_db.py
import os
from typing import List, Dict
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm
import torch
class PDFVectorizer:
    def __init__(self, pdf_dir: str, db_dir: str):
        self.pdf_dir = pdf_dir
        self.db_dir = db_dir
        # Split extracted text into overlapping chunks; the 200-character
        # overlap keeps sentences that straddle a chunk boundary retrievable.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        # Initialize ChromaDB with sentence-transformers embeddings
        self.client = chromadb.PersistentClient(path=db_dir)
        # Check if a GPU is available for the embedding model
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Using device: {self.device}")
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2",
            device=self.device
        )
        # get_or_create_collection (rather than create_collection) lets the
        # script be re-run without raising on an already-existing collection.
        self.collection = self.client.get_or_create_collection(
            name="osho_books",
            embedding_function=self.embedding_function
        )
    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from a PDF file."""
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = ""
                for page in reader.pages:
                    # extract_text() can return None (e.g. for scanned,
                    # image-only pages), so guard before concatenating.
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
                return text
        except Exception as e:
            print(f"Error processing {pdf_path}: {str(e)}")
            return ""
    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """Process a single PDF file and return chunks with metadata."""
        text = self.extract_text_from_pdf(pdf_path)
        if not text:
            return []
        chunks = self.text_splitter.split_text(text)
        book_name = os.path.basename(pdf_path)
        return [{
            "text": chunk,
            "metadata": {
                "book": book_name,
                "chunk_index": i
            }
        } for i, chunk in enumerate(chunks)]
    def create_vector_database(self):
        """Process all PDFs and create the vector database."""
        # Match case-insensitively so files named *.PDF are not skipped.
        pdf_files = [f for f in os.listdir(self.pdf_dir) if f.lower().endswith('.pdf')]
        for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
            pdf_path = os.path.join(self.pdf_dir, pdf_file)
            chunks = self.process_pdf(pdf_path)
            if chunks:
                # Add chunks to ChromaDB; ids combine the file name and the
                # chunk index so they stay unique across books.
                self.collection.add(
                    documents=[chunk["text"] for chunk in chunks],
                    metadatas=[chunk["metadata"] for chunk in chunks],
                    ids=[f"{pdf_file}_{chunk['metadata']['chunk_index']}" for chunk in chunks]
                )
                print(f"Added {len(chunks)} chunks from {pdf_file}")
if __name__ == "__main__":
    # Define directories
    pdf_dir = os.path.join(os.getcwd(), "OshoBooks")
    db_dir = os.path.join(os.getcwd(), "vector_db")
    # Create the vector database directory if it doesn't exist
    os.makedirs(db_dir, exist_ok=True)
    # Initialize and run the vectorizer
    vectorizer = PDFVectorizer(pdf_dir, db_dir)
    vectorizer.create_vector_database()
    print("Vector database creation completed!")