import os
from typing import List, Dict

import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm
import torch

class PDFVectorizer:
    def __init__(self, pdf_dir: str, db_dir: str):
        self.pdf_dir = pdf_dir
        self.db_dir = db_dir
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        # Initialize ChromaDB with sentence-transformers embeddings
        self.client = chromadb.PersistentClient(path=db_dir)
        # Check whether a GPU is available
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Using device: {self.device}")
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2",
            device=self.device
        )
        # get_or_create_collection makes the script safe to re-run;
        # create_collection would raise if "osho_books" already exists
        self.collection = self.client.get_or_create_collection(
            name="osho_books",
            embedding_function=self.embedding_function
        )

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from a PDF file."""
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = ""
                for page in reader.pages:
                    # extract_text() can return None for pages with no
                    # extractable text, so fall back to an empty string
                    text += (page.extract_text() or "") + "\n"
                return text
        except Exception as e:
            print(f"Error processing {pdf_path}: {str(e)}")
            return ""

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """Process a single PDF file and return chunks with metadata."""
        text = self.extract_text_from_pdf(pdf_path)
        if not text:
            return []
        chunks = self.text_splitter.split_text(text)
        book_name = os.path.basename(pdf_path)
        return [{
            "text": chunk,
            "metadata": {
                "book": book_name,
                "chunk_index": i
            }
        } for i, chunk in enumerate(chunks)]

    def create_vector_database(self):
        """Process all PDFs and create the vector database."""
        # Match case-insensitively so files named e.g. "Book.PDF" are not skipped
        pdf_files = [f for f in os.listdir(self.pdf_dir) if f.lower().endswith('.pdf')]
        for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
            pdf_path = os.path.join(self.pdf_dir, pdf_file)
            chunks = self.process_pdf(pdf_path)
            if chunks:
                # Add chunks to ChromaDB; ids must be unique, so combine
                # the file name with the chunk index
                self.collection.add(
                    documents=[chunk["text"] for chunk in chunks],
                    metadatas=[chunk["metadata"] for chunk in chunks],
                    ids=[f"{pdf_file}_{chunk['metadata']['chunk_index']}" for chunk in chunks]
                )
                print(f"Added {len(chunks)} chunks from {pdf_file}")

if __name__ == "__main__":
    # Define directories
    pdf_dir = os.path.join(os.getcwd(), "OshoBooks")
    db_dir = os.path.join(os.getcwd(), "vector_db")

    # Create the vector database directory if it doesn't exist
    os.makedirs(db_dir, exist_ok=True)

    # Initialize and run the vectorizer
    vectorizer = PDFVectorizer(pdf_dir, db_dir)
    vectorizer.create_vector_database()
    print("Vector database creation completed!")
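
    # Minimal sanity-check sketch (not part of the original script; the query
    # text is an arbitrary example). ChromaDB embeds the query with the same
    # sentence-transformers model and returns the nearest stored chunks,
    # which is a quick way to confirm the database was built correctly.
    results = vectorizer.collection.query(
        query_texts=["What does meditation mean?"],
        n_results=3
    )
    # query() returns one result list per query text, hence the [0] indexing
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(f"[{meta['book']} chunk {meta['chunk_index']}] {doc[:100]}...")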