Spaces:
Running
Running
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import Chroma | |
| from sentence_transformers import SentenceTransformer | |
| from langchain_core.embeddings import Embeddings | |
| # ---------------------------- | |
| # Embeddings | |
| # ---------------------------- | |
class SBERTEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a SentenceTransformer model.

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.
            Defaults to ``"all-MiniLM-L6-v2"``, so ``SBERTEmbeddings()``
            with no arguments loads that model.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        # Load the model once per instance; both embed methods reuse it.
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode a batch of texts.

        Args:
            texts: The texts to embed, forwarded unchanged to
                ``self.model.encode``.

        Returns:
            The encoder output converted to plain Python lists via
            ``.tolist()``.
        """
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        """Encode a single query text.

        Args:
            text: The query to embed, forwarded unchanged to
                ``self.model.encode``.

        Returns:
            The encoder output converted to a plain Python list via
            ``.tolist()``.
        """
        return self.model.encode(text).tolist()
# ----------------------------
# Configuration
# ----------------------------
# CHANGE THIS FILE NAME
PDF_PATH = "data/AI_note.pdf"
# Splitter settings passed to RecursiveCharacterTextSplitter below.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
# Directory handed to Chroma as ``persist_directory``.
PERSIST_DIRECTORY = "chroma_db"

# ----------------------------
# Loading
# ----------------------------
print("Loading PDF...")
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()
print(f"Loaded {len(documents)} pages")

# ----------------------------
# Chunking
# ----------------------------
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")

# Stop early with a clear message instead of handing an empty batch to the
# vector store (e.g. when the PDF contains no extractable text).
if not chunks:
    raise ValueError(
        f"No text chunks were produced from {PDF_PATH!r}; "
        "nothing to store in the vector database."
    )

# ----------------------------
# Vector store
# ----------------------------
embedding_model = SBERTEmbeddings()
print("Creating vector database...")
# NOTE(review): if PERSIST_DIRECTORY already holds a collection, this call
# presumably adds to it rather than replacing it -- confirm whether re-running
# the script should clear the directory first.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=PERSIST_DIRECTORY,
)
print("Vector DB created successfully!")