# book_retriever / doc_retriever.py
# achdaisy's picture
# updates
# ec775cd verified
import os
import openai
# Folder that holds the source documents.
folder_path = 'documents'
# Keep only regular files: os.listdir() also returns sub-directories, which
# would otherwise be counted as documents.
files = [
    entry for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
]
# The document count is the number of regular files found in the folder.
num_documents = len(files)
print("Number of documents saved in the 'documents' folder:", num_documents)
# SECURITY: never hard-code credentials in source. The OpenAI API key is
# expected to be supplied by the runtime environment (shell export, a .env
# loader, or the hosting platform's secret store). The key that used to be
# committed on this line must be treated as leaked: revoke it and issue a
# new one.
if not os.environ.get('OPENAI_API_KEY'):
    # Only warn: nothing visible in this file calls OpenAI directly, so a
    # missing key should not stop the script from starting.
    print("Warning: OPENAI_API_KEY is not set; OpenAI-backed features will be unavailable.")
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever
## Text Splitting & Docloader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFLoader
# Folder that holds the PDF documents to index.
folder_path = 'documents'
# Pick up every PDF regardless of extension case ('.pdf', '.PDF', ...) and
# sort the names: os.listdir() returns entries in arbitrary order, and a
# stable order keeps the loaded document list reproducible between runs.
pdf_files = sorted(
    file for file in os.listdir(folder_path)
    if file.lower().endswith('.pdf')
)
# Debug print to check the list of PDF files
print("PDF files found:", pdf_files)
# Create one loader per PDF file, addressed by its path inside the folder.
loaders = []
for pdf_file in pdf_files:
    pdf_file_path = os.path.join(folder_path, pdf_file)
    loaders.append(PyPDFLoader(pdf_file_path))
# Run every loader and gather everything it returns into one flat list.
docs = []
for loader in loaders:
    docs.extend(loader.load())
"""**Embeddings**"""
import torch
# Select the compute device: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
from langchain.embeddings import HuggingFaceBgeEmbeddings
# Identifier of the embedding model handed to HuggingFaceBgeEmbeddings.
model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    # Use the device detected earlier in this file rather than a hard-coded
    # 'cpu', so the "Using device" message matches where the model runs.
    # `device` is a torch.device; `.type` is its plain string name.
    model_kwargs={'device': device.type},
    encode_kwargs=encode_kwargs
)
"""**Using ParentDocumentRetriever to retrieve full documents rather than chunks**"""
# Storage layer that keeps the parent (full) documents in memory.
store = InMemoryStore()

# Vector index for the child chunks, embedded with the BGE model configured
# earlier in this file (OpenAIEmbeddings() was the previous alternative).
vectorstore = Chroma(
    collection_name="full_documents",
    embedding_function=bge_embeddings,
)

# Splitter that produces the child chunks (chunk_size=400).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Retriever wiring: child chunks go to the vector store, parent documents to
# the docstore.
full_doc_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

# Index every loaded document.
# NOTE(review): ids=None presumably lets the retriever assign ids itself;
# confirm against the LangChain documentation.
full_doc_retriever.add_documents(docs, ids=None)
def answer(query):
    """Return the text of the first document retrieved for ``query``.

    Args:
        query: Search text passed to ``full_doc_retriever``.

    Returns:
        The ``page_content`` of the first document the retriever returns, or
        an empty string when the retriever returns no documents.
    """
    retrieved_docs = full_doc_retriever.get_relevant_documents(query)
    # An empty result list would make the [0] lookup raise IndexError, so
    # report "nothing found" as an empty string instead.
    if not retrieved_docs:
        return ""
    return retrieved_docs[0].page_content