book_retriever

Runtime error

App Files Files Community

book_retriever / doc_retriever.py

achdaisy

updates

ec775cd verified 3 months ago

raw history blame contribute delete

No virus

2.67 kB


	import os
	import openai

	# Directory whose contents are counted.
	folder_path = 'documents'

	# Gather the name of every entry in that directory.
	with os.scandir(folder_path) as entries:
	    files = [entry.name for entry in entries]

	# How many entries were found.
	num_documents = len(files)

	print("Number of documents saved in the 'documents' folder:", num_documents)


	# SECURITY: an OpenAI API key used to be hard-coded on this line. A secret
	# committed to a repository must be treated as leaked: revoke that key and
	# supply a fresh one through the OPENAI_API_KEY environment variable (for
	# example a deployment secret) instead of through source code.
	if not os.environ.get('OPENAI_API_KEY'):
	    print("Warning: OPENAI_API_KEY is not set in the environment.")

	from langchain.schema import Document
	from langchain.vectorstores import Chroma
	from langchain.retrievers import ParentDocumentRetriever

	## Text Splitting & Docloader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.storage import InMemoryStore
	from langchain.document_loaders import PyPDFLoader



	from langchain.document_loaders import PyPDFLoader
	# Folder that holds the PDF files to load.
	folder_path = 'documents'

	# Collect the PDF file names. The extension check is case-insensitive so a
	# file such as "Book.PDF" is not silently skipped, and the names are sorted
	# because os.listdir() returns entries in arbitrary order.
	pdf_files = sorted(
	    file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')
	)

	# Debug print to check the list of PDF files
	print("PDF files found:", pdf_files)

	# Create one loader per PDF file.
	loaders = [
	    PyPDFLoader(os.path.join(folder_path, pdf_file)) for pdf_file in pdf_files
	]

	# Run every loader and gather everything they return in one flat list.
	docs = []
	for loader in loaders:
	    docs.extend(loader.load())


	"""Embeddings"""

	import torch

	# Pick CUDA when PyTorch reports it as available, otherwise fall back to CPU.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	print(f'Using device: {device}')



	from langchain.embeddings import HuggingFaceBgeEmbeddings

	model_name = "BAAI/bge-small-en-v1.5"
	encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

	# Run the embedding model on the device selected earlier in this script
	# (the one reported by the "Using device" print) instead of always forcing
	# the CPU. str() turns the torch.device into its plain name ('cuda'/'cpu').
	bge_embeddings = HuggingFaceBgeEmbeddings(
	    model_name=model_name,
	    model_kwargs={'device': str(device)},
	    encode_kwargs=encode_kwargs,
	)

	"""Using ParentDocumentRetriever to retrieve full documents rather than chunks"""

	# Storage layer that holds the parent documents.
	store = InMemoryStore()

	# Chroma collection in which the child chunks are indexed, embedded with
	# the BGE embedding model.
	vectorstore = Chroma(
	    embedding_function=bge_embeddings,
	    collection_name="full_documents",
	)

	# Splitter that cuts the loaded documents into the child documents.
	child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

	# Wire the splitter, the vector index and the docstore together.
	full_doc_retriever = ParentDocumentRetriever(
	    child_splitter=child_splitter,
	    docstore=store,
	    vectorstore=vectorstore,
	)

	# Feed the loaded documents to the retriever; no explicit ids are supplied.
	full_doc_retriever.add_documents(docs, ids=None)


	def answer(query):
	    """Return the text of the first document retrieved for ``query``.

	    Args:
	        query: The search text passed to ``full_doc_retriever``.

	    Returns:
	        The ``page_content`` of the first document the retriever returns,
	        or an empty string when the retriever returns no documents.
	    """
	    retrieved_docs = full_doc_retriever.get_relevant_documents(query)

	    # Nothing retrieved (e.g. nothing was indexed): indexing [0] would raise
	    # IndexError, so report "no result" as an empty string instead.
	    if not retrieved_docs:
	        return ""

	    return retrieved_docs[0].page_content