import os
import openai

# Specify the folder path
folder_path = 'documents'

# List all files in the folder
files = os.listdir(folder_path)

# Count the number of files
num_documents = len(files)
print("Number of documents saved in the 'documents' folder:", num_documents)

# Read the OpenAI API key from a placeholder; never hard-code a real key in source files
os.environ['OPENAI_API_KEY'] = 'YOUR_OPENAI_API_KEY'

from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever

## Text Splitting & Docloader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import PyPDFLoader

# List all PDF files in the folder
pdf_files = [file for file in os.listdir(folder_path) if file.endswith('.pdf')]

# Debug print to check the list of PDF files
print("PDF files found:", pdf_files)

# Create a loader for each PDF file
loaders = []
for pdf_file in pdf_files:
    pdf_file_path = os.path.join(folder_path, pdf_file)
    loader = PyPDFLoader(pdf_file_path)
    loaders.append(loader)

# Load all pages from every loader into a single list of documents
docs = []
for loader in loaders:
    docs.extend(loader.load())

"""**Embeddings**"""

import torch

# Check if a GPU is available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': str(device)},  # use the detected device instead of hard-coding 'cpu'
    encode_kwargs=encode_kwargs
)

"""**Using ParentDocumentRetriever to retrieve full documents rather than chunks**"""

# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The vectorstore used to index the child chunks
vectorstore = Chroma(
    collection_name="full_documents",
    embedding_function=bge_embeddings  # OpenAIEmbeddings() could be used instead
)

# The storage layer for the parent documents
store = InMemoryStore()

full_doc_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

# Split each document into child chunks, embed them, and keep the parents in the docstore
full_doc_retriever.add_documents(docs, ids=None)


def answer(query):
    # Retrieve the parent documents whose child chunks best match the query
    # index = vectorstore.similarity_search(query, k=2)
    retrieved_docs = full_doc_retriever.get_relevant_documents(query)
    output = retrieved_docs[0].page_content
    return output
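
# Example usage (a minimal sketch): the query string below is a hypothetical
# placeholder and assumes the 'documents' folder contains at least one PDF
# whose content can be retrieved.
if __name__ == '__main__':
    sample_query = "What is the main topic of the report?"  # hypothetical query
    print(answer(sample_query))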