# NOTE(review): the three lines below were paste/export residue at the top of
# the original file ("Spaces:" / "Runtime error" x2); commented out so the
# file is valid Python.
# Spaces:
# Runtime error
# Runtime error
import os
import openai

# Folder holding the source documents to index.
folder_path = 'documents'

# List all files in the folder and report how many there are.
files = os.listdir(folder_path)
num_documents = len(files)
print("Number of documents saved in the 'documents' folder:", num_documents)

# SECURITY: the original file hard-coded a live OpenAI API key here
# (os.environ['OPENAI_API_KEY'] = 'sk-...'). A key committed to source is
# compromised and must be revoked. Require the key to be supplied via the
# environment instead of embedding it in code.
if 'OPENAI_API_KEY' not in os.environ:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; export it before running this script."
    )
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever
## Text Splitting & Docloader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import PyPDFLoader  # was imported twice in the original

# Folder containing the source PDFs.
folder_path = 'documents'

# List all PDF files in the folder.
pdf_files = [file for file in os.listdir(folder_path) if file.endswith('.pdf')]
# Debug print to check the list of PDF files.
print("PDF files found:", pdf_files)

# Load every page of every PDF into one flat list of Document objects.
# (The original built an intermediate `loaders` list first; a single loop
# that loads each file as it is opened does the same work.)
docs = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(os.path.join(folder_path, pdf_file))
    docs.extend(loader.load())
"""**Embeddings**""" | |
import torch | |
# Check if GPU is available | |
if torch.cuda.is_available(): | |
device = torch.device('cuda') | |
else: | |
device = torch.device('cpu') | |
print(f'Using device: {device}') | |
from langchain.embeddings import HuggingFaceBgeEmbeddings

# BGE small English embedding model. Normalizing the embeddings makes the
# inner product equivalent to cosine similarity.
model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'},
    encode_kwargs=encode_kwargs,
)
"""**Using ParentDocumentRetriever to retrieve full documents rather than chunks**""" | |
# This text splitter is used to create the child documents | |
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400) | |
# The vectorstore to use to index the child chunks | |
vectorstore = Chroma( | |
collection_name="full_documents", | |
embedding_function=bge_embeddings #OpenAIEmbeddings() | |
) | |
# The storage layer for the parent documents | |
store = InMemoryStore() | |
full_doc_retriever = ParentDocumentRetriever( | |
vectorstore=vectorstore, | |
docstore=store, | |
child_splitter=child_splitter, | |
) | |
full_doc_retriever.add_documents(docs, ids=None) | |
def answer(query):
    """Return the full text of the most relevant parent document for *query*.

    Parameters
    ----------
    query : str
        Natural-language question used to search the indexed documents.

    Returns
    -------
    str
        Page content of the top-ranked parent document, or an empty string
        when the retriever finds no match.
    """
    # index = vectorstore.similarity_search(query, k = 2)
    retrieved_docs = full_doc_retriever.get_relevant_documents(query)
    # Guard: the original indexed [0] unconditionally, which raises
    # IndexError whenever the retriever returns an empty result set.
    if not retrieved_docs:
        return ""
    return retrieved_docs[0].page_content