import os
import openai

# Specify the folder path
folder_path = 'documents'

# List all files in the folder
files = os.listdir(folder_path)

# Count the number of files
num_documents = len(files)
print("Number of documents saved in the 'documents' folder:", num_documents)

# Read the OpenAI API key from a placeholder; never hard-code a real key in source files
os.environ['OPENAI_API_KEY'] = 'YOUR_OPENAI_API_KEY'

from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever

## Text Splitting & Docloader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import PyPDFLoader

# List all PDF files in the folder
pdf_files = [file for file in os.listdir(folder_path) if file.endswith('.pdf')]

# Debug print to check the list of PDF files
print("PDF files found:", pdf_files)

# Create a loader for each PDF file
loaders = []
for pdf_file in pdf_files:
    pdf_file_path = os.path.join(folder_path, pdf_file)
    loader = PyPDFLoader(pdf_file_path)
    loaders.append(loader)

# Load all pages from every loader into a single list of documents
docs = []
for loader in loaders:
    docs.extend(loader.load())

"""**Embeddings**"""

import torch

# Check if a GPU is available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': str(device)},  # use the detected device instead of hard-coding 'cpu'
    encode_kwargs=encode_kwargs
)

"""**Using ParentDocumentRetriever to retrieve full documents rather than chunks**"""

# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The vectorstore used to index the child chunks
vectorstore = Chroma(
    collection_name="full_documents",
    embedding_function=bge_embeddings  # OpenAIEmbeddings() could be used instead
)

# The storage layer for the parent documents
store = InMemoryStore()

full_doc_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

# Split each document into child chunks, embed them, and keep the parents in the docstore
full_doc_retriever.add_documents(docs, ids=None)


def answer(query):
    # Retrieve the parent documents whose child chunks best match the query
    # index = vectorstore.similarity_search(query, k=2)
    retrieved_docs = full_doc_retriever.get_relevant_documents(query)
    output = retrieved_docs[0].page_content
    return output
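
# Example usage (a minimal sketch): the query string below is a hypothetical
# placeholder and assumes the 'documents' folder contains at least one PDF
# whose content can be retrieved.
if __name__ == '__main__':
    sample_query = "What is the main topic of the report?"  # hypothetical query
    print(answer(sample_query))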