import logging
import os
import tempfile

from langchain.document_loaders import DirectoryLoader,PyPDFLoader,UnstructuredMarkdownLoader,BSHTMLLoader,UnstructuredExcelLoader,TextLoader,JSONLoader,Docx2txtLoader
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# import variables as vr
from langchain.schema import document
_logger = logging.getLogger(__name__)

# Maps a lower-case file extension to a factory that builds the loader for a
# path on disk. Lambdas (rather than the bare classes) keep each loader's
# original call style and defer name lookup until a file of that type is
# actually loaded.
_LOADER_FACTORIES = {
    "txt": lambda path: TextLoader(file_path=path),
    "pdf": lambda path: PyPDFLoader(file_path=path),
    "docx": lambda path: Docx2txtLoader(file_path=path),
    "csv": lambda path: UnstructuredFileLoader(path),
    "xlsx": lambda path: UnstructuredExcelLoader(file_path=path),
    "html": lambda path: UnstructuredFileLoader(path),
}


def _load_uploaded_file(file, file_type, make_loader):
    """Spool *file* to a temporary file and load it with *make_loader*.

    Args:
        file: Object exposing ``read()``; its content is written to a
            binary-mode temporary file.
        file_type: Extension (without the dot) used as the temp-file suffix.
        make_loader: Callable taking the temp-file path and returning an
            object with a ``load()`` method.

    Returns:
        Whatever ``make_loader(path).load()`` returns.
    """
    temp_path = None
    try:
        # The ``with`` block closes (and therefore flushes) the temp file
        # before the loader re-opens it by path; otherwise the loader may
        # see a truncated or empty file.
        with tempfile.NamedTemporaryFile(
            suffix=f".{file_type}", delete=False
        ) as temp_file:
            temp_path = temp_file.name
            temp_file.write(file.read())
        return make_loader(temp_path).load()
    finally:
        # delete=False means nobody else removes the file; clean up here so
        # uploads do not accumulate in the temp directory.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                _logger.warning("Could not remove temp file %s", temp_path)


def load_documents_fn(files, *, chunk_size=1000, chunk_overlap=100):
    """Load every supported file in *files* and split the result into chunks.

    Each file is dispatched on the (case-insensitive) text after the last
    ``.`` in ``file.name``. Supported extensions are the keys of
    ``_LOADER_FACTORIES``; files with any other extension are skipped.

    Args:
        files: Iterable of objects exposing ``name`` and ``read()``.
            ``None`` or an empty iterable yields an empty list.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The list produced by ``split_documents`` over the documents loaded
        from all files, or ``[]`` when nothing was loaded.
    """
    if not files:
        return []

    documents = []
    for file in files:
        file_type = file.name.rsplit(".", 1)[-1].lower()
        make_loader = _LOADER_FACTORIES.get(file_type)
        if make_loader is None:
            _logger.warning(
                "Skipping %s: unsupported file type %r", file.name, file_type
            )
            continue

        loaded = _load_uploaded_file(file, file_type, make_loader)
        _logger.debug("Loaded %d document(s) from %s", len(loaded), file.name)
        documents.extend(loaded)

    if not documents:
        return []

    # Split once, after ALL files are loaded; returning from inside the loop
    # would drop every file after the first.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)
    

# def main():
#     documents = load_documents()