import os
import tempfile

from langchain.document_loaders import (
    BSHTMLLoader,
    DirectoryLoader,
    Docx2txtLoader,
    JSONLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredExcelLoader,
    UnstructuredFileLoader,
    UnstructuredMarkdownLoader,
)
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.schema import document
from langchain.text_splitter import RecursiveCharacterTextSplitter
# import variables as vr

# Maps a lower-cased file extension to ``(loader class, optional message)``.
# The message, when present, is printed after a successful load.
# csv and html deliberately go through UnstructuredFileLoader rather than
# CSVLoader / BSHTMLLoader, as in the original implementation.
_LOADERS_BY_EXTENSION = {
    "txt": (TextLoader, None),
    "pdf": (PyPDFLoader, None),
    "docx": (Docx2txtLoader, None),
    "csv": (UnstructuredFileLoader, None),
    "xlsx": (UnstructuredExcelLoader, "Loaded the excel file in excel_loader"),
    "html": (UnstructuredFileLoader, "Loaded the html file in html_loader"),
}


def _load_via_temp_file(file, file_type, loader_cls):
    """Copy *file* to a temporary file on disk and load it with *loader_cls*.

    The loaders used here take a filesystem path, so the in-memory upload is
    first written to a temporary file.

    Args:
        file: Object whose ``read()`` returns the raw content. The temporary
            file is opened in its default binary mode, so ``read()`` must
            return bytes.
        file_type: Lower-cased extension without the dot; used as the
            temporary file's suffix.
        loader_cls: Loader class that accepts the file path as its first
            argument and exposes ``load()``.

    Returns:
        Whatever ``loader_cls(path).load()`` returns.
    """
    # Keep the real extension on the temp file.
    # NOTE(review): presumably helps UnstructuredFileLoader pick the right
    # parser for csv/html -- confirm against the unstructured docs.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=f".{file_type}"
    ) as temp_file:
        temp_file.write(file.read())
        temp_file_path = temp_file.name
    # Leaving the ``with`` block flushes and closes the file, so the loader
    # sees the complete content when it reopens the path (and can reopen it
    # at all on Windows).
    try:
        return loader_cls(temp_file_path).load()
    finally:
        # delete=False means nothing else removes the file. Cleanup is
        # best-effort so a failed removal never masks the load result.
        try:
            os.remove(temp_file_path)
        except OSError as exc:
            print(f"Could not remove temporary file {temp_file_path}: {exc}")


def load_documents_fn(files):
    """Load the given files into documents and split them into chunks.

    Supported extensions (matched case-insensitively): txt, pdf, docx, csv,
    xlsx and html. Files with any other extension are skipped with a printed
    notice.

    Args:
        files: Iterable of file-like objects exposing ``name`` and ``read()``
            (presumably Streamlit uploads -- confirm against the caller).

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` over
        everything loaded, using ``chunk_size=1000`` and ``chunk_overlap=100``.
    """
    documents = []
    for file in files:
        print(type(file))
        # Text after the last dot, lower-cased so "REPORT.PDF" is accepted.
        file_type = file.name.rsplit(".", 1)[-1].lower()
        print(file)
        print(file_type)

        entry = _LOADERS_BY_EXTENSION.get(file_type)
        if entry is None:
            print(f"Skipping unsupported file type: {file_type}")
            continue

        loader_cls, loaded_message = entry
        loaded = _load_via_temp_file(file, file_type, loader_cls)
        print(loaded)
        if loaded_message:
            print(loaded_message)
        documents.extend(loaded)

    # Splitting the documents
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,  # chunk overlap seems to work better
    )
    return text_splitter.split_documents(documents)


# def main():
#     documents = load_documents()