# Generative_QNA / load_documents.py
# Omkar008's picture
# Update load_documents.py
# d3dc8f1
from langchain.document_loaders import DirectoryLoader,PyPDFLoader,UnstructuredMarkdownLoader,BSHTMLLoader,UnstructuredExcelLoader,TextLoader,JSONLoader,Docx2txtLoader
import tempfile
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# import variables as vr
from langchain.schema import document
def load_documents_fn(files):
    """Load uploaded files into LangChain documents and split them into chunks.

    Each supported file is copied to a scratch file on disk (the loaders used
    here are given a filesystem path), parsed with the loader registered for
    its extension, and the parsed documents are then split with
    ``RecursiveCharacterTextSplitter``.

    Args:
        files: Iterable of file-like objects exposing ``name`` and ``read()``.
            Presumably upload objects handed over by the web UI -- confirm
            against the caller. Supported extensions (matched
            case-insensitively): txt, pdf, docx, csv, xlsx, html. Files with
            any other extension are skipped without being read.

    Returns:
        A list of document chunks (chunk_size=1000, chunk_overlap=100), or an
        empty list when no supported file was loaded.
    """
    # One factory per supported extension; each takes the scratch-file path.
    loader_factories = {
        "txt": lambda path: TextLoader(file_path=path),
        "pdf": lambda path: PyPDFLoader(file_path=path),
        "docx": lambda path: Docx2txtLoader(file_path=path),
        "csv": lambda path: UnstructuredFileLoader(path),
        "xlsx": lambda path: UnstructuredExcelLoader(file_path=path),
        "html": lambda path: UnstructuredFileLoader(path),
    }
    # Extra progress lines that are emitted only for these formats.
    loaded_messages = {
        "xlsx": "Loaded the excel file in excel_loader",
        "html": "Loaded the html file in html_loader",
    }

    documents = []
    # All scratch files live in one directory that is removed on exit, even if
    # a loader raises, so nothing is left behind on disk.
    # NOTE(review): any path the loaders record in document metadata will no
    # longer exist after this function returns -- confirm nothing downstream
    # reopens it.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for file in files:
            print(type(file))
            file_type = file.name.split('.')[-1]
            print(file)
            print(file_type)

            extension = file_type.lower()
            make_loader = loader_factories.get(extension)
            if make_loader is None:
                # Unsupported extension: skip without consuming the file.
                continue

            # Leaving the ``with`` closes the scratch file, which flushes the
            # buffered bytes to disk before the loader reopens it by path.
            with tempfile.NamedTemporaryFile(dir=scratch_dir, delete=False) as temp_file:
                temp_file.write(file.read())

            loaded_docs = make_loader(temp_file.name).load()
            print(loaded_docs)
            if extension in loaded_messages:
                print(loaded_messages[extension])
            documents.extend(loaded_docs)

    if not documents:
        # Nothing was loaded, so there is nothing to split.
        return []

    # Splitting the documents; a small chunk overlap seems to work better.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return text_splitter.split_documents(documents)
# def main():
# documents = load_documents()