Spaces:

Hushh
/

Generative_QNA

Paused

App Files Files Community

Generative_QNA / load_documents.py

Omkar008

Update load_documents.py

d3dc8f1 7 months ago

raw history blame contribute delete

No virus

6.43 kB

	from langchain.document_loaders import DirectoryLoader,PyPDFLoader,UnstructuredMarkdownLoader,BSHTMLLoader,UnstructuredExcelLoader,TextLoader,JSONLoader,Docx2txtLoader
	import tempfile
	from langchain.document_loaders import UnstructuredFileLoader
	from langchain.document_loaders.csv_loader import CSVLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	# import variables as vr
	from langchain.schema import document
	def load_documents_fn(files):
	    """Load uploaded files into LangChain documents and split them into chunks.

	    Each upload is copied to a temporary file on disk (the loaders read from
	    a path), parsed with the loader that matches its file extension, and the
	    temporary copy is deleted again. Uploads whose extension is not one of
	    txt, pdf, docx, csv, xlsx or html are skipped. Progress diagnostics are
	    printed to stdout.

	    Args:
	        files: Iterable of file-like objects exposing a ``name`` attribute
	            and a ``read()`` method returning bytes (the temporary copy is
	            opened in binary mode).

	    Returns:
	        The list produced by ``RecursiveCharacterTextSplitter.split_documents``
	        with ``chunk_size=1000`` and ``chunk_overlap=100``, or an empty list
	        when no supported file was loaded.
	    """
	    documents = []
	    for file in files:
	        print(type(file))

	        # Normalise the case so "REPORT.PDF" is handled like "report.pdf".
	        file_type = file.name.split('.')[-1].lower()
	        print(file)
	        print(file_type)
	        if file_type not in _SUPPORTED_FILE_TYPES:
	            continue

	        loaded = _load_upload(file, file_type)
	        print(loaded)
	        if file_type in _LOADED_MESSAGES:
	            print(_LOADED_MESSAGES[file_type])
	        documents.extend(loaded)

	    # Nothing to split: skip building the splitter altogether.
	    if not documents:
	        return documents

	    # Splitting the documents; chunk overlap seems to work better.
	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
	    return text_splitter.split_documents(documents)


	# File extensions (lower-case, without the dot) that can be parsed.
	_SUPPORTED_FILE_TYPES = frozenset({"txt", "pdf", "docx", "csv", "xlsx", "html"})

	# Extra confirmation line printed after a file of the given type is loaded.
	_LOADED_MESSAGES = {
	    "xlsx": "Loaded the excel file in excel_loader",
	    "html": "Loaded the html file in html_loader",
	}


	def _load_upload(file, file_type):
	    """Copy *file* to a temporary path, parse it, and remove the copy."""
	    import os  # local import: only needed for temporary-file cleanup

	    # Keep the extension on the temporary name so loaders that identify the
	    # format from the file name can do so.
	    temp_file = tempfile.NamedTemporaryFile(suffix="." + file_type, delete=False)
	    try:
	        # Closing the handle flushes the data to disk before a loader
	        # re-opens the file by path.
	        with temp_file:
	            temp_file.write(file.read())
	        return _load_from_path(file_type, temp_file.name)
	    finally:
	        # delete=False means nobody else cleans this up.
	        os.remove(temp_file.name)


	def _load_from_path(file_type, path):
	    """Parse the file at *path* with the loader used for *file_type*."""
	    if file_type == "txt":
	        loader = TextLoader(file_path=path)
	    elif file_type == "pdf":
	        loader = PyPDFLoader(file_path=path)
	    elif file_type == "docx":
	        loader = Docx2txtLoader(file_path=path)
	    elif file_type == "xlsx":
	        loader = UnstructuredExcelLoader(file_path=path)
	    else:
	        # csv and html both go through the generic unstructured loader.
	        loader = UnstructuredFileLoader(path)
	    return loader.load()


	# def main():
	# documents = load_documents()