Spaces:
Paused
Paused
from langchain.document_loaders import DirectoryLoader,PyPDFLoader,UnstructuredMarkdownLoader,BSHTMLLoader,UnstructuredExcelLoader,TextLoader,JSONLoader,Docx2txtLoader | |
import tempfile | |
from langchain.document_loaders import UnstructuredFileLoader | |
from langchain.document_loaders.csv_loader import CSVLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# import variables as vr | |
from langchain.schema import document | |
# File extensions (lower-case, without the dot) that load_documents_fn handles.
_SUPPORTED_EXTENSIONS = frozenset({"txt", "pdf", "docx", "csv", "xlsx", "html"})


def _load_single_file(path, extension):
    """Load the file at *path* using the loader chosen for *extension*.

    *extension* must be one of ``_SUPPORTED_EXTENSIONS``. Returns whatever the
    selected loader's ``load()`` returns.
    """
    # Loader classes are looked up only inside the branch that needs them, so
    # one unavailable loader does not break the other file types.
    if extension == "txt":
        loader = TextLoader(file_path=path)
    elif extension == "pdf":
        loader = PyPDFLoader(file_path=path)
    elif extension == "docx":
        loader = Docx2txtLoader(file_path=path)
    elif extension == "xlsx":
        loader = UnstructuredExcelLoader(file_path=path)
    else:
        # "csv" and "html" both go through the generic unstructured loader.
        loader = UnstructuredFileLoader(path)
    return loader.load()


def load_documents_fn(files):
    """Load uploaded files and split their contents into chunks.

    Args:
        files: Iterable of uploaded file objects, each exposing a ``name``
            attribute and a ``read()`` method whose result can be written to
            a binary temporary file. ``None`` is treated as "no files".

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_documents``
        (chunk_size=1000, chunk_overlap=100) over everything that was loaded,
        or an empty list when nothing was loaded. Files whose extension is
        not supported are skipped without being read.
    """
    # Function-scope import: only needed here, to delete the temporary copies.
    import os

    documents = []
    for file in files or []:
        # Match the extension case-insensitively so "REPORT.PDF" is accepted.
        extension = file.name.rsplit(".", 1)[-1].lower()
        if extension not in _SUPPORTED_EXTENSIONS:
            continue

        # The loaders take a filesystem path, so the upload is copied to a
        # temporary file. The suffix keeps the original extension for loaders
        # that infer the format from the file name (presumably relevant to
        # UnstructuredFileLoader -- TODO confirm).
        temp_file = tempfile.NamedTemporaryFile(
            suffix="." + extension, delete=False
        )
        try:
            # Closing the handle flushes the buffered write, so the loader
            # sees the complete content when it reopens the path.
            with temp_file:
                temp_file.write(file.read())
            documents.extend(_load_single_file(temp_file.name, extension))
        finally:
            # delete=False means cleanup is our job, including on failure.
            os.remove(temp_file.name)

    if not documents:
        # Nothing to split; skip building the splitter.
        return documents

    # A small overlap between chunks keeps context across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100
    )
    return text_splitter.split_documents(documents)
# def main(): | |
# documents = load_documents() | |