"""Python backend API to chat with private data.

08/15/2023
D.M. Theekshana Samaradiwakara
"""

import os
import time
import glob
from multiprocessing import Pool

from tqdm import tqdm
from dotenv import load_dotenv
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document

from chromaDb import save_files
from chroma import load_store

# Keyword arguments forwarded to TextLoader by DirectoryLoader in
# DataPipeline.load_documents_in_folder.
text_loader_kwargs = {'autodetect_encoding': True}

load_dotenv()

# Raw environment values: a string when the variable is set, None otherwise.
# They are kept unconverted here; DataPipeline.process_documents turns the
# two chunk settings into integers at the point of use.
chunk_size = os.environ.get('EMBEDDING_CHUNK_SIZE')
chunk_overlap = os.environ.get('EMBEDDING_CHUNK_OVERLAP')
embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")

# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # ".docx": (Docx2txtLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".eml": (UnstructuredEmailLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PyMuPDFLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}


def _optional_int(raw, env_name):
    """Convert a raw setting to ``int``, passing ``None`` through unchanged.

    Args:
        raw: Value read from the environment (``str``), ``None`` when the
            variable is unset, or an ``int`` assigned programmatically.
        env_name: Name of the environment variable, used in the error text.

    Returns:
        The integer value, or ``None`` if ``raw`` is ``None``.

    Raises:
        ValueError: If ``raw`` cannot be interpreted as an integer.
    """
    if raw is None:
        return None
    try:
        return int(raw)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"{env_name} must be an integer, got {raw!r}"
        ) from err


class DataPipeline:
    """Load documents, split them into chunks and hand them to the store."""

    def __init__(self):
        # Both attributes start unset; nothing in this class assigns them.
        self.dataset_name = None
        self.vectorstore = None

    def load_documents_in_folder(self, folder):
        """Load every non-hidden file under ``folder`` with ``TextLoader``.

        Args:
            folder: Directory to scan; the glob ``**/[!.]*`` skips names
                that start with a dot.

        Returns:
            Whatever ``DirectoryLoader.load()`` returns for the folder.
        """
        print("loading documents...")
        loader = DirectoryLoader(
            folder,
            glob="**/[!.]*",
            loader_cls=TextLoader,
            loader_kwargs=text_loader_kwargs,
        )
        return loader.load()

    def load_single_document(self, doc):
        """Load one document with the loader registered for its extension.

        The extension is taken from ``doc.name`` and matched
        case-insensitively against ``LOADER_MAPPING``.

        Args:
            doc: Object exposing a ``name`` attribute. It is passed as the
                first argument to the loader class.
                NOTE(review): the loaders presumably expect a file path here;
                confirm against the callers.

        Returns:
            The result of the selected loader's ``load()``.

        Raises:
            ValueError: If the extension has no entry in ``LOADER_MAPPING``.
        """
        # splitext yields "" for names without an extension, and lower()
        # lets "REPORT.PDF" resolve to the ".pdf" loader.
        ext = os.path.splitext(doc.name)[1].lower()
        if ext not in LOADER_MAPPING:
            raise ValueError(f"Unsupported file extension '{ext}'")
        loader_class, loader_args = LOADER_MAPPING[ext]
        return loader_class(doc, **loader_args).load()

    def load_documents(self, uploaded_files):
        """Load several documents in parallel worker processes.

        Args:
            uploaded_files: Sized collection of objects accepted by
                ``load_single_document``.

        Returns:
            A flat list with the documents from every file. Results are
            gathered with ``imap_unordered``, so the order follows completion
            rather than the order of ``uploaded_files``.
        """
        results = []
        with Pool(processes=os.cpu_count()) as pool:
            with tqdm(total=len(uploaded_files),
                      desc='Loading new documents', ncols=80) as pbar:
                loaded = pool.imap_unordered(
                    self.load_single_document, uploaded_files
                )
                for docs in loaded:
                    results.extend(docs)
                    pbar.update()
        return results

    def load_streamlit_documents(self, uploaded_files, year):
        """Wrap uploaded files in ``Document`` objects tagged with a year.

        Args:
            uploaded_files: Iterable of objects with a ``name`` attribute and
                a ``read()`` method returning ``bytes``.
            year: Value stored under the ``'year'`` metadata key.

        Returns:
            A list with one ``Document`` per uploaded file.
        """
        documents = []
        for uploaded_file in uploaded_files:
            # Debug traces; each message is printed once (the former nested
            # print() calls also emitted a stray "None" line).
            print("\n\n uploaded_file \n\n", uploaded_file, "\n")
            source = uploaded_file.name
            print("\n\n source \n\n", source, "\n")
            # latin-1 maps every byte to a code point, so decoding cannot
            # fail. NOTE(review): UTF-8 uploads will be mis-decoded; confirm
            # this is the intended encoding.
            content = uploaded_file.read().decode('latin-1')
            print("\n\n content \n\n", content[:10], "\n")
            doc = Document(
                page_content=content,
                metadata={
                    "source": source,
                    'year': year,
                },
            )
            print("doc")
            print("\n doc \n\n", doc, "\n\n\n\n")
            documents.append(doc)
        return documents

    def process_documents(self, documents):
        """Split documents into chunks for embedding.

        Chunk size and overlap come from the module-level ``chunk_size`` and
        ``chunk_overlap`` settings and are converted to integers here. When a
        setting is unset the matching argument is omitted, so the splitter's
        own default applies.

        Args:
            documents: Documents accepted by
                ``RecursiveCharacterTextSplitter.split_documents``.

        Returns:
            The chunks produced by the splitter.

        Raises:
            ValueError: If a configured chunk setting is not an integer.
        """
        print("Creating embeddings. May take some minutes...")
        splitter_kwargs = {
            # NOTE(review): the third separator is a regex look-behind;
            # depending on the installed langchain version it may only be
            # treated as a pattern with is_separator_regex=True. Confirm.
            "separators": ["\n\n", "\n", r"(?<=\. )", " ", ""],
        }
        size = _optional_int(chunk_size, 'EMBEDDING_CHUNK_SIZE')
        overlap = _optional_int(chunk_overlap, 'EMBEDDING_CHUNK_OVERLAP')
        if size is not None:
            splitter_kwargs["chunk_size"] = size
        if overlap is not None:
            splitter_kwargs["chunk_overlap"] = overlap
        text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)
        return text_splitter.split_documents(documents)

    def persist_documents(self, persist_directory, document_splits):
        """Pass the chunks and target directory to ``save_files``."""
        save_files(persist_directory, document_splits)

    def add_metadata(self, documents, metadata, value):
        """Set ``metadata`` to ``value`` on every document, in place.

        Args:
            documents: Iterable of objects with a dict-like ``metadata``.
            metadata: Metadata key to set.
            value: Value stored under that key.

        Returns:
            The same ``documents`` object that was passed in.
        """
        for doc in documents:
            doc.metadata[metadata] = value
        return documents