"""Streamlit page that builds a Chroma vector database from a folder of PDFs.

The user picks a folder through a native Tk dialog; every PDF in it is loaded,
split into overlapping text chunks, embedded with a BGE sentence-embedding
model, and stored in a Chroma collection persisted under ``dbname``.
"""

# importing dependencies
import time
import tkinter as tk
from pathlib import Path
from tkinter import filedialog

import streamlit as st
import torch
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.storage import LocalFileStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


def select_folder():
    """Open a native folder-picker dialog and return the chosen path.

    A hidden Tk root is created only for the lifetime of the dialog and is
    always destroyed afterwards, so repeated Streamlit reruns do not
    accumulate Tk windows.

    Returns:
        The selected directory as returned by ``filedialog.askdirectory``;
        an empty value when the user cancels the dialog.
    """
    root = tk.Tk()
    try:
        root.withdraw()
        # Make folder picker dialog appear on top of other windows
        root.wm_attributes('-topmost', 1)
        return filedialog.askdirectory(master=root)
    finally:
        root.destroy()


# check if CUDA is available and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

store = LocalFileStore("../cache/")

# Folder picker button
st.title('Pick Pdfs Folder')
st.write('Please select a folder:')
dirname = ""
pdfs_folder = None
clicked = st.button('Browse')
if clicked:
    # Normalise a cancelled dialog to "" so the text box shows an empty value.
    dirname = st.text_input('Selected folder:', select_folder() or "")
    # Decide on the string, not on the Path: Path("") is Path(".") and every
    # Path object is truthy, so a cancelled dialog would otherwise index the
    # current working directory.
    if dirname:
        pdfs_folder = Path(dirname)

if pdfs_folder is not None:
    st.write("Selected folder path:", pdfs_folder)

    # loading data
    loader = PyPDFDirectoryLoader(pdfs_folder)
    documents = loader.load()
    st.write(len(documents))
    if not documents:
        # Nothing to embed; stop this run instead of building an empty index.
        st.warning('No PDF pages could be loaded from the selected folder.')
        st.stop()

    # splitting
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    text_chunks = splitter.split_documents(documents)
    st.write(len(text_chunks))
    if not text_chunks:
        # PDFs with no extractable text (e.g. scanned images) yield no chunks.
        st.warning('The selected PDFs contain no extractable text.')
        st.stop()

    # loading HuggingFaceBGE embeddings
    model_name = "BAAI/bge-small-en"
    st.write("Loading tokenizer model", model_name)
    model_kwargs = {"device": device}
    encode_kwargs = {"normalize_embeddings": True}
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    st.write('Embeddings loaded!')

    # creating Documents vector database.
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    t1 = time.perf_counter()
    persist_directory = 'dbname'
    vectordb = Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory=persist_directory,
    )
    t2 = time.perf_counter()
    st.write('Time taken for building db : ', (t2 - t1))