Spaces:
Sleeping
Sleeping
# Importing dependencies
from langchain.embeddings import HuggingFaceBgeEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import Chroma | |
from langchain.document_loaders import PyPDFDirectoryLoader | |
from langchain.storage import LocalFileStore | |
import time | |
import torch | |
import streamlit as st | |
import tkinter as tk | |
from tkinter import filedialog | |
from pathlib import Path | |
def select_folder():
    """Open a native folder-picker dialog and return the user's choice.

    A hidden Tk root window is created solely to own the dialog and is
    destroyed again before this function returns.

    Returns:
        Whatever ``filedialog.askdirectory`` returns: the selected directory
        path, or an empty value when the dialog is dismissed (presumably
        ``""`` -- verify on the target platform).
    """
    root = tk.Tk()
    try:
        # Hide the empty root window; only the dialog should be visible.
        root.withdraw()
        # Keep the dialog above other windows so it cannot open unnoticed.
        root.wm_attributes('-topmost', 1)
        return filedialog.askdirectory(master=root)
    finally:
        # Runs even if the dialog raises, so the Tk instance is never leaked.
        root.destroy()
# Pick the torch device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# File store rooted at "../cache/" (the path is relative, so it depends on
# the working directory the script is launched from).
store = LocalFileStore("../cache/")
# --- Folder picker UI -------------------------------------------------------
st.title('Pick Pdfs Folder')
st.write('Please select a folder:')

# Defaults for a run in which no folder has been chosen. The empty string is
# falsy, so any step guarded by `if pdfs_folder:` is skipped.
dirname = ""
pdfs_folder = ""

clicked = st.button('Browse')
if clicked:
    # Create the hidden Tk root only when a dialog is actually needed and
    # always destroy it afterwards. Streamlit re-executes the script on user
    # interaction, so a root created unconditionally would pile up one
    # undestroyed Tk instance per run.
    root = tk.Tk()
    try:
        root.withdraw()
        # Make folder picker dialog appear on top of other windows
        root.wm_attributes('-topmost', 1)
        selected_dir = filedialog.askdirectory(master=root)
    finally:
        root.destroy()

    # A dismissed dialog yields an empty value; normalise it to "" so the
    # text input always receives a string.
    dirname = st.text_input('Selected folder:', selected_dir or "")

    # Only build a Path for a real selection: Path("") is Path("."), which is
    # truthy and would silently point the loader at the working directory.
    if dirname:
        pdfs_folder = Path(dirname)
# Index the selected folder: load PDFs -> split into chunks -> embed -> store
# the vectors in a Chroma collection persisted under 'dbname'.
if pdfs_folder:
    st.write("Selected folder path:", pdfs_folder)

    # Load the PDFs found in the chosen folder and report how many documents
    # the loader produced.
    loader = PyPDFDirectoryLoader(pdfs_folder)
    documents = loader.load()
    st.write(len(documents))

    # splitting: chunks of up to 1000 characters with a 10-character overlap
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    text_chunks = splitter.split_documents(documents)
    st.write(len(text_chunks))

    if not text_chunks:
        # Nothing to index (no PDFs, or none with extractable text). Skip the
        # model load and the database build instead of handing Chroma an
        # empty batch -- NOTE(review): an empty batch may be rejected outright
        # depending on the installed chromadb version; confirm.
        st.warning('No text could be extracted from PDFs in the selected folder.')
    else:
        # loading HuggingFaceBGE embeddings
        model_name = "BAAI/bge-small-en"
        st.write("Loading tokenizer model", model_name)
        model_kwargs = {"device": device}
        encode_kwargs = {"normalize_embeddings": True}
        embeddings = HuggingFaceBgeEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )
        st.write('Embeddings loaded!')

        # creating Documents vector database.
        # perf_counter is a monotonic clock, so the measured interval cannot
        # be skewed by system clock adjustments the way time.time() can.
        t1 = time.perf_counter()
        persist_directory = 'dbname'
        vectordb = Chroma.from_documents(
            documents=text_chunks,
            embedding=embeddings,
            collection_metadata={"hnsw:space": "cosine"},  # cosine distance
            persist_directory=persist_directory,
        )
        t2 = time.perf_counter()
        st.write('Time taken for building db : ', (t2 - t1))