import os
import pickle

import faiss
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

st.title("File Upload and Vector Database Creation")

dataset = st.selectbox("Select Dataset", ["Sales", "Marketing", "HR"])
uploaded_file = st.file_uploader("Upload your file", type=["txt", "pdf", "docx"])

# Function to extract text from a PDF
def extract_text_from_pdf(file):
    reader = PdfReader(file)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text

if uploaded_file is not None:
    if uploaded_file.type == "application/pdf":
        text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == "text/plain":
        text = str(uploaded_file.read(), "utf-8")
    else:
        st.error("Text extraction for this file type is not implemented yet.")
        st.stop()

    st.write("File uploaded successfully!")

    # Load pre-trained model for embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # encode() returns a float32 NumPy array, which is what FAISS expects
    embeddings = model.encode([text])

    # Create or load the existing FAISS index for the selected dataset
    dimension = 384  # Embedding size of all-MiniLM-L6-v2
    index_file = f'vector_db_{dataset}.index'
    if os.path.exists(index_file):
        index = faiss.read_index(index_file)
    else:
        index = faiss.IndexFlatL2(dimension)

    # Add the new embedding to the index and persist it to disk
    index.add(embeddings)
    faiss.write_index(index, index_file)

    # Save metadata (the raw text) alongside the index so search results
    # can later be mapped back to their source documents
    metadata_file = f'metadata_{dataset}.pkl'
    if os.path.exists(metadata_file):
        with open(metadata_file, 'rb') as f:
            metadata = pickle.load(f)
    else:
        metadata = []
    metadata.append(text)
    with open(metadata_file, 'wb') as f:
        pickle.dump(metadata, f)

    st.write("Vector database updated and saved successfully!")

    # Option to download the vector database file
    with open(index_file, 'rb') as f:
        st.download_button(
            label=f"Download {index_file}",
            data=f,
            file_name=index_file
        )

    # Option to download the metadata file
    with open(metadata_file, 'rb') as f:
        st.download_button(
            label=f"Download {metadata_file}",
            data=f,
            file_name=metadata_file
        )