Spaces:
Runtime error
Runtime error
import streamlit as st | |
import faiss | |
import os | |
from PyPDF2 import PdfFileReader | |
from sentence_transformers import SentenceTransformer | |
import pickle | |
# Page heading.
st.title("File Upload and Vector Database Creation")

# The chosen dataset name is later embedded in the index/metadata file names,
# so each dataset gets its own pair of files.
dataset_choices = ["Sales", "Marketing", "HR"]
dataset = st.selectbox(label="Select Dataset", options=dataset_choices)

# File extensions the upload widget will let the user pick.
accepted_extensions = ["txt", "pdf", "docx"]
uploaded_file = st.file_uploader(
    label="Upload your file",
    type=accepted_extensions,
)
# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: The PDF to read, handed straight to ``PyPDF2.PdfReader``.
            The caller in this script passes the Streamlit upload object.

    Returns:
        A single string with each page's extracted text joined back to back
        (no separator is inserted between pages). Pages that yield no text
        contribute an empty string.
    """
    # Imported locally so this function is self-contained. PdfReader and
    # ``.pages`` replace PdfFileReader / getNumPages() / getPage(), which
    # PyPDF2 3.x refuses to run.
    from PyPDF2 import PdfReader

    reader = PdfReader(file)
    # ``or ""`` keeps the join safe if a page's extraction returns nothing.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Everything below runs only once a file has been uploaded.
# NOTE(review): Streamlit re-executes the script on widget interaction, so
# clicking a download button may add the same document to the index again --
# confirm and de-duplicate (e.g. via st.session_state) if that matters.
if uploaded_file is not None:
    # Pick an extractor based on the MIME type reported for the upload.
    if uploaded_file.type == "application/pdf":
        text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == "text/plain":
        text = str(uploaded_file.read(), "utf-8")
    else:
        # The uploader also accepts .docx, but there is no extractor for it.
        # Without this branch `text` was never assigned and the script
        # crashed with NameError at the embedding step.
        st.error(f"Unsupported file type: {uploaded_file.type or 'unknown'}")
        st.stop()

    st.write("File uploaded successfully!")

    # Load pre-trained model for embeddings; the whole document is encoded
    # as a single vector.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode([text])

    # Create or load existing FAISS index. The dimension is taken from the
    # embeddings themselves (384 for this MiniLM model) so it stays correct
    # if the model is ever swapped.
    dimension = embeddings.shape[1]
    index_file = f'vector_db_{dataset}.index'
    if os.path.exists(index_file):
        index = faiss.read_index(index_file)
    else:
        index = faiss.IndexFlatL2(dimension)

    # Add embeddings to the index
    index.add(embeddings)

    # Save the index
    faiss.write_index(index, index_file)

    # Save metadata: a pickled list of the raw document texts, appended in
    # the same order vectors are added to the index.
    metadata_file = f'metadata_{dataset}.pkl'
    if os.path.exists(metadata_file):
        # SECURITY NOTE: pickle.load can execute arbitrary code. This is only
        # acceptable while the file is written exclusively by this app.
        with open(metadata_file, 'rb') as f:
            metadata = pickle.load(f)
    else:
        metadata = []
    metadata.append(text)
    with open(metadata_file, 'wb') as f:
        pickle.dump(metadata, f)

    st.write("Vector database updated and saved successfully!")

    # Option to download the vector database file
    with open(index_file, 'rb') as f:
        st.download_button(
            label=f"Download {index_file}",
            data=f,
            file_name=index_file,
        )

    # Option to download the metadata file
    with open(metadata_file, 'rb') as f:
        st.download_button(
            label=f"Download {metadata_file}",
            data=f,
            file_name=metadata_file,
        )