# pdf-upload / app.py
# Sam Schneider
# Add more libraries needed to run.
# 8501ef2
import os
import streamlit as st
import fitz # PyMuPDF
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import CharacterTextSplitter
@st.cache_resource
def _load_model(model_name='all-MiniLM-L6-v2'):
    """Return a SentenceTransformer for *model_name*, built at most once.

    Streamlit re-executes this script on every user interaction; the
    ``st.cache_resource`` decorator keeps the constructed model around so
    later reruns reuse it instead of loading it again.

    Args:
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        The ``SentenceTransformer`` instance for *model_name*.
    """
    return SentenceTransformer(model_name)


# Load the sentence transformer model
# NOTE(review): `model` is not referenced anywhere else in the visible part of
# this file; it is kept so the module-level name stays available.
model = _load_model()
def pdf_to_text(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: A binary file-like object; its ``read()`` result is handed
            to PyMuPDF as the raw PDF bytes.

    Returns:
        The text of all pages concatenated in page order, with nothing
        inserted between pages.
    """
    # Opening the document as a context manager closes it even when text
    # extraction raises part-way through.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)
st.title("PDF to Embeddings")

# User inputs for Pinecone
api_key = st.text_input("Enter your Pinecone API Key", type="password")
index_name = st.text_input("Enter the Pinecone Index Name")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    # Extract text from PDF and echo it so the user can check the result.
    text = pdf_to_text(uploaded_file)
    st.write("Extracted Text:")
    st.write(text)

    # Split the text with a chunk size of 1000 characters and no overlap;
    # every chunk is uploaded as one vector.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_text(text)

    if not docs:
        # E.g. a PDF without extractable text: say so instead of reporting a
        # successful upload of 0 vectors.
        st.warning("No text could be extracted from this PDF, so there is nothing to upload.")
    elif api_key and index_name:
        # Initialize Pinecone Index
        # NOTE(review): `pc` and `index` are not used further down in the
        # visible code; they are kept so the module-level names and the
        # up-front client/index lookup stay as they were.
        pc = Pinecone(
            api_key=api_key
        )
        index = pc.Index(index_name)

        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

        # The key is not passed to `from_texts` directly, so it is exported
        # here — presumably PineconeVectorStore reads it from the
        # environment; confirm. The variable is process-wide and stays set
        # after this run.
        os.environ['PINECONE_API_KEY'] = api_key

        # Embed every chunk and store it in the named index.
        # NOTE(review): this branch runs on every script rerun while a file
        # and both inputs are present, so the same chunks may be uploaded
        # more than once — confirm whether an explicit trigger is wanted.
        docsearch = PineconeVectorStore.from_texts(docs, embeddings, index_name=index_name)
        st.write(f"Successfully uploaded {len(docs)} vectors to Pinecone Index '{index_name}'")
    else:
        st.write("Please provide your Pinecone API Key and Index Name.")