"""Streamlit app: extract text from an uploaded PDF and store embeddings in Pinecone.

The script reads a PDF through PyMuPDF, splits the text into chunks, embeds the
chunks with a sentence-transformer model and writes them to a Pinecone index
whose name and API key are entered by the user.
"""

import os

import fitz  # PyMuPDF
import streamlit as st
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import CharacterTextSplitter
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# Single source of truth for the embedding model used throughout the script.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Load the sentence transformer model.
# NOTE(review): `model` is not referenced anywhere else in the visible source;
# the embeddings wrapper further down is built from the model name on its own.
# Confirm nothing else relies on this object before removing the eager load.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)


def pdf_to_text(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` that yields the
            raw PDF bytes (here, the object returned by ``st.file_uploader``).

    Returns:
        A single string made of each page's ``get_text()`` output joined with
        no separator. Empty when no page yields any text.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)


st.title("PDF to Embeddings")

# User inputs for Pinecone
api_key = st.text_input("Enter your Pinecone API Key", type="password")
index_name = st.text_input("Enter the Pinecone Index Name")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    # Extract text from PDF
    text = pdf_to_text(uploaded_file)
    st.write("Extracted Text:")
    st.write(text)

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_text(text)

    if not (api_key and index_name):
        st.write("Please provide your Pinecone API Key and Index Name.")
    elif not docs:
        # Nothing to embed: avoid calling the vector store with an empty list
        # and reporting a misleading success message.
        st.warning(
            "No text could be extracted from this PDF; nothing was uploaded."
        )
    else:
        try:
            # Initialize Pinecone Index
            pc = Pinecone(api_key=api_key)
            index = pc.Index(index_name)

            embeddings = SentenceTransformerEmbeddings(
                model_name=EMBEDDING_MODEL_NAME
            )

            # Expose the key through the environment before building the
            # vector store, which is only given the index name explicitly.
            os.environ["PINECONE_API_KEY"] = api_key
            docsearch = PineconeVectorStore.from_texts(
                docs, embeddings, index_name=index_name
            )
        except Exception as exc:  # top-level UI boundary: report, don't crash
            st.error(f"Failed to upload to Pinecone: {exc}")
        else:
            st.write(
                f"Successfully uploaded {len(docs)} vectors to Pinecone Index '{index_name}'"
            )