# Spaces: Sleeping (page-status text captured along with the source; not part of the program)
import hashlib
import os

import fitz  # PyMuPDF
import streamlit as st
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import CharacterTextSplitter
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
# Sentence-transformer model instantiated at module top level.
# NOTE(review): nothing in the visible script reads `model`; the upload path
# builds its own SentenceTransformerEmbeddings -- confirm before removing.
_MODEL_ID = "all-MiniLM-L6-v2"
model = SentenceTransformer(_MODEL_ID)
def pdf_to_text(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Binary file-like object whose ``read()`` returns the raw
            PDF bytes (the script passes the Streamlit upload).

    Returns:
        The ``page.get_text()`` output of each page joined with no separator;
        an empty string when no page yields any text.
    """
    # Opening the document as a context manager closes it even when a page
    # fails to parse, instead of leaving it open until garbage collection.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
st.title("PDF to Embeddings")

# User inputs for Pinecone
api_key = st.text_input("Enter your Pinecone API Key", type="password")
index_name = st.text_input("Enter the Pinecone Index Name")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    # Extract text from PDF and show it to the user
    text = pdf_to_text(uploaded_file)
    st.write("Extracted Text:")
    st.write(text)

    # NOTE(review): CharacterTextSplitter splits on its separator (documented
    # default "\n\n"), so chunks can exceed chunk_size when the text has few
    # blank lines -- confirm this is the intended chunking.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_text(text)

    if not (api_key and index_name):
        st.write("Please provide your Pinecone API Key and Index Name.")
    elif not docs:
        # Nothing to embed; avoid reporting a "successful" upload of 0 vectors.
        st.warning("No text could be extracted from this PDF, so nothing was uploaded.")
    else:
        # Streamlit re-executes this script on every widget interaction.
        # Remember what was last uploaded in this session so a rerun does not
        # embed and upsert the same chunks again.
        fingerprint = (
            hashlib.sha256(text.encode("utf-8", "surrogatepass")).hexdigest(),
            index_name,
        )
        if st.session_state.get("uploaded_fingerprint") != fingerprint:
            try:
                # Initialize Pinecone Index
                pc = Pinecone(api_key=api_key)
                # The handle itself is not used below; the call is kept from
                # the original flow (presumably it surfaces a bad index name
                # early -- TODO confirm).
                index = pc.Index(index_name)
                embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
                # PineconeVectorStore.from_texts is not given the key directly
                # here, so it is exported through the environment.
                # NOTE(review): os.environ is process-wide; if several user
                # sessions share this process they share the key -- confirm
                # that is acceptable.
                os.environ['PINECONE_API_KEY'] = api_key
                docsearch = PineconeVectorStore.from_texts(docs, embeddings, index_name=index_name)
            except Exception as exc:
                # UI boundary: show a readable message instead of a traceback.
                st.error(f"Upload to Pinecone failed: {exc}")
            else:
                st.session_state["uploaded_fingerprint"] = fingerprint

        # Shown after a fresh upload and on later reruns of the same upload.
        if st.session_state.get("uploaded_fingerprint") == fingerprint:
            st.write(f"Successfully uploaded {len(docs)} vectors to Pinecone Index '{index_name}'")