Spaces:

yashsarnaik23
/

upsert_to_pinecone_sentencetransformers

Sleeping

App Files Files Community

yashsarnaik23 committed on Oct 4, 2024

Commit

d8882ca

verified ·

1 Parent(s): 14af674

Upload 2 files

Browse files

Files changed (2) hide show

multi.py +125 -0
requirements.txt +15 -0

multi.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import streamlit as st
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from sentence_transformers import SentenceTransformer
+import os
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import ChatPromptTemplate
+from dotenv import load_dotenv
+from pinecone import Pinecone, ServerlessSpec
+import time
+from langchain_community.vectorstores import Pinecone as LangchainPinecone
+from PyPDF2 import PdfReader
+from langchain.schema import Document
+st.set_page_config(
+    page_title="Upsert to Pinecone",
+    page_icon="📤")
+def load_css(file_path):
+    with open(file_path, "r") as f:
+        return f"<style>{f.read()}</style>"
+# Read style.css (relative path, so resolved against the working directory)
+# and inject it into the page as a raw <style> element.
+css = load_css("style.css")
+st.markdown(css, unsafe_allow_html=True)
+# Load environment variables via python-dotenv.
+load_dotenv()
+# NOTE(review): the title string contains literal "\r" escapes; presumably
+# line breaks were intended - confirm how this renders.
+st.title('Upsert to Pinecone using \r paraphrase-multilingual-mpnet-base-v2\rEmbeddings📤')
+# PDF file uploader; restricted to the "pdf" type. The result is compared
+# against None further down before any processing happens.
+uploaded_file = st.file_uploader("Choose a PDF file📁", type="pdf")
+def extract_text_from_pdf(pdf_file):
+    pdf_reader = PdfReader(pdf_file)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    return text
+def get_text_chunks(text):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=100,
+    )
+    chunks = text_splitter.split_text(text)
+    return chunks
+def get_embeddings(text_chunks):
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
+    return embeddings.embed_documents(text_chunks)
+def get_vectorstore(text_chunks, index_name):
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
+    # Create Document objects
+    documents = [Document(page_content=chunk) for chunk in text_chunks]
+    # Create and return the vector store
+    vectorstore = LangchainPinecone.from_documents(
+        documents,
+        embeddings,
+        index_name=index_name
+    )
+    return vectorstore
+# Pinecone setup
+key = st.text_input("Enter your Pinecone API key:", type="password")
+index_name = st.text_input("Enter your Pinecone Index name:")
+if key and index_name:
+    # Set the Pinecone API key as an environment variable
+    os.environ['PINECONE_API_KEY'] = key
+    # Initialize Pinecone
+    pc = Pinecone()
+    spec = ServerlessSpec(
+    cloud="aws", region="us-east-1"
+    )
+    # Check if the index exists, if not create it
+    if index_name not in pc.list_indexes().names():
+        pc.create_index(
+            name=index_name,
+            dimension=768,  # Dimension for paraphrase-multilingual-mpnet-base-v2 model
+            metric='cosine',
+            spec=spec
+        )
+        st.info(f"Created new Pinecone index: {index_name}")
+    # Get the index
+    index = pc.Index(index_name)
+    if uploaded_file is not None:
+        text = extract_text_from_pdf(uploaded_file)
+        text_chunks = get_text_chunks(text)
+        if st.button("Generate Embeddings and Create Vectorstore"):
+            with st.spinner("Processing..."):
+                embeddings = get_embeddings(text_chunks)
+                vectorstore = get_vectorstore(text_chunks, index_name)
+            st.success("Embeddings generated and vectorstore created successfully!")
+            st.write(f"Number of chunks: {len(text_chunks)}")
+            st.write(f"Embedding dimension: {len(embeddings[0])}")
+    # You can add more functionality here, such as querying the vectorstore
+else:
+    st.warning("Please enter your Pinecone API key and Index Name to proceed.")
+footer = """
+1. Upload the PDF file you want to vectorize and upload to the Pinecone Database.
+2. Enter your Pinecone API key.
+3. Enter your Pinecone Index name.
+4. Selected environment by default is <h3> us-east-1 </h3> if you want a different one make changes in app.py.
+"""
+st.markdown(footer, unsafe_allow_html=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+streamlit==1.36.0
+langchain==0.3.1
+langchain-community==0.3.0
+langchain-google-genai==2.0.0
+google-generativeai==0.7.2
+langchain-core==0.3.6
+pinecone==5.3.1
+sentence-transformers==3.1.1
+pypdf==5.0.0
+PyPDF2==3.0.1
+langchain_chroma==0.1.4
+langchainhub==0.1.20
+langchain_experimental==0.3.1
+rapidocr-onnxruntime==1.3.24
+faiss-cpu==1.8.0