yashsarnaik23 committed on
Commit
d8882ca
·
verified ·
1 Parent(s): 14af674

Upload 2 files

Browse files
Files changed (2) hide show
  1. multi.py +125 -0
  2. requirements.txt +15 -0
multi.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain_community.document_loaders import PyPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_community.embeddings import HuggingFaceEmbeddings
5
+ from sentence_transformers import SentenceTransformer
6
+ import os
7
+ from langchain.chains import create_retrieval_chain
8
+ from langchain.chains.combine_documents import create_stuff_documents_chain
9
+ from langchain_core.prompts import ChatPromptTemplate
10
+ from dotenv import load_dotenv
11
+ from pinecone import Pinecone, ServerlessSpec
12
+ import time
13
+ from langchain_community.vectorstores import Pinecone as LangchainPinecone
14
+ from PyPDF2 import PdfReader
15
+ from langchain.schema import Document
16
+
17
# Set the Streamlit page title and page icon for this app.
st.set_page_config(
    page_title="Upsert to Pinecone",
    page_icon="📤")
20
+
21
def load_css(file_path):
    """Read a CSS file and return its contents wrapped in a ``<style>`` tag.

    Args:
        file_path: Path of the CSS file to read.

    Returns:
        The string ``"<style>" + file contents + "</style>"``.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        return f"<style>{f.read()}</style>"
24
+
25
# Load and inject CSS
# NOTE(review): "style.css" is a relative path, so it is resolved against the
# current working directory; load_css raises if the file is not found there.
css = load_css("style.css")
# load_css returns the stylesheet wrapped in a <style> tag, hence
# unsafe_allow_html=True.
st.markdown(css, unsafe_allow_html=True)

# Load environment variables
load_dotenv()

# NOTE(review): the title literal contains two "\r" (carriage return) escapes;
# presumably line breaks or spaces were intended - confirm how it renders.
st.title('Upsert to Pinecone using \r paraphrase-multilingual-mpnet-base-v2\rEmbeddings📤')

# PDF file uploader
# uploaded_file is compared against None further down before it is used.
uploaded_file = st.file_uploader("Choose a PDF file📁", type="pdf")
36
+
37
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: The PDF source handed to ``PdfReader`` (in this script, the
            object returned by the Streamlit file uploader).

    Returns:
        A single string with the extracted text of all pages joined together
        without any separator. Empty if no page yields text.
    """
    pdf_reader = PdfReader(pdf_file)

    # Join once instead of growing a string with ``+=`` per page, and treat a
    # page that yields no text as an empty string so it cannot raise TypeError.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
45
+
46
def get_text_chunks(text, chunk_size=1000, chunk_overlap=100):
    """Split text into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The text to split.
        chunk_size: ``chunk_size`` passed to the splitter. Defaults to 1000,
            the value this script has always used.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            100, the value this script has always used.

    Returns:
        A list of chunk strings; an empty list when ``text`` is empty.
    """
    # Nothing to split: skip constructing the splitter altogether.
    if not text:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)
54
+
55
def get_embeddings(
    text_chunks,
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
):
    """Embed a list of text chunks with a HuggingFace embedding model.

    Args:
        text_chunks: Sequence of strings to embed.
        model_name: Name of the model passed to ``HuggingFaceEmbeddings``.
            Defaults to the multilingual MPNet model this script has always
            used.

    Returns:
        The result of ``embed_documents`` for ``text_chunks`` (one embedding
        per chunk); an empty list when ``text_chunks`` is empty.
    """
    # Avoid loading the model when there is nothing to embed.
    if not text_chunks:
        return []

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings.embed_documents(text_chunks)
58
+
59
def get_vectorstore(text_chunks, index_name):
    """Store the given text chunks in a Pinecone index via LangChain.

    Each chunk is wrapped in a ``Document`` and handed, together with a
    HuggingFace embedding model, to ``LangchainPinecone.from_documents``.

    Args:
        text_chunks: Iterable of chunk strings to store.
        index_name: Name of the Pinecone index to write to.

    Returns:
        The vector store object returned by ``from_documents``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )

    # One Document per chunk, carrying only the chunk text.
    docs = []
    for piece in text_chunks:
        docs.append(Document(page_content=piece))

    return LangchainPinecone.from_documents(docs, embedder, index_name=index_name)
73
+
74
# Pinecone setup
key = st.text_input("Enter your Pinecone API key:", type="password")
index_name = st.text_input("Enter your Pinecone Index name:")

if key and index_name:
    # Pinecone() below is constructed without arguments, so the key is
    # supplied through this environment variable.
    os.environ['PINECONE_API_KEY'] = key

    # Initialize Pinecone
    pc = Pinecone()
    spec = ServerlessSpec(
        cloud="aws", region="us-east-1"
    )

    # Check if the index exists, if not create it
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=768,  # Dimension for paraphrase-multilingual-mpnet-base-v2 model
            metric='cosine',
            spec=spec
        )
        # A freshly created index is not usable immediately: poll until it
        # reports ready, but give up after two minutes instead of hanging.
        deadline = time.monotonic() + 120
        while not pc.describe_index(index_name).status['ready']:
            if time.monotonic() > deadline:
                st.error(f"Timed out waiting for Pinecone index to become ready: {index_name}")
                st.stop()
            time.sleep(1)
        st.info(f"Created new Pinecone index: {index_name}")

    # Get the index
    index = pc.Index(index_name)

    if uploaded_file is not None:
        if st.button("Generate Embeddings and Create Vectorstore"):
            with st.spinner("Processing..."):
                # Parse the PDF only when the user asks for it, not on every
                # Streamlit rerun.
                text = extract_text_from_pdf(uploaded_file)
                text_chunks = get_text_chunks(text)

                if text_chunks:
                    # get_vectorstore embeds every chunk itself, so embed just
                    # the first chunk here to report the embedding dimension
                    # instead of embedding the whole document twice.
                    embeddings = get_embeddings(text_chunks[:1])
                    vectorstore = get_vectorstore(text_chunks, index_name)

            if not text_chunks:
                # e.g. a scanned PDF with no text layer: nothing to embed.
                st.error("No extractable text was found in the uploaded PDF.")
            else:
                st.success("Embeddings generated and vectorstore created successfully!")
                st.write(f"Number of chunks: {len(text_chunks)}")
                st.write(f"Embedding dimension: {len(embeddings[0])}")

                # You can add more functionality here, such as querying the vectorstore
else:
    st.warning("Please enter your Pinecone API key and Index Name to proceed.")
117
+
118
# Usage instructions rendered at the bottom of the page.
# NOTE(review): step 4 tells the user to edit app.py, but the region
# ("us-east-1") is set in this script, which this diff adds as multi.py -
# confirm which file name is correct.
footer = """
1. Upload the PDF file you want to vectorize and upload to the Pinecone Database.
2. Enter your Pinecone API key.
3. Enter your Pinecone Index name.
4. Selected environment by default is <h3> us-east-1 </h3> if you want a different one make changes in app.py.
"""

# The footer embeds an <h3> tag, hence unsafe_allow_html=True.
st.markdown(footer, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.36.0
2
+ langchain==0.3.1
3
+ langchain-community==0.3.0
4
+ langchain-google-genai==2.0.0
5
+ google-generativeai==0.7.2
6
+ langchain-core==0.3.6
7
+ pinecone==5.3.1
8
+ sentence-transformers==3.1.1
9
+ pypdf==5.0.0
10
+ PyPDF2==3.0.1
11
+ langchain_chroma==0.1.4
12
+ langchainhub==0.1.20
13
+ langchain_experimental==0.3.1
14
+ rapidocr-onnxruntime==1.3.24
15
+ faiss-cpu==1.8.0