Kushwanth Chowday Kandala committed on
Commit 517f1a0
1 Parent(s): 065cb17

insert uploaded document to pinecone

Files changed (1): app.py (+30, -20)
app.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
 import pandas as pd
 from io import StringIO
 import PyPDF2
-from tqdm import tqdm
+from tqdm.auto import tqdm
 import math
 from transformers import pipeline
 # import json
@@ -150,26 +150,12 @@ def chat_actions():
 if "chat_history" not in st.session_state:
     st.session_state["chat_history"] = []
 
-
 st.chat_input("show me the contents of ML paper published on xxx with article no. xx?", on_submit=chat_actions, key="chat_input")
 
 for i in st.session_state["chat_history"]:
     with st.chat_message(name=i["role"]):
         st.write(i["content"])
 
-### Creating a Index(Pinecone Vector Database)
-# %%writefile .env
-# PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
-# PINECONE_ENV=os.getenv("PINECONE_ENV")
-# PINECONE_ENVIRONMENT=os.getenv("PINECONE_ENVIRONMENT")
-
-# import os
-# import pinecone
-
-# from pinecone import Index, GRPCIndex
-# pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
-# st.text(pinecone)
-
 def print_out(pages):
     for i in range(len(pages)):
         text = pages[i].extract_text().strip()
@@ -184,9 +170,11 @@ def combine_text(pages):
     p = math.pow(1024, 2)
     mbsize = round(len(bytesize) / p, 2)
     st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
+    return concatenates_text
 
 def create_embeddings():
     # Get the uploaded file
+    inputtext = ""
     with st.sidebar:
         uploaded_files = st.session_state["uploaded_files"]
         for uploaded_file in uploaded_files:
@@ -194,14 +182,37 @@ def create_embeddings():
             reader = PyPDF2.PdfReader(uploaded_file)
             pages = reader.pages
             print_out(pages)
-            combine_text(pages)
+            inputtext = combine_text(pages)
+
+    # connect to pinecone index
+    pinecone = connect_pinecone()
+    index = get_pinecone_semantic_index(pinecone)
+
+    # The maximum metadata size per vector is 40KB
+    batch_size = 10000
+    for i in tqdm(range(0, len(inputtext), batch_size)):
+        # find end of batch
+        end = min(i + batch_size, len(inputtext))
+        # create ids batch
+        ids = [str(i) for i in range(i, end)]
+        # create metadata batch
+        metadata = [{"text": text} for text in inputtext[i:end]]
+        # create embeddings
+        xc = model.encode(inputtext[i:end])
+        # create records list for upsert
+        records = zip(ids, xc, metadata)
+        # upsert records
+        index.upsert(vectors=records)
+
+    with st.sidebar:
+        st.write("created vector embeddings!")
+        # check no of records in the index
+        st.write(f"{index.describe_index_stats()}")
 
-    st.write("created_embeddings")
 
     # Display the contents of the file
     # st.write(file_contents)
 
-
 with st.sidebar:
     st.markdown("""
     ***:red[Follow this steps]***
@@ -234,5 +245,4 @@ with st.sidebar:
     # pages = reader.pages
     # print_out(pages)
     # combine_text(pages)
-    # promt_engineer(text)
-
+    # promt_engineer(text)
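
The new create_embeddings() code calls connect_pinecone() and get_pinecone_semantic_index(), which are defined elsewhere in app.py and are not part of this diff. A minimal sketch of what such helpers could look like, assuming the pre-3.0 pinecone-client that the removed pinecone.init() comment block was written against; the index name "semantic-search" and dimension 384 are illustrative assumptions, not taken from the commit:

# Hypothetical helpers assumed by create_embeddings(); not part of this commit.
import os
import pinecone  # pinecone-client < 3.0, matching the removed pinecone.init() snippet

def connect_pinecone():
    # Read credentials from the environment, as the removed .env comments suggest.
    pinecone.init(api_key=os.getenv("PINECONE_API_KEY"), environment=os.getenv("PINECONE_ENV"))
    return pinecone

def get_pinecone_semantic_index(pc, index_name="semantic-search", dimension=384):
    # Index name and dimension (384 = all-MiniLM-L6-v2 output size) are assumptions.
    if index_name not in pc.list_indexes():
        pc.create_index(name=index_name, dimension=dimension, metric="cosine")
    return pc.Index(index_name)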
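combine_text() now returns the whole PDF as a single concatenated string, and the upsert loop slices that string in character batches. A common alternative is to split the text into passages before embedding; a minimal sketch of such a splitter, with chunk sizes that are illustrative rather than taken from the commit:

def split_text(text, chunk_size=1000, overlap=100):
    # Fixed-size character windows with overlap; sizes are illustrative.
    chunks = []
    step = chunk_size - overlap
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk.strip():
            chunks.append(chunk)
    return chunks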
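The loop added in create_embeddings() encodes each batch with model.encode() and upserts (id, vector, metadata) records, keeping each metadata payload under Pinecone's 40KB-per-vector limit. A standalone sketch of that pattern, assuming a sentence-transformers model (the model object is not defined in this diff) and a list of text chunks such as the one produced by split_text() above:

# Standalone sketch of the encode -> upsert flow; names here are illustrative.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model

def upsert_chunks(index, chunks, batch_size=100):
    # chunks: list[str]; each chunk becomes one (id, vector, metadata) record
    for start in range(0, len(chunks), batch_size):
        end = min(start + batch_size, len(chunks))
        batch = chunks[start:end]
        ids = [str(i) for i in range(start, end)]
        metadata = [{"text": text} for text in batch]
        vectors = model.encode(batch).tolist()
        index.upsert(vectors=list(zip(ids, vectors, metadata)))

# Example wiring (hypothetical):
# index = get_pinecone_semantic_index(connect_pinecone())
# upsert_chunks(index, split_text(inputtext))
# st.write(index.describe_index_stats())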