Kushwanth Chowday Kandala
committed on
insert uploaded document to pinecone
app.py
CHANGED
@@ -5,7 +5,7 @@ import numpy as np
 import pandas as pd
 from io import StringIO
 import PyPDF2
-from tqdm import tqdm
+from tqdm.auto import tqdm
 import math
 from transformers import pipeline
 # import json
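The only change in this hunk is the switch to tqdm.auto, which picks the progress-bar frontend at import time (a notebook widget under Jupyter, a plain console bar otherwise), so the upsert loop added later in this commit renders sensibly in either environment. A minimal usage sketch, not part of the commit:

import time
from tqdm.auto import tqdm  # resolves to the notebook widget or the console bar

for _ in tqdm(range(3), desc="demo"):
    time.sleep(0.1)  # stand-in for per-batch work such as encode + upsert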
@@ -150,26 +150,12 @@ def chat_actions():
 if "chat_history" not in st.session_state:
     st.session_state["chat_history"] = []

-
 st.chat_input("show me the contents of ML paper published on xxx with article no. xx?", on_submit=chat_actions, key="chat_input")

 for i in st.session_state["chat_history"]:
     with st.chat_message(name=i["role"]):
         st.write(i["content"])

-### Creating a Index(Pinecone Vector Database)
-# %%writefile .env
-# PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
-# PINECONE_ENV=os.getenv("PINECONE_ENV")
-# PINECONE_ENVIRONMENT=os.getenv("PINECONE_ENVIRONMENT")
-
-# import os
-# import pinecone
-
-# from pinecone import Index, GRPCIndex
-# pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
-# st.text(pinecone)
-
 def print_out(pages):
     for i in range(len(pages)):
         text = pages[i].extract_text().strip()
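The commented-out block removed here was leftover from the older v2 pinecone-client API (pinecone.init with an environment). The new code path instead calls connect_pinecone() and get_pinecone_semantic_index(), which are defined elsewhere in app.py and not shown in this diff. A hedged sketch of what they plausibly look like with the current Pinecone client follows; the index name "semantic-search" and the environment-variable name are assumptions:

import os
from pinecone import Pinecone

def connect_pinecone():
    # Assumption: the API key comes from the environment, as the removed
    # comments suggested (PINECONE_API_KEY).
    return Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

def get_pinecone_semantic_index(pinecone):
    # Hypothetical index name; the real name lives elsewhere in app.py.
    return pinecone.Index("semantic-search")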
@@ -184,9 +170,11 @@ def combine_text(pages):
     p = math.pow(1024, 2)
     mbsize = round(len(bytesize) / p, 2)
     st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
+    return concatenates_text

 def create_embeddings():
     # Get the uploaded file
+    inputtext = ""
     with st.sidebar:
         uploaded_files = st.session_state["uploaded_files"]
         for uploaded_file in uploaded_files:
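Only the tail of combine_text() is visible in this hunk; the new return line is what lets create_embeddings() pick up the concatenated text. Based on the variables in view, a hedged reconstruction of the whole function might look like the sketch below; the page-joining logic and the UTF-8 byte count are assumptions, only the size report and the return statement appear in the diff:

import math
import streamlit as st

def combine_text(pages):
    # Assumed: concatenate the extracted text of every PDF page.
    concatenates_text = "".join(page.extract_text().strip() for page in pages)
    # Assumed: measure the size of the UTF-8 encoded text in bytes.
    bytesize = concatenates_text.encode("utf-8")
    p = math.pow(1024, 2)
    mbsize = round(len(bytesize) / p, 2)
    st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
    return concatenates_text  # new in this commit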
@@ -194,14 +182,37 @@ def create_embeddings():
             reader = PyPDF2.PdfReader(uploaded_file)
             pages = reader.pages
             print_out(pages)
-            combine_text(pages)
+            inputtext = combine_text(pages)
+
+    # connect to pinecone index
+    pinecone = connect_pinecone()
+    index = get_pinecone_semantic_index(pinecone)
+
+    # The maximum metadata size per vector is 40KB
+    batch_size = 10000
+    for i in tqdm(range(0, len(inputtext), batch_size)):
+        # find end of batch
+        end = min(i + batch_size, len(inputtext))
+        # create ids batch
+        ids = [str(i) for i in range(i, end)]
+        # create metadata batch
+        metadata = [{"text": text} for text in inputtext[i:end]]
+        # create embeddings
+        xc = model.encode(inputtext[i:end])
+        # create records list for upsert
+        records = zip(ids, xc, metadata)
+        # upsert records
+        index.upsert(vectors=records)
+
+    with st.sidebar:
+        st.write("created vector embeddings!")
+        # check no of records in the index
+        st.write(f"{index.describe_index_stats()}")

-    st.write("created_embeddings")

     # Display the contents of the file
     # st.write(file_contents)

-
 with st.sidebar:
     st.markdown("""
     ***:red[Follow this steps]***
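The loop added above steps through the concatenated text 10,000 characters at a time, builds one id and one metadata entry per character of each slice, and calls model.encode() on the slice as a single string. A common variant, sketched below, chunks the text first and upserts one record per chunk, which keeps ids, embeddings, and metadata aligned and keeps each record's metadata well under the 40KB limit. The embedding model, index name, chunk size, and helper shapes are assumptions, not part of this commit:

import os
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# Assumed stand-ins for app.py's own model and Pinecone helpers.
model = SentenceTransformer("all-MiniLM-L6-v2")
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("semantic-search")  # hypothetical index name

def upsert_text(inputtext, chunk_size=1000, batch_size=64):
    # Split the concatenated PDF text into fixed-size character chunks,
    # one Pinecone record per chunk.
    chunks = [inputtext[i:i + chunk_size] for i in range(0, len(inputtext), chunk_size)]
    for start in tqdm(range(0, len(chunks), batch_size)):
        batch = chunks[start:start + batch_size]
        ids = [str(start + j) for j in range(len(batch))]
        embeddings = model.encode(batch).tolist()        # one vector per chunk
        metadata = [{"text": chunk} for chunk in batch]  # small, well under 40KB each
        index.upsert(vectors=list(zip(ids, embeddings, metadata)))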
@@ -234,5 +245,4 @@ with st.sidebar:
     # pages = reader.pages
     # print_out(pages)
     # combine_text(pages)
-    # promt_engineer(text)
-
+    # promt_engineer(text)
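Once the upsert finishes, the new sidebar message prints index.describe_index_stats() to confirm the record count. A short, hedged query example, reusing the same assumed model and index as the upsert sketch above, shows how to check that the stored chunks are actually retrievable:

import os
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# Same assumed model and index as the upsert sketch above.
model = SentenceTransformer("all-MiniLM-L6-v2")
index = Pinecone(api_key=os.getenv("PINECONE_API_KEY")).Index("semantic-search")

# Embed a question and fetch the closest stored chunks.
query_vector = model.encode("show me the contents of the ML paper").tolist()
results = index.query(vector=query_vector, top_k=3, include_metadata=True)
for match in results.matches:
    print(match.id, round(match.score, 3), match.metadata["text"][:80])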