Kushwanth Chowday Kandala committed on
Commit
7c92df9
1 Parent(s): 517f1a0

maximum metadata size

Browse files
Files changed (1) hide show
  1. app.py +9 -1
app.py CHANGED
@@ -172,6 +172,14 @@ def combine_text(pages):
172
  st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
173
  return concatenates_text
174
 
 
 
 
 
 
 
 
 
175
  def create_embeddings():
176
  # Get the uploaded file
177
  inputtext = ""
@@ -188,7 +196,7 @@ def create_embeddings():
188
  pinecone = connect_pinecone()
189
  index = get_pinecone_semantic_index(pinecone)
190
 
191
- # The maximum metadata size per vector is 40KB
192
  batch_size = 10000
193
  for i in tqdm(range(0, len(inputtext), batch_size)):
194
  # find end of batch
 
172
  st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
173
  return concatenates_text
174
 
175
+ def split_into_chunks(text, chunk_size):
176
+
177
+ chunks = []
178
+ for i in range(0, len(text), chunk_size):
179
+ chunks.append(text[i:i + chunk_size])
180
+
181
+ return chunks
182
+
183
  def create_embeddings():
184
  # Get the uploaded file
185
  inputtext = ""
 
196
  pinecone = connect_pinecone()
197
  index = get_pinecone_semantic_index(pinecone)
198
 
199
+ # The maximum metadata size per vector is 40KB (~40000 bytes), and each text character is 1 to 2 bytes, so roughly a batch size of 10000 to 40000 characters is given
200
  batch_size = 10000
201
  for i in tqdm(range(0, len(inputtext), batch_size)):
202
  # find end of batch