Kushwanth Chowday Kandala committed on
Commit
f06193e
1 Parent(s): 8a3a5d7

Add combine_text functionality as preparation for chunking the data to fit the model limits

Browse files
Files changed (1) hide show
  1. app.py +12 -1
app.py CHANGED
@@ -6,6 +6,7 @@ import pandas as pd
6
  from io import StringIO
7
  import PyPDF2
8
  from tqdm import tqdm
 
9
  # import json
10
 
11
  # st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")
@@ -142,6 +143,16 @@ def print_out(pages):
142
  text = pages[i].extract_text().strip()
143
  st.write(f"Page {i} : {text}")
144
 
 
 
 
 
 
 
 
 
 
 
145
  with st.sidebar:
146
  st.markdown("""
147
  ***Follow this steps***
@@ -170,4 +181,4 @@ with st.sidebar:
170
  reader = PyPDF2.PdfReader(uploaded_file)
171
  pages = reader.pages
172
  print_out(pages)
173
-
 
6
  from io import StringIO
7
  import PyPDF2
8
  from tqdm import tqdm
9
+ import math
10
  # import json
11
 
12
  # st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")
 
143
  text = pages[i].extract_text().strip()
144
  st.write(f"Page {i} : {text}")
145
 
146
def combine_text(pages):
    """Concatenate the text of every page and report its length and size.

    The stripped text of each page is joined into one string, then the
    number of characters and the UTF-8 encoded size in megabytes are
    written to the Streamlit app.

    Args:
        pages: Iterable of page objects exposing ``extract_text()``; the
            caller passes ``PyPDF2.PdfReader(...).pages``.

    Returns:
        None. The summary is emitted through ``st.write``.
    """
    # Join once instead of growing a string with ``+=`` page by page.
    # ``or ""`` is a guard in case ``extract_text`` yields None for a page.
    page_texts = [(page.extract_text() or "").strip() for page in tqdm(pages)]
    concatenated_text = "".join(page_texts)

    # Measure the whole document, not just the last page, and take the
    # length of the encoded bytes: a ``bytes`` object cannot be divided.
    # This also keeps an empty ``pages`` from hitting an unbound variable.
    byte_count = len(concatenated_text.encode("utf-8"))
    mbsize = round(byte_count / (1024 ** 2), 2)
    st.write(f"There are {len(concatenated_text)} characters in the pdf with {mbsize}MB size")
155
+
156
  with st.sidebar:
157
  st.markdown("""
158
  ***Follow this steps***
 
181
  reader = PyPDF2.PdfReader(uploaded_file)
182
  pages = reader.pages
183
  print_out(pages)
184
+ combine_text(pages)