AKnvd committed on
Commit
2693914
1 Parent(s): c15d82a
Files changed (3) hide show
  1. app.py +27 -18
  2. bg.png +0 -0
  3. bkgnd1.jpg +0 -0
app.py CHANGED
@@ -17,24 +17,31 @@ COMPLETIONS_MODEL = "gpt-4"
17
  openai.api_key = "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"
18
  COMPLETIONS_API_PARAMS = {
19
  "temperature": 0.0,
20
- "max_tokens": 300,
21
  "model": COMPLETIONS_MODEL,
22
  }
23
 
24
  @st.cache_data
25
  def run_on_chunks(data):
26
  response = []
27
- chunk = data_chunk(data , chunk_size = 1000)
28
- for i in chunk:
 
 
 
 
 
29
  response.append(GPT_4_API(i))
 
 
30
  return response
31
-
32
  def data_chunk(lst , chunk_size):
33
  return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
34
-
35
  def check_file_format(filename):
36
  return filename.rsplit('.', 1)[1].lower()
37
-
38
  def pdf_to_images(pdf_file):
39
  images = []
40
  with fitz.open(pdf_file) as doc:
@@ -43,7 +50,7 @@ def pdf_to_images(pdf_file):
43
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
44
  images.append(img)
45
  return images
46
-
47
  def OCR(pdf_file):
48
  pdf_reader = PyPDF2.PdfReader(pdf_file)
49
  pdf_writer = PyPDF2.PdfWriter()
@@ -65,7 +72,6 @@ def OCR(pdf_file):
65
  pdf_file.close()
66
  return text
67
 
68
-
69
  def txt_extraction(file_path):
70
  file_contents = file_path.read().decode("utf-8")
71
  return file_contents
@@ -92,8 +98,7 @@ def download_docx(text):
92
  )
93
 
94
  def GPT_4_API(data):
95
- print("request_send")
96
- header = """ create 20 question and answeres from this paragraph, Answer should strictly be exact lines from this paragraph without question answer numbers"."\n\nContext:\n"""
97
  QA = header + "".join(str(list(data)))
98
  response = openai.ChatCompletion.create(messages = [{"role": "user", "content": f"{QA}"},],**COMPLETIONS_API_PARAMS)
99
  return response["choices"][0]["message"]["content"]
@@ -127,18 +132,18 @@ def set_png_as_page_bg(png_file):
127
 
128
  def Extract_pdf_content(pdf_name):
129
 
130
- page_text = []
131
  pdf_reader = PyPDF2.PdfReader(pdf_name)
132
  num_pages = len(pdf_reader.pages)
133
 
134
  for page in range(num_pages):
135
  pdf_page = pdf_reader.pages[page]
136
- page_text.append(pdf_page.extract_text())
137
-
138
- return page_text[0]
139
 
140
  def process(uploaded_file):
141
- st.write("Filename:", uploaded_file.name)
142
  data = Extract_pdf_content(uploaded_file)
143
  return data
144
 
@@ -147,18 +152,22 @@ if __name__=="__main__":
147
  pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
148
  PAGE_CONFIG = {"page_title":"StColab.io","page_icon":":smiley:","layout":"centered"}
149
  st.set_page_config(**PAGE_CONFIG)
150
- main_bg = 'bg.png'
151
  set_png_as_page_bg(main_bg)
152
 
153
- st.title("pdf data extraction web application")
154
- uploaded_file = st.file_uploader("Upload a PDF file", type = ["pdf","docx","txt"])
155
 
156
  if uploaded_file is not None:
157
 
158
  if check_file_format(uploaded_file.name) == "pdf":
159
  data = process(uploaded_file)
 
 
160
  if data == '':
 
161
  data = OCR(uploaded_file)
 
162
 
163
  elif check_file_format(uploaded_file.name) == "docx":
164
  data = docx_extraction(uploaded_file)
17
  openai.api_key = "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"
18
  COMPLETIONS_API_PARAMS = {
19
  "temperature": 0.0,
20
+ "max_tokens": 1000,
21
  "model": COMPLETIONS_MODEL,
22
  }
23
 
24
  @st.cache_data
25
  def run_on_chunks(data):
26
  response = []
27
+ chunk = data_chunk(data , chunk_size = 2500)
28
+ num = 0
29
+ text = st.empty()
30
+
31
+ for i in chunk:
32
+ num = num + 1
33
+ text.write(f"{num}th API request sent out of {len(chunk)}")
34
  response.append(GPT_4_API(i))
35
+ text.empty()
36
+
37
  return response
38
+
39
  def data_chunk(lst , chunk_size):
40
  return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
41
+
42
  def check_file_format(filename):
43
  return filename.rsplit('.', 1)[1].lower()
44
+
45
  def pdf_to_images(pdf_file):
46
  images = []
47
  with fitz.open(pdf_file) as doc:
50
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
51
  images.append(img)
52
  return images
53
+
54
  def OCR(pdf_file):
55
  pdf_reader = PyPDF2.PdfReader(pdf_file)
56
  pdf_writer = PyPDF2.PdfWriter()
72
  pdf_file.close()
73
  return text
74
 
 
75
  def txt_extraction(file_path):
76
  file_contents = file_path.read().decode("utf-8")
77
  return file_contents
98
  )
99
 
100
  def GPT_4_API(data):
101
+ header = """ create 12 question and answeres from given paragraph dont use numbers to point out questions and answers, Answers should strictly be exact lines from this paragraph"."\n\nContext:\n"""
 
102
  QA = header + "".join(str(list(data)))
103
  response = openai.ChatCompletion.create(messages = [{"role": "user", "content": f"{QA}"},],**COMPLETIONS_API_PARAMS)
104
  return response["choices"][0]["message"]["content"]
132
 
133
  def Extract_pdf_content(pdf_name):
134
 
135
+ page_text = ""
136
  pdf_reader = PyPDF2.PdfReader(pdf_name)
137
  num_pages = len(pdf_reader.pages)
138
 
139
  for page in range(num_pages):
140
  pdf_page = pdf_reader.pages[page]
141
+ page_text = page_text + pdf_page.extract_text()
142
+
143
+ return page_text
144
 
145
  def process(uploaded_file):
146
+
147
  data = Extract_pdf_content(uploaded_file)
148
  return data
149
 
152
  pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
153
  PAGE_CONFIG = {"page_title":"StColab.io","page_icon":":smiley:","layout":"centered"}
154
  st.set_page_config(**PAGE_CONFIG)
155
+ main_bg = 'bkgnd1.jpg'
156
  set_png_as_page_bg(main_bg)
157
 
158
+ st.title("Advanced Text processing Tool")
159
+ uploaded_file = st.file_uploader("Upload a Files here", type = ["pdf","docx","txt"])
160
 
161
  if uploaded_file is not None:
162
 
163
  if check_file_format(uploaded_file.name) == "pdf":
164
  data = process(uploaded_file)
165
+
166
+ text = st.empty()
167
  if data == '':
168
+ text.write("applying OCR")
169
  data = OCR(uploaded_file)
170
+ text.empty()
171
 
172
  elif check_file_format(uploaded_file.name) == "docx":
173
  data = docx_extraction(uploaded_file)
bg.png DELETED
Binary file (192 kB)
bkgnd1.jpg ADDED