Spaces:
Sleeping
Sleeping
ANASDAVOODTK
committed on
Commit
•
ee165bf
1
Parent(s):
23f5ed9
new p
Browse files- app.py +165 -11
- bg.png +0 -0
- enlarged.pdf +0 -0
- requirements.txt +11 -1
app.py
CHANGED
@@ -1,15 +1,169 @@
|
|
1 |
-
import
|
2 |
import os
|
3 |
-
import
|
4 |
-
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
-
|
|
|
8 |
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
import os
|
3 |
+
import cv2
|
4 |
+
from PIL import Image
|
5 |
+
import pandas as pd
|
6 |
+
from io import BytesIO
|
7 |
+
import streamlit as st
|
8 |
+
import openai
|
9 |
+
import PyPDF2
|
10 |
+
import base64
|
11 |
+
import pypdfium2 as pdfium
|
12 |
+
from pdf2image import convert_from_path
|
13 |
+
import docx
|
14 |
+
from docx import Document
|
15 |
+
import fitz
|
16 |
+
import pytesseract
|
17 |
+
|
18 |
+
# Chat model used for every completion request made by this app.
COMPLETIONS_MODEL = "gpt-4"

# SECURITY: the API key must never be committed to source control. It is read
# from the OPENAI_API_KEY environment variable (e.g. a deployment secret).
# Any key that was previously hard-coded here is exposed in the repository
# history and must be revoked and rotated.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Keyword arguments forwarded verbatim to each chat-completion call.
COMPLETIONS_API_PARAMS = {
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}
|
25 |
+
|
26 |
+
def run_on_chunks(data):
    """Split *data* into 10000-item chunks and run GPT_4_API on each chunk.

    Returns a list holding one GPT_4_API result per chunk, in chunk order.
    """
    pieces = data_chunk(data, chunk_size=10000)
    print(pieces)
    return [GPT_4_API(piece) for piece in pieces]
|
33 |
+
|
34 |
+
def data_chunk(lst, chunk_size):
    """Split a sliceable sequence into consecutive pieces of *chunk_size* items.

    Args:
        lst: Any object supporting ``len()`` and slicing (str, list, ...).
        chunk_size: Maximum number of items per piece; must be positive.

    Returns:
        A list of slices of *lst*; the last slice may be shorter. An empty
        *lst* yields an empty list.

    Raises:
        ValueError: If *chunk_size* is zero or negative. A negative step would
            otherwise make ``range`` empty and silently drop all the data.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
|
36 |
+
|
37 |
+
def check_file_format(filename):
    """Return the lower-cased extension of *filename*, without the dot.

    Only the text after the last ``.`` is considered, so ``"a.tar.GZ"`` gives
    ``"gz"``. A name with no dot at all returns ``""`` instead of raising
    ``IndexError``.
    """
    _, dot, extension = filename.rpartition('.')
    return extension.lower() if dot else ""
|
39 |
+
|
40 |
+
def pdf_to_images(pdf_file):
    """Render each page of *pdf_file* to an RGB PIL image and return the list.

    The document is opened with ``fitz.open`` and closed again before the
    function returns.
    """
    def _page_to_image(page):
        # Rasterise without an alpha channel so the raw samples are plain RGB.
        pixmap = page.get_pixmap(alpha=False)
        return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

    with fitz.open(pdf_file) as document:
        return [_page_to_image(page) for page in document]
|
48 |
+
|
49 |
+
def OCR(pdf_file):
    """Extract text from a PDF by rasterising its pages and running Tesseract.

    Every page is scaled by 2 in the PDF, written to a scratch file, rendered
    to images via ``pdf_to_images``, upsampled by 2 again, and passed to
    ``pytesseract.image_to_string``.

    Args:
        pdf_file: A file object accepted by ``PyPDF2.PdfReader``. It is closed
            before returning (as in the original implementation).

    Returns:
        The concatenated OCR text of all pages, in page order.
    """
    import tempfile  # local import: only this function needs it

    pdf_reader = PyPDF2.PdfReader(pdf_file)
    pdf_writer = PyPDF2.PdfWriter()
    for page in pdf_reader.pages:
        page.scale_by(2)
        pdf_writer.add_page(page)

    # Use a private scratch directory rather than a fixed 'enlarged.pdf' in the
    # working directory, so concurrent sessions cannot overwrite each other's
    # file and nothing is left behind afterwards.
    with tempfile.TemporaryDirectory() as scratch_dir:
        enlarged_path = os.path.join(scratch_dir, 'enlarged.pdf')
        with open(enlarged_path, 'wb') as f:
            pdf_writer.write(f)
        images = pdf_to_images(enlarged_path)

    page_texts = []
    for image in images:
        size = (image.width * 2, image.height * 2)
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        image = image.resize(size, Image.LANCZOS)
        page_texts.append(pytesseract.image_to_string(image))
    text = ''.join(page_texts)

    print(text)
    pdf_file.close()
    return text
|
70 |
+
|
71 |
+
|
72 |
+
def txt_extraction(file_path):
    """Read a binary file-like object and return its content as text.

    Args:
        file_path: Despite the name, an open binary file-like object (it must
            provide ``read()`` returning ``bytes``), not a path string.

    Returns:
        The decoded text. The ``utf-8-sig`` codec is used so that a leading
        UTF-8 byte-order mark (written by many Windows editors) is stripped
        instead of leaking into the text as ``\\ufeff``; BOM-less UTF-8 decodes
        exactly as before.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    raw_bytes = file_path.read()
    return raw_bytes.decode("utf-8-sig")
|
75 |
+
|
76 |
+
def docx_extraction(path):
    """Return the text of every paragraph in a .docx file, newline-separated.

    *path* is passed straight to ``docx.Document``.
    """
    document = docx.Document(path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
|
82 |
+
|
83 |
+
def download_docx(text):
    """Render a Streamlit download button serving *text* as a .docx file.

    The text is placed in a single paragraph of a new document, which is
    saved to an in-memory buffer and handed to ``st.download_button``.
    """
    word_doc = Document()
    word_doc.add_paragraph(text)

    buffer = BytesIO()
    word_doc.save(buffer)
    buffer.seek(0)  # rewind so the button reads the file from the start

    button_options = {
        "label": "Download as .docx",
        "data": buffer,
        "file_name": "document.docx",
        "mime": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
    st.download_button(**button_options)
|
95 |
+
|
96 |
+
@st.cache_data()
def GPT_4_API(data):
    """Ask the chat model to produce question/answer pairs for *data*.

    Args:
        data: The context to quiz on. Normally a ``str`` chunk; any other
            iterable has its items converted with ``str`` and concatenated.

    Returns:
        The content string of the first choice in the chat-completion response.

    The result is memoised by ``st.cache_data``.
    """
    print("request_send")
    header = """ create 20 question and answeres from this paragraph, Answer should strictly be exact lines from this paragraph"."\n\nContext:\n"""
    # Send the text itself. The previous ``"".join(str(list(data)))`` sent the
    # repr of a list of single characters ("['a', 'b', ...]"), which inflates
    # the prompt several-fold and corrupts the context the model sees.
    context = data if isinstance(data, str) else "".join(map(str, data))
    QA = header + context
    # NOTE(review): ``openai.ChatCompletion`` is the pre-1.0 client interface;
    # confirm the installed ``openai`` version, since requirements.txt does not
    # pin it.
    response = openai.ChatCompletion.create(
        messages=[{"role": "user", "content": QA}],
        **COMPLETIONS_API_PARAMS,
    )
    return response["choices"][0]["message"]["content"]
|
103 |
+
|
104 |
+
def my_text_editor(default_text, key, height=800):
    """Show a Streamlit text area and return its current content.

    *key* is passed as the first positional argument of ``st.text_area``,
    *default_text* as its initial value, and *height* as its height.
    """
    return st.text_area(key, value=default_text, height=height)
|
107 |
+
|
108 |
+
def get_base64_of_bin_file(bin_file):
    """Return the base64 encoding (as ``str``) of the file at path *bin_file*."""
    with open(bin_file, 'rb') as handle:
        encoded = base64.b64encode(handle.read())
    return encoded.decode()
|
112 |
+
|
113 |
+
def set_png_as_page_bg(png_file):
    """Use the PNG at *png_file* as the Streamlit app background.

    The image is base64-encoded into a ``data:`` URL inside a ``<style>``
    block, which is injected with ``st.markdown(..., unsafe_allow_html=True)``.
    """
    encoded_png = get_base64_of_bin_file(png_file)
    css_template = '''
<style>
.stApp {
background-image: url("data:image/png;base64,%s");
background-size: cover;
}
</style>
'''
    st.markdown(css_template % encoded_png, unsafe_allow_html=True)
|
125 |
+
|
126 |
+
def Extract_pdf_content(pdf_name):
    """Return the extracted text of the first page of a PDF.

    Args:
        pdf_name: A path or file object accepted by ``PyPDF2.PdfReader``.

    Returns:
        The result of ``extract_text()`` on page 0, or ``""`` when the
        document has no pages (previously an ``IndexError``).

    NOTE(review): only the first page has ever been returned to callers, even
    though the original loop extracted every page and then discarded the rest.
    That contract is kept here; confirm whether whole-document extraction was
    intended before widening it.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_name)
    pages = pdf_reader.pages
    if len(pages) == 0:
        return ""
    # Extract only the page that is actually returned; the other pages' text
    # was never used.
    return pages[0].extract_text()
|
137 |
+
|
138 |
+
def process(uploaded_file):
    """Display the uploaded file's name, then return its extracted PDF text.

    The text comes from ``Extract_pdf_content(uploaded_file)``.
    """
    st.write("Filename:", uploaded_file.name)
    return Extract_pdf_content(uploaded_file)
|
142 |
+
|
143 |
+
if __name__=="__main__":
    # Only override the tesseract binary location when the Windows install
    # actually exists; on other hosts pytesseract falls back to its default
    # lookup instead of pointing at a path that cannot exist there.
    windows_tesseract = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
    if os.path.isfile(windows_tesseract):
        pytesseract.pytesseract.tesseract_cmd = windows_tesseract

    PAGE_CONFIG = {"page_title":"StColab.io","page_icon":":smiley:","layout":"centered"}
    st.set_page_config(**PAGE_CONFIG)
    main_bg = 'bg.png'
    set_png_as_page_bg(main_bg)

    st.title("pdf data extraction web application")
    uploaded_file = st.file_uploader("Upload a PDF file", type = ["pdf","docx","txt"])

    text = ""
    if uploaded_file is not None:
        # Compute the extension once and dispatch on it.
        file_format = check_file_format(uploaded_file.name)

        if file_format == "pdf":
            data = process(uploaded_file)
            # Fall back to OCR whenever no usable text layer was found
            # (None, empty, or whitespace-only), not just for exactly " ".
            if not data or not data.strip():
                data = OCR(uploaded_file)

        elif file_format == "docx":
            data = docx_extraction(uploaded_file)

        else:
            data = txt_extraction(uploaded_file)

        response = run_on_chunks(data)
        if response:
            # Show the output for every chunk; previously only response[0] was
            # displayed and the remaining (already paid-for) results were lost.
            text = my_text_editor("\n\n".join(response), "text-editor", height=800)
            download_docx(text)
        else:
            # Empty input yields no chunks; avoid IndexError on response[0].
            st.warning("No text could be extracted from the uploaded file.")
|
bg.png
ADDED
enlarged.pdf
ADDED
Binary file (13.6 kB). View file
|
requirements.txt
CHANGED
@@ -1 +1,11 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy
|
2 |
+
opencv-python
|
3 |
+
Pillow
|
4 |
+
pandas
|
5 |
+
streamlit
|
6 |
+
openai
|
7 |
+
PyPDF2
|
8 |
+
pdf2image
|
9 |
+
python-docx
|
10 |
+
PyMuPDF
|
11 |
+
pytesseract
|