ANASDAVOODTK committed on
Commit
ee165bf
1 Parent(s): 23f5ed9
Files changed (4) hide show
  1. app.py +165 -11
  2. bg.png +0 -0
  3. enlarged.pdf +0 -0
  4. requirements.txt +11 -1
app.py CHANGED
@@ -1,15 +1,169 @@
1
- import streamlit as st
2
  import os
3
- import pyperclip
4
- os.system('sudo apt-get install -y xclip')
5
- st.title("Copy text button code1")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- user_input = st.text_input("Enter your text here:")
 
8
 
9
- if st.button("Display Text"):
10
- st.write(user_input)
 
 
 
 
 
11
 
12
- if st.button("Copy to Clipboard"):
13
- pyperclip.copy(user_input)
14
- os.system("echo -n $'%s' | xsel -ib" % pyperclip.paste())
15
- st.write("Text copied to clipboard!")
 
 
 
 
 
1
+ import numpy as np
2
  import os
3
+ import cv2
4
+ from PIL import Image
5
+ import pandas as pd
6
+ from io import BytesIO
7
+ import streamlit as st
8
+ import openai
9
+ import PyPDF2
10
+ import base64
11
+ import pypdfium2 as pdfium
12
+ from pdf2image import convert_from_path
13
+ import docx
14
+ from docx import Document
15
+ import fitz
16
+ import pytesseract
17
+
18
# Name of the chat model requested for every completion.
COMPLETIONS_MODEL = "gpt-4"

# SECURITY: the secret key must never live in source control. It is read from
# the OPENAI_API_KEY environment variable instead. Any key that was previously
# committed in this file has to be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Keyword arguments forwarded unchanged to each chat-completion request.
COMPLETIONS_API_PARAMS = {
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}
25
+
26
def run_on_chunks(data):
    """Send *data* to the model in 10000-item slices and collect the replies.

    The slices are printed once before any request is made; the result is
    one model reply per slice, in slice order.
    """
    pieces = data_chunk(data, chunk_size=10000)
    print(pieces)
    return [GPT_4_API(piece) for piece in pieces]
33
+
34
def data_chunk(lst, chunk_size):
    """Split *lst* into consecutive slices of at most *chunk_size* items.

    Args:
        lst: Any sliceable sequence with a length (a string or a list).
        chunk_size: Maximum length of each slice; must be positive.

    Returns:
        A list of slices covering *lst* in order. The last slice may be
        shorter than *chunk_size*; an empty *lst* gives an empty list.

    Raises:
        ValueError: If *chunk_size* is zero or negative. A negative step
            would otherwise make ``range`` empty and silently drop all data.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
36
+
37
def check_file_format(filename):
    """Return the lower-cased extension of *filename*, without the dot.

    Args:
        filename: File name or path as a string.

    Returns:
        The text after the last ``.`` in lower case, or ``""`` when the name
        contains no dot (instead of raising ``IndexError``).
    """
    _, dot, extension = filename.rpartition(".")
    return extension.lower() if dot else ""
39
+
40
def pdf_to_images(pdf_file):
    """Render every page of *pdf_file* to a PIL image.

    The document is opened with ``fitz`` and closed again before returning.
    Each page is rasterised without an alpha channel and wrapped in an
    ``"RGB"`` image; images are returned in page order.
    """
    with fitz.open(pdf_file) as document:
        pixmaps = (sheet.get_pixmap(alpha=False) for sheet in document)
        return [
            Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
            for pm in pixmaps
        ]
48
+
49
def OCR(pdf_file):
    """Extract text from a PDF by rasterising its pages and running Tesseract.

    Every page is scaled by a factor of two, the enlarged document is written
    to a scratch file, rendered to images via ``pdf_to_images``, and each
    image is doubled in size again before being handed to ``pytesseract``.

    Args:
        pdf_file: Open binary file object containing the PDF. It is closed
            before this function returns, whether or not OCR succeeded.

    Returns:
        The recognised text of all pages concatenated in page order.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    try:
        reader = PyPDF2.PdfReader(pdf_file)
        writer = PyPDF2.PdfWriter()
        for page in reader.pages:
            page.scale_by(2)
            writer.add_page(page)

        # A private scratch directory replaces the fixed 'enlarged.pdf' in the
        # working directory, which concurrent sessions would overwrite.
        with tempfile.TemporaryDirectory() as workdir:
            enlarged_path = os.path.join(workdir, "enlarged.pdf")
            with open(enlarged_path, "wb") as out:
                writer.write(out)
            images = pdf_to_images(enlarged_path)

        fragments = []
        for image in images:
            doubled = (image.width * 2, image.height * 2)
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name.
            enlarged = image.resize(doubled, Image.LANCZOS)
            fragments.append(pytesseract.image_to_string(enlarged))
        return "".join(fragments)
    finally:
        pdf_file.close()
70
+
71
+
72
def txt_extraction(file_path):
    """Read an uploaded text file and return its contents as a string.

    Args:
        file_path: Despite the name, an open binary file object whose
            ``read()`` returns ``bytes``.

    Returns:
        The decoded text. The ``utf-8-sig`` codec is used so that a leading
        UTF-8 byte-order mark is dropped instead of ending up as ``\\ufeff``
        at the start of the text; files without a BOM decode exactly as
        with plain UTF-8.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    return file_path.read().decode("utf-8-sig")
75
+
76
def docx_extraction(path):
    """Return the text of a .docx document, one paragraph per line.

    *path* is handed straight to ``docx.Document``; the text of each
    paragraph is joined with newline characters.
    """
    document = docx.Document(path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
82
+
83
def download_docx(text):
    """Render a Streamlit download button offering *text* as a .docx file.

    The text becomes a single paragraph of a new document that is saved into
    an in-memory buffer; nothing is written to disk.
    """
    buffer = BytesIO()
    word_file = Document()
    word_file.add_paragraph(text)
    word_file.save(buffer)
    # Rewind so the button serves the document from its first byte.
    buffer.seek(0)

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    st.download_button(
        label="Download as .docx",
        data=buffer,
        file_name="document.docx",
        mime=docx_mime,
    )
95
+
96
@st.cache_data()
def GPT_4_API(data):
    """Ask the chat model to write questions and answers about *data*.

    Args:
        data: The context to quiz on. A string is used as-is; any other
            iterable has its items converted with ``str`` and concatenated.

    Returns:
        The text content of the first choice in the model's reply.

    The previous implementation built the prompt from ``str(list(data))``,
    which for a string chunk is the repr of a list of single characters
    (``"['T', 'h', 'e', ...]"``) rather than the text itself.
    """
    # NOTE(review): openai.ChatCompletion is the interface of the pre-1.0
    # openai SDK and requirements.txt does not pin a version -- confirm the
    # installed release still provides it.
    print("request_send")
    header = """ create 20 question and answeres from this paragraph, Answer should strictly be exact lines from this paragraph"."\n\nContext:\n"""
    if isinstance(data, str):
        context = data
    else:
        context = "".join(str(item) for item in data)
    QA = header + context
    response = openai.ChatCompletion.create(
        messages=[{"role": "user", "content": QA}],
        **COMPLETIONS_API_PARAMS,
    )
    return response["choices"][0]["message"]["content"]
103
+
104
def my_text_editor(default_text, key, height=800):
    """Show an editable text area and return its current contents.

    *key* is passed as the first positional argument of ``st.text_area``;
    *default_text* is the initial value and *height* the widget height.
    """
    return st.text_area(key, value=default_text, height=height)
107
+
108
def get_base64_of_bin_file(bin_file):
    """Return the contents of the file at *bin_file* as a base64 string."""
    with open(bin_file, 'rb') as handle:
        encoded = base64.b64encode(handle.read())
    return encoded.decode()
112
+
113
def set_png_as_page_bg(png_file):
    """Use the PNG at *png_file* as the background image of the Streamlit app.

    The image is embedded as a base64 data URL in a ``<style>`` rule for
    ``.stApp`` and injected through ``st.markdown`` with HTML enabled.
    """
    css_template = '''
<style>
.stApp {
background-image: url("data:image/png;base64,%s");
background-size: cover;
}
</style>
'''
    encoded_png = get_base64_of_bin_file(png_file)
    st.markdown(css_template % encoded_png, unsafe_allow_html=True)
125
+
126
def Extract_pdf_content(pdf_name):
    """Return the extracted text of the first page of a PDF.

    Args:
        pdf_name: Path or binary file object accepted by ``PyPDF2.PdfReader``.

    Returns:
        The text of page one, or ``""`` when the document has no pages
        (previously an ``IndexError``).

    Only the first page is extracted: the earlier version ran text
    extraction on every page and then discarded all but the first result.
    """
    # NOTE(review): pages after the first are ignored, as before. If the whole
    # document is meant to be processed, callers need to be updated together
    # with this function -- confirm the intent.
    pdf_reader = PyPDF2.PdfReader(pdf_name)
    if len(pdf_reader.pages) == 0:
        return ""
    return pdf_reader.pages[0].extract_text()
137
+
138
def process(uploaded_file):
    """Show the upload's file name and return its extracted PDF text.

    The name is written to the page with ``st.write``; extraction is
    delegated to ``Extract_pdf_content``.
    """
    st.write("Filename:", uploaded_file.name)
    return Extract_pdf_content(uploaded_file)
142
+
143
if __name__ == "__main__":
    # Point pytesseract at the Windows install only when that file really
    # exists; on other hosts the library's own default command is left alone.
    windows_tesseract = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
    if os.path.isfile(windows_tesseract):
        pytesseract.pytesseract.tesseract_cmd = windows_tesseract

    PAGE_CONFIG = {"page_title": "StColab.io", "page_icon": ":smiley:", "layout": "centered"}
    st.set_page_config(**PAGE_CONFIG)
    main_bg = 'bg.png'
    set_png_as_page_bg(main_bg)

    st.title("pdf data extraction web application")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf", "docx", "txt"])

    text = ""
    if uploaded_file is not None:
        file_format = check_file_format(uploaded_file.name)

        if file_format == "pdf":
            data = process(uploaded_file)
            # Fall back to OCR whenever no real text was extracted, not only
            # when the result is exactly one space.
            if not data or not data.strip():
                data = OCR(uploaded_file)
        elif file_format == "docx":
            data = docx_extraction(uploaded_file)
        else:
            data = txt_extraction(uploaded_file)

        response = run_on_chunks(data)
        if response:
            # Show the replies for every chunk; previously only the first was
            # displayed although all chunks had been sent to the model.
            text = my_text_editor("\n\n".join(response), "text-editor", height=800)
            download_docx(text)
        else:
            # Empty input produces no chunks; avoid indexing an empty list.
            st.warning("No text could be extracted from the uploaded file.")
bg.png ADDED
enlarged.pdf ADDED
Binary file (13.6 kB). View file
requirements.txt CHANGED
@@ -1 +1,11 @@
1
- pyperclip
 
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ opencv-python
3
+ Pillow
4
+ pandas
5
+ streamlit
6
+ openai
7
+ PyPDF2
8
+ pdf2image
9
+ python-docx
10
+ PyMuPDF
11
+ pytesseract