Spaces:
Sleeping
Sleeping
3commit
Browse files- app.py +27 -18
- bg.png +0 -0
- bkgnd1.jpg +0 -0
app.py
CHANGED
@@ -17,24 +17,31 @@ COMPLETIONS_MODEL = "gpt-4"
|
|
17 |
import os  # local import for the environment lookup below; harmless if already imported

# SECURITY: a literal API key used to be committed on this line. Treat that key
# as compromised and revoke it. The key is now supplied at deploy time through
# the OPENAI_API_KEY environment variable and never stored in source control.
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
18 |
COMPLETIONS_API_PARAMS = {
|
19 |
"temperature": 0.0,
|
20 |
-
"max_tokens":
|
21 |
"model": COMPLETIONS_MODEL,
|
22 |
}
|
23 |
|
24 |
@st.cache_data
|
25 |
def run_on_chunks(data):
|
26 |
response = []
|
27 |
-
chunk = data_chunk(data , chunk_size =
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
29 |
response.append(GPT_4_API(i))
|
|
|
|
|
30 |
return response
|
31 |
-
|
32 |
def data_chunk(lst, chunk_size):
    """Split a sliceable sequence into consecutive pieces of ``chunk_size`` items.

    Args:
        lst: Any object supporting ``len()`` and slicing (e.g. ``str`` or ``list``).
        chunk_size: Maximum number of items per piece; must be a positive integer.

    Returns:
        A list of slices of ``lst`` in order. The last slice may be shorter.
        An empty ``lst`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative size would
            otherwise make ``range`` empty and silently drop all the data.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [lst[start:start + chunk_size] for start in range(0, len(lst), chunk_size)]
|
34 |
-
|
35 |
def check_file_format(filename):
    """Return the lower-cased extension of ``filename`` without the dot.

    Args:
        filename: File name as a string, e.g. ``"Report.PDF"``.

    Returns:
        The text after the last ``.`` in lower case (``"pdf"``), or an empty
        string when the name contains no dot at all.
    """
    # rpartition never raises: a name without a dot yields an empty separator,
    # whereas indexing the result of rsplit would raise IndexError.
    _, dot, extension = filename.rpartition(".")
    return extension.lower() if dot else ""
|
37 |
-
|
38 |
def pdf_to_images(pdf_file):
|
39 |
images = []
|
40 |
with fitz.open(pdf_file) as doc:
|
@@ -43,7 +50,7 @@ def pdf_to_images(pdf_file):
|
|
43 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
44 |
images.append(img)
|
45 |
return images
|
46 |
-
|
47 |
def OCR(pdf_file):
|
48 |
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
49 |
pdf_writer = PyPDF2.PdfWriter()
|
@@ -65,7 +72,6 @@ def OCR(pdf_file):
|
|
65 |
pdf_file.close()
|
66 |
return text
|
67 |
|
68 |
-
|
69 |
def txt_extraction(file_path):
    """Read an uploaded text file and return its contents as ``str``.

    Args:
        file_path: Despite the name, a binary file-like object: ``read()`` is
            called on it and the result is decoded.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is stripped, and
        bytes that are not valid UTF-8 become U+FFFD instead of raising.
    """
    raw_bytes = file_path.read()
    # "utf-8-sig" drops the BOM that some editors prepend; errors="replace"
    # keeps a single bad byte from aborting the whole upload.
    return raw_bytes.decode("utf-8-sig", errors="replace")
|
@@ -92,8 +98,7 @@ def download_docx(text):
|
|
92 |
)
|
93 |
|
94 |
def GPT_4_API(data):
    """Ask the chat model to generate question/answer pairs from one text chunk.

    Args:
        data: The chunk to use as context. A ``str`` is sent verbatim; any
            other iterable has its items converted to ``str`` and joined with
            newlines.

    Returns:
        The ``content`` of the first choice in the chat completion response.
    """
    header = """ create 20 question and answeres from this paragraph, Answer should strictly be exact lines from this paragraph without question answer numbers"."\n\nContext:\n"""
    # The previous str(list(data)) turned a string chunk into the repr of a
    # list of single characters ("['T', 'h', 'e', ...]"), which garbled the
    # context and multiplied its token count. Send the actual text instead.
    if isinstance(data, str):
        context = data
    else:
        context = "\n".join(str(part) for part in data)
    QA = header + context
    response = openai.ChatCompletion.create(
        messages=[{"role": "user", "content": QA}],
        **COMPLETIONS_API_PARAMS,
    )
    return response["choices"][0]["message"]["content"]
|
@@ -127,18 +132,18 @@ def set_png_as_page_bg(png_file):
|
|
127 |
|
128 |
def Extract_pdf_content(pdf_name):
|
129 |
|
130 |
-
page_text =
|
131 |
pdf_reader = PyPDF2.PdfReader(pdf_name)
|
132 |
num_pages = len(pdf_reader.pages)
|
133 |
|
134 |
for page in range(num_pages):
|
135 |
pdf_page = pdf_reader.pages[page]
|
136 |
-
page_text
|
137 |
-
|
138 |
-
return page_text
|
139 |
|
140 |
def process(uploaded_file):
    """Extract the text of an uploaded PDF.

    Thin wrapper that hands ``uploaded_file`` to ``Extract_pdf_content`` and
    returns whatever that function returns.
    """
    return Extract_pdf_content(uploaded_file)
|
144 |
|
@@ -147,18 +152,22 @@ if __name__=="__main__":
|
|
147 |
pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
|
148 |
PAGE_CONFIG = {"page_title":"StColab.io","page_icon":":smiley:","layout":"centered"}
|
149 |
st.set_page_config(**PAGE_CONFIG)
|
150 |
-
main_bg = '
|
151 |
set_png_as_page_bg(main_bg)
|
152 |
|
153 |
-
st.title("
|
154 |
-
uploaded_file = st.file_uploader("Upload a
|
155 |
|
156 |
if uploaded_file is not None:
|
157 |
|
158 |
if check_file_format(uploaded_file.name) == "pdf":
|
159 |
data = process(uploaded_file)
|
|
|
|
|
160 |
if data == '':
|
|
|
161 |
data = OCR(uploaded_file)
|
|
|
162 |
|
163 |
elif check_file_format(uploaded_file.name) == "docx":
|
164 |
data = docx_extraction(uploaded_file)
|
17 |
import os  # local import for the environment lookup below; harmless if already imported

# SECURITY: a literal API key used to be committed on this line. Treat that key
# as compromised and revoke it. The key is now supplied at deploy time through
# the OPENAI_API_KEY environment variable and never stored in source control.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Keyword arguments forwarded to every chat completion request.
COMPLETIONS_API_PARAMS = {
    "temperature": 0.0,
    "max_tokens": 1000,
    "model": COMPLETIONS_MODEL,
}
|
23 |
|
24 |
@st.cache_data
def run_on_chunks(data):
    """Send ``data`` to the model in 2500-item chunks and collect the replies.

    Args:
        data: Sliceable input (the extracted document text) passed to
            ``data_chunk`` with ``chunk_size=2500``.

    Returns:
        A list with one ``GPT_4_API`` result per chunk, in chunk order.
    """
    chunks = data_chunk(data, chunk_size=2500)
    total = len(chunks)
    status = st.empty()  # placeholder reused for the progress message
    responses = []
    try:
        for num, piece in enumerate(chunks, start=1):
            status.write(f"{num}th API request sent out of {total}")
            responses.append(GPT_4_API(piece))
    finally:
        # Clear the progress line even when a request raises, so a stale
        # "request sent" message is not left on the page.
        status.empty()
    return responses
|
38 |
+
|
39 |
def data_chunk(lst, chunk_size):
    """Split a sliceable sequence into consecutive pieces of ``chunk_size`` items.

    Args:
        lst: Any object supporting ``len()`` and slicing (e.g. ``str`` or ``list``).
        chunk_size: Maximum number of items per piece; must be a positive integer.

    Returns:
        A list of slices of ``lst`` in order. The last slice may be shorter.
        An empty ``lst`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative size would
            otherwise make ``range`` empty and silently drop all the data.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [lst[start:start + chunk_size] for start in range(0, len(lst), chunk_size)]
|
41 |
+
|
42 |
def check_file_format(filename):
    """Return the lower-cased extension of ``filename`` without the dot.

    Args:
        filename: File name as a string, e.g. ``"Report.PDF"``.

    Returns:
        The text after the last ``.`` in lower case (``"pdf"``), or an empty
        string when the name contains no dot at all.
    """
    # rpartition never raises: a name without a dot yields an empty separator,
    # whereas indexing the result of rsplit would raise IndexError.
    _, dot, extension = filename.rpartition(".")
    return extension.lower() if dot else ""
|
44 |
+
|
45 |
def pdf_to_images(pdf_file):
|
46 |
images = []
|
47 |
with fitz.open(pdf_file) as doc:
|
50 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
51 |
images.append(img)
|
52 |
return images
|
53 |
+
|
54 |
def OCR(pdf_file):
|
55 |
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
56 |
pdf_writer = PyPDF2.PdfWriter()
|
72 |
pdf_file.close()
|
73 |
return text
|
74 |
|
|
|
75 |
def txt_extraction(file_path):
    """Read an uploaded text file and return its contents as ``str``.

    Args:
        file_path: Despite the name, a binary file-like object: ``read()`` is
            called on it and the result is decoded.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is stripped, and
        bytes that are not valid UTF-8 become U+FFFD instead of raising.
    """
    raw_bytes = file_path.read()
    # "utf-8-sig" drops the BOM that some editors prepend; errors="replace"
    # keeps a single bad byte from aborting the whole upload.
    return raw_bytes.decode("utf-8-sig", errors="replace")
|
98 |
)
|
99 |
|
100 |
def GPT_4_API(data):
    """Ask the chat model to generate question/answer pairs from one text chunk.

    Args:
        data: The chunk to use as context. A ``str`` is sent verbatim; any
            other iterable has its items converted to ``str`` and joined with
            newlines.

    Returns:
        The ``content`` of the first choice in the chat completion response.
    """
    header = """ create 12 question and answeres from given paragraph dont use numbers to point out questions and answers, Answers should strictly be exact lines from this paragraph"."\n\nContext:\n"""
    # The previous str(list(data)) turned a string chunk into the repr of a
    # list of single characters ("['T', 'h', 'e', ...]"), which garbled the
    # context and multiplied its token count. Send the actual text instead.
    if isinstance(data, str):
        context = data
    else:
        context = "\n".join(str(part) for part in data)
    QA = header + context
    response = openai.ChatCompletion.create(
        messages=[{"role": "user", "content": QA}],
        **COMPLETIONS_API_PARAMS,
    )
    return response["choices"][0]["message"]["content"]
|
132 |
|
133 |
def Extract_pdf_content(pdf_name):
    """Return the extracted text of every page of a PDF, concatenated in order.

    Args:
        pdf_name: Passed straight to ``PyPDF2.PdfReader``.

    Returns:
        One string made of each page's ``extract_text()`` result with no
        separator between pages; an empty string when there are no pages.
    """
    reader = PyPDF2.PdfReader(pdf_name)
    return "".join(pdf_page.extract_text() for pdf_page in reader.pages)
|
144 |
|
145 |
def process(uploaded_file):
    """Extract the text of an uploaded PDF.

    Thin wrapper that hands ``uploaded_file`` to ``Extract_pdf_content`` and
    returns whatever that function returns.
    """
    return Extract_pdf_content(uploaded_file)
|
149 |
|
152 |
pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
|
153 |
PAGE_CONFIG = {"page_title":"StColab.io","page_icon":":smiley:","layout":"centered"}
|
154 |
st.set_page_config(**PAGE_CONFIG)
|
155 |
+
main_bg = 'bkgnd1.jpg'
|
156 |
set_png_as_page_bg(main_bg)
|
157 |
|
158 |
+
st.title("Advanced Text processing Tool")
|
159 |
+
uploaded_file = st.file_uploader("Upload a Files here", type = ["pdf","docx","txt"])
|
160 |
|
161 |
if uploaded_file is not None:
|
162 |
|
163 |
if check_file_format(uploaded_file.name) == "pdf":
|
164 |
data = process(uploaded_file)
|
165 |
+
|
166 |
+
text = st.empty()
|
167 |
if data == '':
|
168 |
+
text.write("applying OCR")
|
169 |
data = OCR(uploaded_file)
|
170 |
+
text.empty()
|
171 |
|
172 |
elif check_file_format(uploaded_file.name) == "docx":
|
173 |
data = docx_extraction(uploaded_file)
|
bg.png
DELETED
Binary file (192 kB)
|
bkgnd1.jpg
ADDED