prjt / app.py
AKnvd's picture
3commit
2693914
import numpy as np
import os
import cv2
from PIL import Image
from io import BytesIO
import streamlit as st
import openai
import PyPDF2
import base64
import pypdfium2 as pdfium
import docx
from docx import Document
import fitz
import pytesseract
# Chat model requested for every completion call.
COMPLETIONS_MODEL = "gpt-4"

# SECURITY: a secret key used to be committed on this line. Any key that was
# ever committed must be treated as leaked and revoked. The key is now taken
# from the OPENAI_API_KEY environment variable (None when it is not set).
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Keyword arguments unpacked into openai.ChatCompletion.create by GPT_4_API.
COMPLETIONS_API_PARAMS = {
    "temperature": 0.0,
    "max_tokens": 1000,
    "model": COMPLETIONS_MODEL,
}
@st.cache_data
def run_on_chunks(data):
    """Send ``data`` to ``GPT_4_API`` in chunks of 2500 items.

    A Streamlit placeholder shows which request is currently being sent and
    is cleared after the last chunk has been processed.

    Args:
        data: Sliceable input (passed to ``data_chunk``).

    Returns:
        list: One ``GPT_4_API`` result per chunk, in chunk order.
    """
    pieces = data_chunk(data, chunk_size=2500)
    total = len(pieces)
    status = st.empty()
    answers = []
    for position, piece in enumerate(pieces, start=1):
        status.write(f"{position}th API request sent out of {total}")
        answers.append(GPT_4_API(piece))
    # Remove the progress message once every request has been made.
    status.empty()
    return answers
def data_chunk(lst, chunk_size):
    """Cut ``lst`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        lst: Any sliceable sequence with a length (str, list, ...).
        chunk_size: Step between slice starts, used as the ``range`` step.

    Returns:
        list: The slices in order; the last one may be shorter.
    """
    pieces = []
    for start in range(0, len(lst), chunk_size):
        stop = start + chunk_size
        pieces.append(lst[start:stop])
    return pieces
def check_file_format(filename):
    """Return the lower-cased extension of ``filename`` without the dot.

    Only the text after the last ``.`` is considered, so ``"a.tar.GZ"``
    yields ``"gz"``.

    Args:
        filename: File name as a string.

    Returns:
        str: The extension in lower case, or ``""`` when the name contains
        no dot (previously this case raised ``IndexError``).
    """
    # rpartition never raises: ``dot`` is empty when no '.' is present.
    _, dot, extension = filename.rpartition('.')
    return extension.lower() if dot else ""
def pdf_to_images(pdf_file):
    """Render each page of a PDF to a PIL image in RGB mode.

    Args:
        pdf_file: Value accepted by ``fitz.open`` (the caller in this file
            passes a file path).

    Returns:
        list: One ``PIL.Image`` per page, in page order.
    """
    with fitz.open(pdf_file) as document:
        # alpha=False requests pixmaps without an alpha channel, matching
        # the "RGB" mode used to build the image.
        pixmaps = (page.get_pixmap(alpha=False) for page in document)
        return [
            Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
            for pixmap in pixmaps
        ]
def OCR(pdf_file):
    """Extract text from a PDF by rendering its pages and running Tesseract.

    Every page is scaled by 2 with PyPDF2, the enlarged PDF is written to a
    temporary file, rendered to images with ``pdf_to_images``, and each image
    is upsampled by another factor of 2 before being passed to
    ``pytesseract.image_to_string``.

    Args:
        pdf_file: Binary file object accepted by ``PyPDF2.PdfReader``. It is
            closed before this function returns, so it must provide
            ``close()``.

    Returns:
        str: The recognised text of all pages, concatenated in page order.
    """
    # Local import keeps this block self-contained; the file does not import
    # tempfile at module level.
    import tempfile

    pdf_reader = PyPDF2.PdfReader(pdf_file)
    pdf_writer = PyPDF2.PdfWriter()
    for page in pdf_reader.pages:
        page.scale_by(2)
        pdf_writer.add_page(page)

    # A private temporary directory replaces the fixed 'enlarged.pdf' in the
    # working directory, which was shared between sessions and never removed.
    with tempfile.TemporaryDirectory() as work_dir:
        enlarged_path = os.path.join(work_dir, 'enlarged.pdf')
        with open(enlarged_path, 'wb') as f:
            pdf_writer.write(f)
        # The returned images are built from in-memory bytes, so they remain
        # usable after the directory is deleted.
        images = pdf_to_images(enlarged_path)

    recognised = []
    for image in images:
        size = (image.width * 2, image.height * 2)
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
        # filter under its current name and also exists in older releases.
        image = image.resize(size, Image.LANCZOS)
        recognised.append(pytesseract.image_to_string(image))
    pdf_file.close()
    return ''.join(recognised)
def txt_extraction(file_path):
    """Read a binary file object to the end and decode it as UTF-8.

    Args:
        file_path: Object whose ``read()`` returns bytes (despite the name,
            this is a file object, not a path).

    Returns:
        str: The decoded contents.
    """
    raw_bytes = file_path.read()
    return raw_bytes.decode("utf-8")
def docx_extraction(path):
    """Return the text of every paragraph in a .docx, joined by newlines.

    Args:
        path: Value accepted by ``docx.Document``.

    Returns:
        str: Paragraph texts in document order, separated by ``"\\n"``.
    """
    paragraphs = docx.Document(path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
def download_docx(text):
    """Show a Streamlit button that downloads ``text`` as a Word document.

    The text is stored as a single paragraph of a new document, which is
    saved into an in-memory buffer handed to ``st.download_button``.

    Args:
        text: Content for the document's only paragraph.
    """
    buffer = BytesIO()
    word_file = Document()
    word_file.add_paragraph(text)
    word_file.save(buffer)
    # Rewind so the button reads the document from its first byte.
    buffer.seek(0)
    st.download_button(
        label="Download as .docx",
        data=buffer,
        file_name="document.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
def GPT_4_API(data):
    """Ask the chat model for question/answer pairs about ``data``.

    The prompt is a fixed instruction header followed by the context text,
    sent as a single user message with ``COMPLETIONS_API_PARAMS``.

    Args:
        data: Context to send. A string is used verbatim; any other iterable
            has its items converted with ``str`` and concatenated.

    Returns:
        The ``content`` of the first choice's message in the API response.
    """
    header = """ create 12 question and answeres from given paragraph dont use numbers to point out questions and answers, Answers should strictly be exact lines from this paragraph"."\n\nContext:\n"""
    # The previous ``"".join(str(list(data)))`` sent the repr of a character
    # list ("['T', 'h', 'e', ...]") instead of the text, which garbled the
    # context and multiplied the prompt size.
    context = data if isinstance(data, str) else "".join(map(str, data))
    prompt = header + context
    response = openai.ChatCompletion.create(
        messages=[{"role": "user", "content": prompt}],
        **COMPLETIONS_API_PARAMS,
    )
    return response["choices"][0]["message"]["content"]
def my_text_editor(_text, default_text, key, height=800):
    """Render a text area pre-filled with the concatenation of ``default_text``.

    Args:
        _text: Object providing ``text_area(label, height=..., value=...)``;
            callers in this file pass an ``st.empty()`` placeholder.
        default_text: Iterable of strings joined (without separator) to form
            the initial content.
        key: Passed as the first positional argument of ``text_area``.
        height: Height forwarded to ``text_area``.

    Returns:
        tuple: ``(value returned by text_area, _text)``.
    """
    initial_value = "".join(default_text)
    textarea = _text.text_area(key, height=height, value=initial_value)
    # Return the container that was passed in. The previous code returned the
    # name ``text``, which is not defined in this function and only resolved
    # when a module-level global of that name happened to exist.
    return textarea, _text
def get_base64_of_bin_file(bin_file):
    """Return the contents of the file at ``bin_file`` as a base64 string.

    Args:
        bin_file: Path of the file, opened in binary mode.

    Returns:
        str: Base64 encoding of the whole file.
    """
    with open(bin_file, 'rb') as stream:
        encoded = base64.b64encode(stream.read())
    return encoded.decode()
def set_png_as_page_bg(png_file):
    """Use an image file as the background of the Streamlit app.

    The file is base64-encoded with ``get_base64_of_bin_file`` and embedded
    as a ``data:image/png`` URL in a ``<style>`` block injected through
    ``st.markdown``.

    Args:
        png_file: Path of the image file.
    """
    css_template = (
        "\n"
        "<style>\n"
        ".stApp {\n"
        'background-image: url("data:image/png;base64,%s");\n'
        "background-size: cover;\n"
        "}\n"
        "</style>\n"
    )
    encoded_image = get_base64_of_bin_file(png_file)
    # unsafe_allow_html is needed so the <style> tag is emitted as HTML.
    st.markdown(css_template % encoded_image, unsafe_allow_html=True)
def Extract_pdf_content(pdf_name):
    """Return the extracted text of all pages of a PDF as one string.

    Args:
        pdf_name: Value accepted by ``PyPDF2.PdfReader``.

    Returns:
        str: The results of ``extract_text()`` for each page, concatenated
        in page order with no separator.
    """
    reader = PyPDF2.PdfReader(pdf_name)
    return "".join(page.extract_text() for page in reader.pages)
def process(uploaded_file):
    """Return the text of ``uploaded_file`` via ``Extract_pdf_content``."""
    return Extract_pdf_content(uploaded_file)
if __name__=="__main__":
    # Use the default Windows install location of Tesseract only when that
    # binary is present. Setting it unconditionally broke OCR on every other
    # host; otherwise pytesseract keeps its own default command. The path is
    # written with single backslashes (the earlier raw string doubled them).
    windows_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    if os.path.isfile(windows_tesseract):
        pytesseract.pytesseract.tesseract_cmd = windows_tesseract

    PAGE_CONFIG = {"page_title":"StColab.io","page_icon":":smiley:","layout":"centered"}
    st.set_page_config(**PAGE_CONFIG)
    main_bg = 'bkgnd1.jpg'
    set_png_as_page_bg(main_bg)
    st.title("Advanced Text processing Tool")
    uploaded_file = st.file_uploader("Upload a Files here", type = ["pdf","docx","txt"])
    if uploaded_file is not None:
        # Determine the extension once and dispatch on it.
        file_format = check_file_format(uploaded_file.name)
        if file_format == "pdf":
            data = process(uploaded_file)
            text = st.empty()
            # No extractable text: fall back to OCR.
            if data == '':
                text.write("applying OCR")
                data = OCR(uploaded_file)
                text.empty()
        elif file_format == "docx":
            data = docx_extraction(uploaded_file)
        else:
            data = txt_extraction(uploaded_file)

        # Both branches of the former if/else were identical apart from the
        # cache reset, so only that step stays conditional. ``text`` remains
        # a module-level name, as before.
        regenerate = st.button("re-generate set of questions and answers")
        text = st.empty()
        if regenerate:
            # Drop cached results so run_on_chunks queries the API again.
            st.cache_data.clear()
        response = run_on_chunks(data)
        textdata, text = my_text_editor(text, response, "text-editor-1", height=650)
        download_docx(textdata)