Spaces:
Sleeping
Sleeping
import numpy as np | |
import os | |
import cv2 | |
from PIL import Image | |
from io import BytesIO | |
import streamlit as st | |
import openai | |
import PyPDF2 | |
import base64 | |
import pypdfium2 as pdfium | |
import docx | |
from docx import Document | |
import fitz | |
import pytesseract | |
# Chat model used for every completion request.
COMPLETIONS_MODEL = "gpt-4"

# SECURITY: never commit an API key to source control. The key is read from
# the OPENAI_API_KEY environment variable; any key that was ever stored in
# this file must be considered leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Keyword arguments forwarded to every chat-completion call.
COMPLETIONS_API_PARAMS = {
    "temperature": 0.0,
    "max_tokens": 1000,
    "model": COMPLETIONS_MODEL,
}
def run_on_chunks(data):
    """Split *data* into 2500-item chunks and send each one to ``GPT_4_API``.

    A Streamlit placeholder shows which request is in flight and is cleared
    once all chunks have been processed.

    Args:
        data: Sliceable input (for example the extracted document text).

    Returns:
        list: The ``GPT_4_API`` result for each chunk, in chunk order.
    """
    pieces = data_chunk(data, chunk_size=2500)
    total = len(pieces)
    status = st.empty()
    answers = []
    for request_no, piece in enumerate(pieces, start=1):
        status.write(f"{request_no}th API request sent out of {total}")
        answers.append(GPT_4_API(piece))
    status.empty()
    return answers
def data_chunk(lst, chunk_size):
    """Cut *lst* into consecutive slices of at most *chunk_size* items.

    Args:
        lst: Any sliceable sequence with a length (string, list, ...).
        chunk_size: Maximum length of each slice; also the step between
            slice start offsets.

    Returns:
        list: The slices in order; the last one may be shorter.
    """
    total = len(lst)
    offsets = range(0, total, chunk_size)
    return [lst[offset:offset + chunk_size] for offset in offsets]
def check_file_format(filename):
    """Return the lower-cased extension of *filename*, without the dot.

    Args:
        filename: File name to inspect.

    Returns:
        str: The text after the last ``.`` in lower case, or ``""`` when the
        name contains no dot (instead of raising ``IndexError``).
    """
    _, dot, extension = filename.rpartition('.')
    return extension.lower() if dot else ""
def pdf_to_images(pdf_file):
    """Render every page of a PDF into a PIL image.

    Args:
        pdf_file: Value handed straight to ``fitz.open``.

    Returns:
        list: One RGB image per page, in page order.
    """
    def _render(page):
        # Rasterise without an alpha channel so the samples are plain RGB.
        pixmap = page.get_pixmap(alpha=False)
        dimensions = [pixmap.width, pixmap.height]
        return Image.frombytes("RGB", dimensions, pixmap.samples)

    with fitz.open(pdf_file) as document:
        return [_render(page) for page in document]
def OCR(pdf_file):
    """Extract text from a PDF by running Tesseract on its rendered pages.

    Every page is scaled by 2 with PyPDF2, the enlarged PDF is rasterised by
    ``pdf_to_images``, each page image is upsampled by another factor of 2
    and passed to ``pytesseract.image_to_string``.

    Args:
        pdf_file: Open binary file-like object containing the PDF. It is
            closed before this function returns.

    Returns:
        str: The recognised text of all pages, concatenated in page order.
    """
    import tempfile

    pdf_reader = PyPDF2.PdfReader(pdf_file)
    pdf_writer = PyPDF2.PdfWriter()
    for page in pdf_reader.pages:
        page.scale_by(2)
        pdf_writer.add_page(page)

    # Use a private temporary directory rather than a fixed 'enlarged.pdf' in
    # the working directory: concurrent sessions cannot overwrite each
    # other's file and nothing is left behind on disk.
    with tempfile.TemporaryDirectory() as work_dir:
        enlarged_path = os.path.join(work_dir, 'enlarged.pdf')
        with open(enlarged_path, 'wb') as f:
            pdf_writer.write(f)
        images = pdf_to_images(enlarged_path)

    recognised = []
    for image in images:
        size = (image.width * 2, image.height * 2)
        # Image.ANTIALIAS no longer exists in Pillow 10+; LANCZOS is the
        # same resampling filter under its current name.
        image = image.resize(size, Image.LANCZOS)
        recognised.append(pytesseract.image_to_string(image))
    pdf_file.close()
    return ''.join(recognised)
def txt_extraction(file_path):
    """Read an open binary file object and decode its content as UTF-8.

    Args:
        file_path: Object whose ``read()`` returns the file's bytes.

    Returns:
        str: The decoded text.
    """
    raw_bytes = file_path.read()
    return raw_bytes.decode("utf-8")
def docx_extraction(path):
    """Return the text of a Word document, one paragraph per line.

    Args:
        path: Value handed straight to ``docx.Document``.

    Returns:
        str: The text of every paragraph joined with newline characters.
    """
    document = docx.Document(path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
def download_docx(text):
    """Render a Streamlit button that downloads *text* as a .docx file.

    Args:
        text: Content placed in a single paragraph of the new document.
    """
    buffer = BytesIO()
    word_file = Document()
    word_file.add_paragraph(text)
    word_file.save(buffer)
    # Rewind so the button serves the document from its first byte.
    buffer.seek(0)
    button_options = {
        "label": "Download as .docx",
        "data": buffer,
        "file_name": "document.docx",
        "mime": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
    st.download_button(**button_options)
def GPT_4_API(data):
    """Ask the chat model for question/answer pairs about one chunk of text.

    Args:
        data: The chunk used as context: a string, or an iterable of pieces
            that are converted to strings and concatenated in order.

    Returns:
        str: The message content of the first choice in the API response.
    """
    header = """ create 12 question and answeres from given paragraph dont use numbers to point out questions and answers, Answers should strictly be exact lines from this paragraph"."\n\nContext:\n"""
    # Send the raw text. Wrapping it in str(list(...)) would instead send the
    # repr of a list of single characters ("['T', 'h', 'e', ...]"), which
    # garbles the paragraph and inflates the prompt several-fold.
    if isinstance(data, str):
        context = data
    else:
        context = "".join(str(piece) for piece in data)
    QA = header + context
    response = openai.ChatCompletion.create(
        messages=[{"role": "user", "content": QA}],
        **COMPLETIONS_API_PARAMS,
    )
    return response["choices"][0]["message"]["content"]
def my_text_editor(_text, default_text, key, height=800):
    """Show the generated text in an editable text area.

    Args:
        _text: Object exposing ``text_area`` (for example a Streamlit
            placeholder) in which the editor is rendered.
        default_text: Iterable of strings concatenated to form the initial
            content of the editor.
        key: Passed as the first positional argument of ``text_area``.
        height: Height forwarded to ``text_area``.

    Returns:
        tuple: ``(value returned by text_area, _text)``.
    """
    initial_value = "".join(default_text)
    textarea = _text.text_area(key, height=height, value=initial_value)
    # Return the container that was passed in; relying on a module-level
    # name `text` raises NameError whenever that global is not defined.
    return textarea, _text
def get_base64_of_bin_file(bin_file):
    """Return the content of a file as a base64 string.

    Args:
        bin_file: Path of the file, opened in binary mode.

    Returns:
        str: Base64 encoding of the file's bytes.
    """
    with open(bin_file, 'rb') as handle:
        encoded = base64.b64encode(handle.read())
    return encoded.decode()
def set_png_as_page_bg(png_file):
    """Use an image file as the background of the Streamlit app.

    The file is base64-encoded by ``get_base64_of_bin_file`` and embedded in
    a ``<style>`` rule for ``.stApp`` that is injected with ``st.markdown``.

    Args:
        png_file: Path of the image file.
    """
    css_template = '''
<style>
.stApp {
background-image: url("data:image/png;base64,%s");
background-size: cover;
}
</style>
'''
    encoded_image = get_base64_of_bin_file(png_file)
    st.markdown(css_template % encoded_image, unsafe_allow_html=True)
def Extract_pdf_content(pdf_name):
    """Return the extractable text of every page of a PDF.

    Args:
        pdf_name: Value handed straight to ``PyPDF2.PdfReader``.

    Returns:
        str: The ``extract_text()`` output of all pages, concatenated in page
        order without a separator.
    """
    reader = PyPDF2.PdfReader(pdf_name)
    return "".join(page.extract_text() for page in reader.pages)
def process(uploaded_file):
    """Return the text of an uploaded PDF via ``Extract_pdf_content``."""
    return Extract_pdf_content(uploaded_file)
# Streamlit entry point: upload a document, extract its text, generate
# question/answer pairs, then show them in an editor with a .docx download.
if __name__ == "__main__":
    # Override pytesseract's binary location only when this Windows install
    # exists; on other hosts pytesseract keeps looking up `tesseract` on PATH.
    tesseract_exe = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
    if os.path.isfile(tesseract_exe):
        pytesseract.pytesseract.tesseract_cmd = tesseract_exe

    PAGE_CONFIG = {"page_title":"StColab.io","page_icon":":smiley:","layout":"centered"}
    st.set_page_config(**PAGE_CONFIG)
    main_bg = 'bkgnd1.jpg'
    set_png_as_page_bg(main_bg)
    st.title("Advanced Text processing Tool")
    uploaded_file = st.file_uploader("Upload a Files here", type=["pdf", "docx", "txt"])
    if uploaded_file is not None:
        # Work out the extension once and dispatch to the matching extractor.
        file_format = check_file_format(uploaded_file.name)
        if file_format == "pdf":
            data = process(uploaded_file)
            text = st.empty()
            # Fall back to OCR when the PDF has no usable text layer; this
            # includes extraction results that are only whitespace.
            if not data.strip():
                text.write("applying OCR")
                data = OCR(uploaded_file)
                text.empty()
        elif file_format == "docx":
            data = docx_extraction(uploaded_file)
        else:
            data = txt_extraction(uploaded_file)

        regenerate = st.button("re-generate set of questions and answers")
        if regenerate:
            st.cache_data.clear()
        text = st.empty()

        # Streamlit reruns this script on every widget interaction. Keep the
        # generated answers in session state so the API requests are only
        # sent for a new file or when the button is pressed.
        file_id = (uploaded_file.name, uploaded_file.size)
        cached = st.session_state.get("qa_responses")
        if regenerate or cached is None or cached[0] != file_id:
            response = run_on_chunks(data)
            st.session_state["qa_responses"] = (file_id, response)
        else:
            response = cached[1]

        textdata, text = my_text_editor(text, response, "text-editor-1", height=650)
        download_docx(textdata)