Spaces:

ANASDAVOODTK
/

prjt

Sleeping

prjt / app.py

3commit

2693914 about 1 year ago

5.67 kB

	import numpy as np
	import os
	import cv2
	from PIL import Image
	from io import BytesIO
	import streamlit as st
	import openai
	import PyPDF2
	import base64
	import pypdfium2 as pdfium
	import docx
	from docx import Document
	import fitz
	import pytesseract

	# Chat model used for every question-generation request.
	COMPLETIONS_MODEL = "gpt-4"
	# The API key is taken from the environment so that no secret lives in the
	# source. Any key that was ever committed must be treated as compromised
	# and revoked.
	openai.api_key = os.environ.get("OPENAI_API_KEY")
	# Keyword arguments forwarded to every chat-completion call.
	COMPLETIONS_API_PARAMS = {
	    "temperature": 0.0,
	    "max_tokens": 1000,
	    "model": COMPLETIONS_MODEL,
	}

	@st.cache_data
	def run_on_chunks(data):
	    """Split ``data`` into chunks of 2500 items and query the model per chunk.

	    A placeholder element shows which request is in flight and is cleared
	    again after each request completes.

	    Args:
	        data: Sliceable input (the script passes the extracted text).

	    Returns:
	        A list holding one ``GPT_4_API`` result per chunk, in chunk order.
	    """
	    pieces = data_chunk(data, chunk_size=2500)
	    status = st.empty()
	    answers = []

	    for position, piece in enumerate(pieces, start=1):
	        status.write(f"{position}th API request sent out of {len(pieces)}")
	        answers.append(GPT_4_API(piece))
	        status.empty()

	    return answers

	def data_chunk(lst, chunk_size):
	    """Cut ``lst`` into consecutive slices of at most ``chunk_size`` items.

	    Args:
	        lst: Any sliceable sequence (string, list, ...).
	        chunk_size: Step between slice starts and maximum slice length.

	    Returns:
	        A list of slices; the last one may be shorter than ``chunk_size``.
	    """
	    pieces = []
	    for start in range(0, len(lst), chunk_size):
	        stop = start + chunk_size
	        pieces.append(lst[start:stop])
	    return pieces

	def check_file_format(filename):
	    """Return the lower-cased extension of ``filename`` without the dot.

	    Args:
	        filename: File name such as ``"report.PDF"``.

	    Returns:
	        The text after the last ``.`` in lower case, or ``""`` when the
	        name contains no dot at all.
	    """
	    # rpartition never raises; an empty separator means no dot was found.
	    _, dot, extension = filename.rpartition('.')
	    return extension.lower() if dot else ""

	def pdf_to_images(pdf_file):
	    """Render each page of a PDF into a PIL image.

	    Args:
	        pdf_file: Value handed to ``fitz.open`` (the script passes a path).

	    Returns:
	        A list of RGB ``Image`` objects, one per page, in page order.
	    """
	    rendered = []
	    with fitz.open(pdf_file) as document:
	        for pdf_page in document:
	            # alpha=False yields 3-channel samples matching the "RGB" mode.
	            pixmap = pdf_page.get_pixmap(alpha=False)
	            dimensions = [pixmap.width, pixmap.height]
	            rendered.append(Image.frombytes("RGB", dimensions, pixmap.samples))
	    return rendered

	def OCR(pdf_file):
	    """Extract text from a PDF by rasterising its pages and running Tesseract.

	    Every page is scaled by a factor of 2, the enlarged PDF is written to a
	    temporary file, rendered with ``pdf_to_images``, and each rendered image
	    is upsampled 2x again before being passed to ``pytesseract``.

	    Args:
	        pdf_file: Source accepted by ``PyPDF2.PdfReader`` that also exposes
	            ``close()``; it is closed before this function returns.

	    Returns:
	        The recognised text of all pages concatenated in page order.
	    """
	    import tempfile

	    pdf_reader = PyPDF2.PdfReader(pdf_file)
	    pdf_writer = PyPDF2.PdfWriter()
	    for page in pdf_reader.pages:
	        page.scale_by(2)
	        pdf_writer.add_page(page)

	    # A private temporary directory keeps concurrent sessions from
	    # overwriting each other's file and removes it once rendering is done.
	    with tempfile.TemporaryDirectory() as work_dir:
	        enlarged_path = os.path.join(work_dir, 'enlarged.pdf')
	        with open(enlarged_path, 'wb') as f:
	            pdf_writer.write(f)
	        images = pdf_to_images(enlarged_path)

	    recognised = []
	    for image in images:
	        size = (image.width * 2, image.height * 2)
	        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
	        enlarged = image.resize(size, Image.LANCZOS)
	        recognised.append(pytesseract.image_to_string(enlarged))

	    pdf_file.close()
	    return ''.join(recognised)

	def txt_extraction(file_path):
	    """Read an uploaded text file and return its contents as a string.

	    Args:
	        file_path: Binary file-like object whose ``read()`` returns bytes.

	    Returns:
	        The decoded text.

	    Raises:
	        UnicodeDecodeError: If the bytes are not valid UTF-8.
	    """
	    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading byte
	    # order mark, which would otherwise show up as "\ufeff" in the text.
	    return file_path.read().decode("utf-8-sig")

	def docx_extraction(path):
	    """Return the text of a .docx document, one paragraph per line.

	    Args:
	        path: Value handed to ``docx.Document`` (the script passes the
	            uploaded file object).

	    Returns:
	        The text of every paragraph joined with newline characters.
	    """
	    document = docx.Document(path)
	    return '\n'.join(paragraph.text for paragraph in document.paragraphs)


	def download_docx(text):
	    """Offer ``text`` as a downloadable .docx file.

	    A new document containing ``text`` as a single paragraph is saved into
	    an in-memory buffer, which is then attached to a Streamlit download
	    button.

	    Args:
	        text: Paragraph content for the generated document.
	    """
	    buffer = BytesIO()
	    generated = Document()
	    generated.add_paragraph(text)
	    generated.save(buffer)
	    # Rewind so the button reads the document from its first byte.
	    buffer.seek(0)

	    button_options = {
	        "label": "Download as .docx",
	        "data": buffer,
	        "file_name": "document.docx",
	        "mime": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	    }
	    st.download_button(**button_options)

	def GPT_4_API(data):
	    """Ask the chat model to produce question/answer pairs for ``data``.

	    Args:
	        data: The context to quote from. A string is sent verbatim; any
	            other iterable has its items converted to ``str`` and
	            concatenated.

	    Returns:
	        The message content of the first choice in the API response.
	    """
	    header = """ create 12 question and answeres from given paragraph dont use numbers to point out questions and answers, Answers should strictly be exact lines from this paragraph"."\n\nContext:\n"""
	    # The text itself must reach the model. Wrapping a string chunk in
	    # str(list(...)) would send the repr of a list of single characters,
	    # which garbles the context and multiplies the prompt size.
	    if isinstance(data, str):
	        context = data
	    else:
	        context = "".join(str(part) for part in data)
	    QA = header + context
	    response = openai.ChatCompletion.create(
	        messages=[{"role": "user", "content": QA}],
	        **COMPLETIONS_API_PARAMS,
	    )
	    return response["choices"][0]["message"]["content"]


	def my_text_editor(_text, default_text, key, height=800):
	    """Show an editable text area pre-filled with the joined ``default_text``.

	    Args:
	        _text: Object providing ``text_area`` (the script passes a
	            placeholder created with ``st.empty()``).
	        default_text: Iterable of strings concatenated into the initial
	            value of the text area.
	        key: Passed as the first positional argument of ``text_area``.
	        height: Height forwarded to ``text_area``.

	    Returns:
	        A tuple ``(textarea, _text)``: the value returned by ``text_area``
	        and the object the area was rendered into.
	    """
	    initial_value = "".join(default_text)
	    textarea = _text.text_area(key, height=height, value=initial_value)
	    # Return the parameter itself rather than depending on a module-level
	    # name being defined by the caller.
	    return textarea, _text

	def get_base64_of_bin_file(bin_file):
	    """Return the contents of the file at ``bin_file`` as base64 text.

	    Args:
	        bin_file: Path of the file to read in binary mode.

	    Returns:
	        The base64 encoding of the file's bytes, decoded to ``str``.
	    """
	    with open(bin_file, 'rb') as handle:
	        raw_bytes = handle.read()
	    encoded = base64.b64encode(raw_bytes)
	    return encoded.decode()

	def set_png_as_page_bg(png_file):
	    """Use the image at ``png_file`` as the background of the Streamlit app.

	    The file is embedded as a base64 ``data:`` URL in a ``<style>`` rule for
	    ``.stApp`` and injected through ``st.markdown`` with HTML enabled.

	    Args:
	        png_file: Path of the image file to embed.
	    """
	    encoded_image = get_base64_of_bin_file(png_file)
	    background_css = '''
	<style>
	.stApp {
	background-image: url("data:image/png;base64,%s");
	background-size: cover;
	}
	</style>
	''' % encoded_image
	    st.markdown(background_css, unsafe_allow_html=True)

	def Extract_pdf_content(pdf_name):
	    """Return the extracted text of every page of a PDF, concatenated.

	    Args:
	        pdf_name: Source handed to ``PyPDF2.PdfReader``.

	    Returns:
	        The page texts joined in page order with no separator; an empty
	        string when no page yields any text.
	    """
	    reader = PyPDF2.PdfReader(pdf_name)
	    page_texts = [pdf_page.extract_text() for pdf_page in reader.pages]
	    return "".join(page_texts)

	def process(uploaded_file):
	    """Return the text of ``uploaded_file`` via ``Extract_pdf_content``."""
	    return Extract_pdf_content(uploaded_file)

	if __name__=="__main__":

	    # Point pytesseract at the Windows install location only when that file
	    # is actually present; on any other host its default command is kept.
	    tesseract_path = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
	    if os.path.isfile(tesseract_path):
	        pytesseract.pytesseract.tesseract_cmd = tesseract_path

	    PAGE_CONFIG = {"page_title":"StColab.io","page_icon":":smiley:","layout":"centered"}
	    st.set_page_config(**PAGE_CONFIG)
	    main_bg = 'bkgnd1.jpg'
	    set_png_as_page_bg(main_bg)

	    st.title("Advanced Text processing Tool")
	    uploaded_file = st.file_uploader("Upload a Files here", type = ["pdf","docx","txt"])

	    if uploaded_file is not None:

	        # Pick the extractor from the file extension.
	        file_format = check_file_format(uploaded_file.name)
	        if file_format == "pdf":
	            data = process(uploaded_file)

	            text = st.empty()
	            # No extractable text layer: fall back to OCR on page images.
	            if data == '':
	                text.write("applying OCR")
	                data = OCR(uploaded_file)
	                text.empty()

	        elif file_format == "docx":
	            data = docx_extraction(uploaded_file)

	        else:
	            data = txt_extraction(uploaded_file)

	        # Pressing the button only drops the cached responses; the
	        # generation and rendering steps are the same either way.
	        if st.button("re-generate set of questions and answers"):
	            st.cache_data.clear()

	        text = st.empty()
	        response = run_on_chunks(data)
	        textdata, text = my_text_editor(text, response, "text-editor-1", height=650)
	        download_docx(textdata)