SummaScribe

Build error

App Files Files Community

SummaScribe / app.py

Scarletta975

Update app.py

32e478b verified 7 months ago

raw

history blame contribute delete

7.19 kB

	"""TEXT SUMMARIZATION Web APP"""

	# Importing Packages
	import base64
	import streamlit as st
	import torch
	import io
	from pdf2image import convert_from_path
	from PIL import Image
	from langchain.document_loaders import PyPDFLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from transformers import T5Tokenizer, T5ForConditionalGeneration
	from transformers import pipeline
	from reportlab.pdfgen import canvas


	# Streamlit Page Configuration
	# Request the "wide" page layout for the whole app.
	# NOTE(review): Streamlit documents set_page_config as a call that should run
	# before other st.* commands — keep it ahead of the UI code below.
	st.set_page_config(layout="wide")


	# Load the tokenizer and model (cached to avoid reloads on rerun)
	@st.cache_resource
	def load_model(checkpoint="Lamini-1"):
	    """Load the T5 tokenizer and model for *checkpoint*.

	    Decorated with ``st.cache_resource`` so the pair is reused across
	    Streamlit reruns instead of being loaded again.

	    Args:
	        checkpoint: Name or path handed to both ``from_pretrained`` calls.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)

	    # Model loading options, kept together so they are easy to audit.
	    model_options = {
	        "device_map": "auto",
	        "torch_dtype": torch.float32,
	        "offload_folder": "offload",
	    }
	    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint, **model_options)

	    return t5_tokenizer, t5_model


	# Module-level tokenizer/model pair, loaded with the default checkpoint;
	# llm_pipeline() reads both names when it builds the summarization pipeline.
	tokenizer, base_model = load_model()


	# File Loader & Processing
	def file_processing(file):
	    """Load a PDF and split it into small overlapping chunks.

	    Args:
	        file: Path of the PDF handed to ``PyPDFLoader``.

	    Returns:
	        The documents produced by splitting the loaded pages with a
	        500-character chunk size and a 50-character overlap.
	    """
	    loaded_pages = PyPDFLoader(file).load_and_split()
	    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
	    return chunker.split_documents(loaded_pages)


	# Recursive Summarization
	def recursive_summarize(texts, pipe_summ, chunk_summary_len=150, final_summary_len=400):
	    """Summarize every chunk, then summarize the joined chunk summaries.

	    Args:
	        texts: Iterable of chunk objects exposing a ``page_content`` string.
	        pipe_summ: Callable invoked as
	            ``pipe_summ(text, max_length=..., min_length=...)`` that returns a
	            sequence whose first item is a dict with a ``"summary_text"`` key.
	        chunk_summary_len: ``max_length`` used for each per-chunk summary.
	        final_summary_len: ``max_length`` used for the final compression pass.

	    Returns:
	        The final summary string, or ``""`` when no chunk summary could be
	        produced (empty input, or every chunk failed).

	    Raises:
	        Whatever ``pipe_summ`` raises during the final pass; per-chunk
	        failures are reported with ``st.error`` and skipped.
	    """
	    # Never ask for a minimum longer than the maximum the caller allows.
	    chunk_min_len = min(50, chunk_summary_len)
	    final_min_len = min(100, final_summary_len)

	    summaries = []
	    for chunk in texts:
	        try:
	            result = pipe_summ(
	                chunk.page_content,
	                max_length=chunk_summary_len,
	                min_length=chunk_min_len,
	            )[0]["summary_text"]
	        except Exception as e:
	            # Best effort: one bad chunk should not abort the whole document.
	            st.error(f"Error summarizing chunk: {e}")
	        else:
	            summaries.append(result)

	    # Nothing to compress: skip the second pass instead of feeding the
	    # summarizer an empty string.
	    if not summaries:
	        return ""

	    combined = " ".join(summaries)

	    # Summarize Again to Compress Further
	    return pipe_summ(
	        combined,
	        max_length=final_summary_len,
	        min_length=final_min_len,
	    )[0]["summary_text"]


	# Language Model Pipeline -> Summarization
	def llm_pipeline(filepath, summary_length):
	    """Summarize the PDF at *filepath* with the module-level model.

	    Args:
	        filepath: Path of the PDF to summarize.
	        summary_length: ``max_length`` of the final summary pass.

	    Returns:
	        The summary string produced by ``recursive_summarize``.
	    """
	    summarizer = pipeline("summarization", model=base_model, tokenizer=tokenizer)
	    chunks = file_processing(filepath)
	    # Per-chunk summaries are capped at 200; only the final pass honours
	    # the caller's requested length.
	    return recursive_summarize(
	        chunks,
	        summarizer,
	        chunk_summary_len=200,
	        final_summary_len=summary_length,
	    )


	# Display Background
	def add_bg_from_local(image_file):
	    """Use a local image as the app background via an inline data URI.

	    The image is base64-encoded and injected as CSS on the ``.stApp``
	    element through ``st.markdown``.

	    Args:
	        image_file: Path of the image file to embed.

	    Raises:
	        OSError: If the image file cannot be opened or read.
	    """
	    import mimetypes  # local import keeps this block self-contained

	    # Derive the MIME type from the file name (e.g. ".jpg" -> "image/jpeg")
	    # rather than always claiming PNG; fall back to PNG when it is unknown.
	    mime_type = mimetypes.guess_type(image_file)[0] or "image/png"

	    # Use a distinct name for the handle so the path argument is not shadowed.
	    with open(image_file, "rb") as image_handle:
	        encoded_string = base64.b64encode(image_handle.read()).decode()

	    st.markdown(
	        f"""
	<style>
	.stApp {{
	background-image: url(data:{mime_type};base64,{encoded_string});
	background-size: cover;
	opacity:0.9;
	}}
	</style>
	""",
	        unsafe_allow_html=True,
	    )


	# Background image for the whole app
	add_bg_from_local("Images/background.jpg")

	# Font Style: inline the contents of font.css inside a <style> tag
	with open("font.css") as css_file:
	    st.markdown(f"<style>{css_file.read()}</style>", unsafe_allow_html=True)

	# Sidebar: logo, short description and the summary-length slider
	sidebar = st.sidebar
	sidebar.image("Images/sidebar_pic.png")
	sidebar.title("ABOUT THE APP")
	sidebar.write("SummaScribe: Your PDF wingman! 🚀 Now with chunk-wise recursive summarization and inline PDF preview.")
	# Read by main() and passed to llm_pipeline() as the final summary length.
	selected_summary_length = sidebar.slider(
	    "SELECT SUMMARY STRENGTH",
	    min_value=200,
	    max_value=1500,
	    value=500,
	)


	# Display PDF as images
	def display(file):
	    """Show the first pages of a PDF as a horizontally scrolling image strip.

	    Pages 1-10 are rasterized at 100 dpi, embedded as base64 PNG ``<img>``
	    tags and rendered through ``st.components.v1.html``. If anything in that
	    path raises, the error is shown and a download button for the file is
	    offered.

	    Args:
	        file: Path of the PDF on disk.
	    """
	    # NOTE(review): the download fallback is assumed to belong to the
	    # ``except`` branch; the original nesting was not recoverable — confirm.
	    try:
	        page_images = convert_from_path(file, dpi=100, first_page=1, last_page=10)

	        tags = []
	        for page_image in page_images:
	            png_buffer = io.BytesIO()
	            page_image.save(png_buffer, format="PNG")
	            encoded = base64.b64encode(png_buffer.getvalue()).decode()
	            tags.append(
	                f'<img src="data:image/png;base64,{encoded}" style="height:500px; margin-right:10px;" />'
	            )
	        img_tags = "".join(tags)

	        html = f"""
	<div style="display:flex; overflow-x:auto; white-space:nowrap; border:1px solid #ccc; padding:10px;">
	{img_tags}
	</div>
	"""

	        st.components.v1.html(html, height=550, scrolling=True)

	    except Exception as e:
	        st.error(f"Could not render PDF preview: {e}")
	        with open(file, "rb") as pdf_handle:
	            st.download_button(
	                label="Download Uploaded PDF",
	                data=pdf_handle,
	                file_name=file.split("/")[-1],
	                mime="application/pdf",
	            )


	# Title Styling
	# CSS for the animated page title (grows and recolors on hover)
	title_css = """
	<style>
	.summascribe-title {
	font-size: 50px;
	text-align: center;
	transition: transform 0.2s ease-in-out;
	}
	.summascribe-title span {
	transition: color 0.2s ease-in-out;
	}
	.summascribe-title:hover span {
	color: #f5fefd;
	}
	.summascribe-title:hover {
	transform: scale(1.15);
	}
	</style>
	"""
	st.markdown(title_css, unsafe_allow_html=True)

	# Title text: one <span> per character, with lightness stepping down from
	# 70% so the word fades from light to darker blue, plus a trailing star.
	text = "SummaScribe"
	colored_text = ''.join(
	    f'<span style="color: hsl(220, 60%, {70 - (i * 10 / len(text))}%);">{char}</span>'
	    for i, char in enumerate(text)
	)
	colored_text_with_malt = colored_text + ' <span style="color: hsl(220, 60%, 70%);">✧</span>'
	st.markdown(f'<h1 class="summascribe-title">{colored_text_with_malt}</h1>', unsafe_allow_html=True)

	# Subtitle
	subtitle_html = '<h2 style="font-size:25px;color: #F5FEFD; text-align: center;">Text Document Summarization using LLMs</h2>'
	st.markdown(subtitle_html, unsafe_allow_html=True)


	# Main content
	def _save_upload(uploaded_file, directory="data"):
	    """Write the uploaded file into *directory* and return the saved path."""
	    import os  # local import keeps this block self-contained

	    # The target folder may not exist on a fresh deployment.
	    os.makedirs(directory, exist_ok=True)
	    # The name comes from the browser: keep only its last path component so
	    # something like "../x.pdf" cannot be written outside *directory*.
	    safe_name = os.path.basename(uploaded_file.name) or "upload.pdf"
	    filepath = directory + "/" + safe_name
	    with open(filepath, "wb") as temp_file:
	        # getvalue() returns the full content regardless of the read cursor.
	        temp_file.write(uploaded_file.getvalue())
	    return filepath


	def _summary_to_pdf(summary, max_chars=90, lines_per_page=50):
	    """Render *summary* into an in-memory PDF, wrapping and paginating lines."""
	    import textwrap  # local import keeps this block self-contained

	    # Wrap every paragraph; an unwrapped long line would run off the page.
	    wrapped_lines = []
	    for paragraph in summary.split("\n"):
	        wrapped_lines.extend(textwrap.wrap(paragraph, width=max_chars) or [""])

	    pdf_buffer = io.BytesIO()
	    pdf = canvas.Canvas(pdf_buffer)
	    # Emit one page per slice so long summaries continue on further pages.
	    for start in range(0, len(wrapped_lines), lines_per_page):
	        text_obj = pdf.beginText(40, 800)
	        for line in wrapped_lines[start:start + lines_per_page]:
	            text_obj.textLine(line)
	        pdf.drawText(text_obj)
	        pdf.showPage()
	    pdf.save()
	    pdf_buffer.seek(0)
	    return pdf_buffer


	def main():
	    """Render the upload form and, on request, the preview and summary.

	    Once a PDF is uploaded and "Summarize" is pressed, the file is saved
	    under ``data/``, previewed in the left column and summarized in the
	    right column; TXT and PDF download buttons for the summary follow.
	    """
	    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
	    with st.expander("NOTE"):
	        st.write(
	            "Summascribe currently accepts PDF documents that contain only text and no images."
	        )

	    # The button is only rendered once a file has been uploaded.
	    if uploaded_file is None or not st.button("Summarize"):
	        return

	    col1, col2 = st.columns((1, 1))
	    filepath = _save_upload(uploaded_file)

	    with col1:
	        st.info("Uploaded File")
	        display(filepath)

	    with col2:
	        st.info("Summary")
	        # The spinner is only shown while used as a context manager.
	        with st.spinner(text="In progress..."):
	            summary = llm_pipeline(filepath, selected_summary_length)
	        st.success(summary, icon="✅")

	    # --- Download options (side by side, full width) ---
	    col_txt, col_pdf = st.columns(2)

	    with col_txt:
	        st.download_button(
	            label="Download Summary as TXT",
	            data=summary,
	            file_name="summary.txt",
	            mime="text/plain",
	            use_container_width=True,
	        )

	    with col_pdf:
	        st.download_button(
	            label="Download Summary as PDF",
	            data=_summary_to_pdf(summary),
	            file_name="summary.pdf",
	            mime="application/pdf",
	            use_container_width=True,
	        )


	# Script entry point: build the upload/summarize UI when this file is run
	# as the main module.
	if __name__ == "__main__":
	    main()