Spaces:
Build error
Build error
| """TEXT SUMMARIZATION Web APP""" | |
| # Importing Packages | |
| import base64 | |
| import streamlit as st | |
| import torch | |
| import io | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| from langchain.document_loaders import PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from transformers import T5Tokenizer, T5ForConditionalGeneration | |
| from transformers import pipeline | |
| from reportlab.pdfgen import canvas | |
# Streamlit page configuration: request the "wide" layout for the whole app.
st.set_page_config(layout="wide")
# Load the tokenizer and model (cached to avoid reloads on rerun)
@st.cache_resource
def load_model(checkpoint="Lamini-1"):
    """Load the T5 tokenizer and seq2seq model for ``checkpoint``.

    Decorated with ``st.cache_resource`` so the objects are created once
    and reused across Streamlit reruns instead of being reloaded on every
    widget interaction.

    Args:
        checkpoint: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model (presumably a local directory or hub
            id -- verify against the deployment).

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    model = T5ForConditionalGeneration.from_pretrained(
        checkpoint,
        device_map="auto",
        torch_dtype=torch.float32,
        offload_folder="offload",
    )
    return tokenizer, model
# Module-level tokenizer/model pair consumed by llm_pipeline().
tokenizer, base_model = load_model()
# File loading & chunking
def file_processing(file):
    """Load the PDF at ``file`` and return it as small text chunks.

    The PDF is read with ``PyPDFLoader.load_and_split`` and the resulting
    documents are re-split into chunks of 500 characters with an overlap
    of 50 characters.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    document_pages = PyPDFLoader(file).load_and_split()
    return splitter.split_documents(document_pages)
# Recursive Summarization
def recursive_summarize(texts, pipe_summ, chunk_summary_len=150, final_summary_len=400):
    """Summarize each chunk, then summarize the concatenated chunk summaries.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        pipe_summ: Callable invoked as
            ``pipe_summ(text, max_length=..., min_length=...)`` and
            returning a list whose first item has a ``"summary_text"`` key.
        chunk_summary_len: ``max_length`` used for each per-chunk summary.
        final_summary_len: ``max_length`` used for the final summary.

    Returns:
        The final summary string, or ``""`` when no chunk could be
        summarized (no chunks were given, or every chunk failed).
    """
    # Never request a minimum length larger than the caller's maximum.
    chunk_min_len = min(50, chunk_summary_len)
    final_min_len = min(100, final_summary_len)

    summaries = []
    for chunk in texts:
        try:
            result = pipe_summ(
                chunk.page_content,
                max_length=chunk_summary_len,
                min_length=chunk_min_len,
            )[0]["summary_text"]
        except Exception as e:
            # Best effort: report the failing chunk in the UI and carry on
            # with the remaining chunks.
            st.error(f"Error summarizing chunk: {e}")
        else:
            summaries.append(result)

    # Nothing to compress: do not run the model on an empty string.
    if not summaries:
        return ""

    combined = " ".join(summaries)
    # Summarize again to compress further
    final = pipe_summ(
        combined,
        max_length=final_summary_len,
        min_length=final_min_len,
    )[0]["summary_text"]
    return final
# Language model pipeline -> summarization
def llm_pipeline(filepath, summary_length):
    """Summarize the PDF at ``filepath`` with the module-level model.

    Builds a Hugging Face ``"summarization"`` pipeline from ``base_model``
    and ``tokenizer``, splits the PDF into chunks and delegates to
    ``recursive_summarize`` with a per-chunk limit of 200 and
    ``summary_length`` as the limit of the final summary.
    """
    summarizer = pipeline(
        task="summarization",
        model=base_model,
        tokenizer=tokenizer,
    )
    chunks = file_processing(filepath)
    return recursive_summarize(
        chunks,
        summarizer,
        chunk_summary_len=200,
        final_summary_len=summary_length,
    )
# Display Background
def add_bg_from_local(image_file):
    """Use a local image as the app background via an inline CSS data URL.

    Args:
        image_file: Path of the image to embed.

    The MIME type of the data URL is guessed from the file name and falls
    back to ``image/png`` when it cannot be determined, so a ``.jpg``
    background is no longer labelled as PNG.
    """
    import mimetypes  # local import: only needed by this helper

    mime_type, _ = mimetypes.guess_type(image_file)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    # Distinct handle name so the path argument is not rebound.
    with open(image_file, "rb") as image_handle:
        encoded_string = base64.b64encode(image_handle.read()).decode()

    st.markdown(
        f"""
        <style>
        .stApp {{
            background-image: url(data:{mime_type};base64,{encoded_string});
            background-size: cover;
            opacity:0.9;
        }}
        </style>
        """,
        unsafe_allow_html=True,
    )
add_bg_from_local("Images/background.jpg")

# Font style: inject the contents of font.css as an inline <style> element.
with open("font.css") as css_file:
    st.markdown(f"<style>{css_file.read()}</style>", unsafe_allow_html=True)

# Sidebar: logo, short description and the summary-length control.
with st.sidebar:
    st.image("Images/sidebar_pic.png")
    st.title("ABOUT THE APP")
    st.write("SummaScribe: Your PDF wingman! 🚀 Now with **chunk-wise recursive summarization** and inline PDF preview.")
    # Read by main() and forwarded to llm_pipeline() as the final summary length.
    selected_summary_length = st.slider("SELECT SUMMARY STRENGTH", min_value=200, max_value=1500, value=500)
# Display PDF as images
def display(file):
    """Preview the PDF at ``file`` as a horizontally scrollable image strip.

    Pages 1-10 are rasterized at 100 dpi, embedded as base64 PNG ``<img>``
    tags and rendered through ``st.components.v1.html``. Any rendering
    failure is reported with ``st.error``. A download button for the
    uploaded PDF is then offered.
    """

    def _as_img_tag(page_image):
        # Encode one rendered page as an inline PNG <img> element.
        png_bytes = io.BytesIO()
        page_image.save(png_bytes, format="PNG")
        payload = base64.b64encode(png_bytes.getvalue()).decode()
        return f'<img src="data:image/png;base64,{payload}" style="height:500px; margin-right:10px;" />'

    try:
        page_images = convert_from_path(file, dpi=100, first_page=1, last_page=10)
        strip = "".join(_as_img_tag(page_image) for page_image in page_images)
        markup = f"""
        <div style="display:flex; overflow-x:auto; white-space:nowrap; border:1px solid #ccc; padding:10px;">
        {strip}
        </div>
        """
        st.components.v1.html(markup, height=550, scrolling=True)
    except Exception as e:
        st.error(f"Could not render PDF preview: {e}")

    # NOTE(review): the original indentation was lost; the download button is
    # assumed to follow the try/except (always shown) -- confirm.
    with open(file, "rb") as pdf_handle:
        st.download_button(
            label="Download Uploaded PDF",
            data=pdf_handle,
            file_name=file.rsplit("/", 1)[-1],
            mime="application/pdf",
        )
# Title styling: CSS for the hover animation of the app title.
st.markdown(
    """
<style>
.summascribe-title {
    font-size: 50px;
    text-align: center;
    transition: transform 0.2s ease-in-out;
}
.summascribe-title span {
    transition: color 0.2s ease-in-out;
}
.summascribe-title:hover span {
    color: #f5fefd;
}
.summascribe-title:hover {
    transform: scale(1.15);
}
</style>
""",
    unsafe_allow_html=True,
)

# Build the title one <span> per character; the HSL lightness starts at 70%
# and decreases by 10/len(text) per character to give a gradient.
text = "SummaScribe"
colored_text = "".join(
    f'<span style="color: hsl(220, 60%, {70 - (i * 10 / len(text))}%);">{char}</span>'
    for i, char in enumerate(text)
)
colored_text_with_malt = colored_text + ' <span style="color: hsl(220, 60%, 70%);">✧</span>'
st.markdown(
    f'<h1 class="summascribe-title">{colored_text_with_malt}</h1>',
    unsafe_allow_html=True,
)

# Subtitle
st.markdown(
    '<h2 style="font-size:25px;color: #F5FEFD; text-align: center;">Text Document Summarization using LLMs</h2>',
    unsafe_allow_html=True,
)
# Main content
def _summary_to_pdf(summary, wrap_width=90, lines_per_page=50):
    """Render ``summary`` into an in-memory PDF and return the buffer.

    Long lines are wrapped to ``wrap_width`` characters and a new page is
    started every ``lines_per_page`` lines so the text neither runs past
    the right margin nor off the bottom of the page. Both limits are
    character/line counts chosen for the canvas defaults -- TODO confirm
    they suit the deployed font and page size.
    """
    import textwrap  # local import: only needed for the PDF export

    wrapped_lines = []
    for paragraph in summary.split("\n"):
        # textwrap.wrap returns [] for blank input; keep the blank line.
        wrapped_lines.extend(textwrap.wrap(paragraph, width=wrap_width) or [""])

    pdf_buffer = io.BytesIO()
    pdf = canvas.Canvas(pdf_buffer)
    for start in range(0, len(wrapped_lines), lines_per_page):
        text_obj = pdf.beginText(40, 800)
        for line in wrapped_lines[start:start + lines_per_page]:
            text_obj.textLine(line)
        pdf.drawText(text_obj)
        pdf.showPage()  # finish this page before starting the next one
    pdf.save()
    pdf_buffer.seek(0)
    return pdf_buffer


def main():
    """Run the upload -> preview -> summarize -> download workflow.

    Shows a PDF uploader; once a file is present and "Summarize" is
    clicked, the upload is saved under ``data/``, previewed in the left
    column, summarized in the right column, and offered for download as
    TXT and PDF.
    """
    import os  # local import: directory creation and file-name sanitising

    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
    with st.expander("NOTE"):
        st.write(
            "Summascribe currently accepts PDF documents that contain only text and no images."
        )

    # Guard clauses: nothing to do until a file is uploaded and the button clicked.
    if uploaded_file is None:
        return
    if not st.button("Summarize"):
        return

    col1, col2 = st.columns((1, 1))

    # The upload name is client-supplied: keep only its final component and
    # make sure the target directory exists before writing.
    os.makedirs("data", exist_ok=True)
    filepath = "data/" + os.path.basename(uploaded_file.name)
    with open(filepath, "wb") as temp_file:
        temp_file.write(uploaded_file.getvalue())

    with col1:
        st.info("Uploaded File")
        display(filepath)

    with col2:
        st.info("Summary")
        # st.spinner is a context manager; it only shows while the block runs.
        with st.spinner(text="In progress..."):
            summary = llm_pipeline(filepath, selected_summary_length)
        st.success(summary, icon="✅")

    # --- Download options (side by side, full width) ---
    col_txt, col_pdf = st.columns(2)
    with col_txt:
        st.download_button(
            label="Download Summary as TXT",
            data=summary,
            file_name="summary.txt",
            mime="text/plain",
            use_container_width=True,
        )
    with col_pdf:
        st.download_button(
            label="Download Summary as PDF",
            data=_summary_to_pdf(summary),
            file_name="summary.pdf",
            mime="application/pdf",
            use_container_width=True,
        )
# Script entry point
if __name__ == "__main__":
    main()