# pdf-summarizer / app.py
# ritchi1's picture
# Update app.py
# 15e827e verified
import gradio as gr
from transformers import pipeline
import PyPDF2
import pdfplumber
# Build the shared summarization pipeline once at import time so that every
# request reuses the same loaded model.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF using PyPDF2 with a fallback to pdfplumber.

    Args:
        pdf_file: The PDF to read. It is passed unchanged to
            ``PyPDF2.PdfReader`` and, if that fails, to ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order. A page for which
        the extractor returns nothing contributes an empty string.

    Raises:
        Exception: Whatever pdfplumber raises when the fallback also fails.
    """
    try:
        # First try with PyPDF2
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # A page without a text layer may come back as None (depending on
        # the library version); ``or ""`` keeps the concatenation from
        # raising TypeError. Joining inside the ``try`` means a failure
        # part-way through discards the partial result instead of leaking it
        # into the fallback output.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        print(f"PyPDF2 failed: {e}")

    # Fallback to pdfplumber. This starts from scratch, so pages PyPDF2 had
    # already read before failing are not duplicated.
    with pdfplumber.open(pdf_file) as pdf:
        return "".join(page.extract_text() or "" for page in pdf.pages)
def chunk_text(text, max_chunk_size=1024):
    """Split text into chunks of at most ``max_chunk_size`` words.

    The size is counted in whitespace-separated words, not model tokens, so
    a chunk can still be longer than ``max_chunk_size`` tokens once it has
    been tokenized. Runs of whitespace (including newlines) are collapsed to
    single spaces in the output.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of words per chunk; must be >= 1.

    Yields:
        Each chunk as a single space-joined string, in original order.
        Nothing is yielded for empty or whitespace-only text.

    Raises:
        ValueError: If ``max_chunk_size`` is less than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() silently produce nothing (dropping
    # all of the text) and zero fails with an unrelated-looking message, so
    # reject both explicitly.
    if max_chunk_size < 1:
        raise ValueError(
            f"max_chunk_size must be a positive integer, got {max_chunk_size!r}"
        )

    words = text.split()
    for start in range(0, len(words), max_chunk_size):
        yield " ".join(words[start:start + max_chunk_size])
def summarize_pdf(pdf_file):
    """Extract text from PDF, chunk it, and summarize.

    Args:
        pdf_file: The uploaded file as delivered by the Gradio ``File``
            input, or ``None`` when nothing was uploaded.

    Returns:
        The per-chunk summaries joined by blank lines, or a message starting
        with "❌" when no file was given, no text could be extracted, or any
        step raised an exception. This function never raises.
    """
    # Submitting the form without a file hands the callback None; answer
    # with a clear message instead of a low-level library error.
    if pdf_file is None:
        return "❌ No file provided. Please upload a PDF."

    # chunk_text counts words, and every word is at least one token, so its
    # default of 1024 words always overflows the 1024-token input window of
    # facebook/bart-large-cnn. Use a smaller word budget so that typical
    # chunks fit without losing content.
    words_per_chunk = 700

    try:
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_file)
        if not text.strip():
            return "❌ Could not extract any text from the PDF. Please upload a readable document."

        # Summarize each chunk
        summaries = []
        for chunk in chunk_text(text, words_per_chunk):
            summary = summarizer(
                chunk,
                max_length=200,
                min_length=50,
                do_sample=False,
                # Safety net: a chunk that still tokenizes too long is cut
                # to the model's limit instead of failing inside the model.
                truncation=True,
            )
            summaries.append(summary[0]['summary_text'])

        # Combine all summaries into one
        return "\n\n".join(summaries)
    except Exception as e:
        return f"❌ An error occurred: {str(e)}"
# Gradio Interface: one PDF upload goes in, summarize_pdf runs on it, and the
# returned text is shown in a text box.
interface = gr.Interface(
    title="PDF Summarizer",
    description="Upload a PDF file to extract and summarize its content using state-of-the-art AI.",
    fn=summarize_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Summary"),
)

# Start serving the app.
interface.launch()