import os
import textwrap

import fitz
import gradio as gr
from transformers import pipeline
| |
|
| | |
# Built once at module import so every call to summarize_long_pdf reuses the
# same pipeline object instead of constructing a new one per request.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)
| |
|
| | |
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one single-line string.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload wrapper object exposing the path as ``.name``. ``None``
            (no file supplied) is accepted and yields an empty string.

    Returns:
        The concatenated text of all pages, stripped of leading/trailing
        whitespace and with newlines replaced by spaces. Empty string when
        no file was given or the document holds no extractable text.

    Raises:
        Any error raised by ``fitz.open`` (for example on an unreadable
        file) propagates to the caller unchanged.
    """
    # Nothing uploaded: report "no text" instead of failing on `.name`.
    if pdf_file is None:
        return ""

    # Accept a plain path as well as a wrapper object. Paths are checked
    # first because pathlib objects also have a `.name` attribute, which is
    # only the basename and not a usable path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text() for page in doc)

    return text.strip().replace("\n", " ")
| |
|
| | |
def chunk_text(text, max_chunk_len=1000):
    """Split *text* into word-wrapped pieces of at most *max_chunk_len* characters.

    Args:
        text: The string to split.
        max_chunk_len: Maximum length of each returned piece.

    Returns:
        A list of strings produced by ``textwrap``; an empty list when
        *text* is empty.
    """
    wrapper = textwrap.TextWrapper(width=max_chunk_len)
    return wrapper.wrap(text)
| |
|
| | |
def summarize_long_pdf(file_obj):
    """Summarize an uploaded PDF chunk by chunk and return the combined result.

    The PDF text is extracted with ``extract_text_from_pdf``, split into
    pieces of at most 1000 characters with ``chunk_text``, and each piece is
    passed to the module-level ``summarizer`` pipeline.

    Args:
        file_obj: The uploaded file as handed over by the UI, or ``None``
            when nothing was uploaded.

    Returns:
        A string with one "Part N" summary per chunk, separated by blank
        lines. A chunk that fails to summarize contributes an error line
        instead of aborting the run. A short message is returned when no
        file was supplied or no text could be extracted.
    """
    # Submitting the form without a file yields None; answer with a clear
    # message rather than failing inside the extraction step.
    if file_obj is None:
        return "No PDF uploaded. Please upload a file first."

    full_text = extract_text_from_pdf(file_obj)
    if not full_text:
        return "β No readable text extracted from the PDF."

    summaries = []
    for part_no, chunk in enumerate(chunk_text(full_text, max_chunk_len=1000), start=1):
        try:
            # truncation=True keeps a chunk that tokenizes to more tokens
            # than the model accepts from raising; it is cut to fit instead.
            result = summarizer(
                chunk,
                max_length=120,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summary = result[0]['summary_text']
            summaries.append(f"πΉ Part {part_no}: {summary}")
        except Exception as e:
            # Deliberate best-effort: one failing chunk must not discard
            # the summaries of the others.
            summaries.append(f"β οΈ Error summarizing part {part_no}: {e}")

    return "\n\n".join(summaries)
| |
|
| | |
# Web UI: a single file-upload input wired to summarize_long_pdf, with the
# returned text shown in a text box.
demo = gr.Interface(
    fn=summarize_long_pdf,
    inputs=gr.File(label="π₯ Upload Multi-page PDF"),
    outputs=gr.Textbox(label="π Full Summary"),
    title="π Multi-Page PDF Summarizer",
    description="Upload long PDFs (e.g., Morningstar reports). Summarized in chunks using BART.",
)

# Start serving as soon as this module is executed.
demo.launch()