Spaces:
Sleeping
Sleeping
import gradio as gr | |
from PyPDF2 import PdfReader | |
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer | |
from gtts import gTTS | |
from io import BytesIO | |
import re | |
import os | |
# Summarization backend: a Hugging Face pipeline built once at import time.
# The same checkpoint identifier is used for both the model and its tokenizer.
model_name = "pszemraj/led-large-book-summary"
summarizer = pipeline(
    task="summarization",
    model=model_name,
    tokenizer=model_name,
)
def _abstract_from_page_text(text):
    """Return the text following an "Abstract" heading on one page, or None.

    Matching is whole-word and case-insensitive. The returned span stops at
    the first subsequent "Introduction" on the same page; if there is none,
    it runs to the end of the page text. Returns None when the page has no
    "Abstract" heading at all.
    """
    abstract_match = re.search(r"\bAbstract\b", text, re.IGNORECASE)
    if abstract_match is None:
        return None
    start_index = abstract_match.end()
    introduction_match = re.search(
        r"\bIntroduction\b", text[start_index:], re.IGNORECASE
    )
    if introduction_match is None:
        return text[start_index:]
    return text[start_index:start_index + introduction_match.start()]


def _first_sentence(summary):
    """Return the first sentence of *summary*, terminated by punctuation.

    Falls back to "Summary not available." when *summary* is blank.
    """
    # Split only where a terminator is followed by whitespace, so a period
    # inside a number such as "0.8" does not cut the sentence short.
    sentence = re.split(r"(?<=[.!?])\s+", summary.strip(), maxsplit=1)[0]
    if not sentence:
        return "Summary not available."
    if sentence[-1] not in ".!?":
        sentence += "."
    return sentence


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it to one sentence, and voice it.

    Pages are scanned in order; the first page containing the word
    "Abstract" supplies the abstract text (up to "Introduction" on that page,
    or to the end of the page). The abstract is summarized with the
    module-level ``summarizer`` and the first sentence of that summary is
    converted to speech with gTTS.

    Args:
        pdf_file: Path to the PDF, or an object exposing the path as
            ``.name``.

    Returns:
        A 3-tuple ``(first_sentence, audio_bytes, abstract_text)`` where
        ``audio_bytes`` is the raw audio written by gTTS and
        ``abstract_text`` is the extracted abstract with surrounding
        whitespace stripped.

    Raises:
        Exception: For any failure (missing file, no abstract found,
            summarization or speech errors). The message is that of the
            underlying error, which is attached as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("PDF file is not provided.")

        # NOTE(review): some Gradio versions appear to hand over a temp-file
        # wrapper instead of a path string; fall back to its ``.name``.
        if not isinstance(pdf_file, (str, bytes, os.PathLike)):
            pdf_file = getattr(pdf_file, "name", pdf_file)

        abstract_text = ""
        with open(pdf_file, "rb") as file:
            for page in PdfReader(file).pages:
                # Defensive: treat a page that yields no text as empty.
                found = _abstract_from_page_text(page.extract_text() or "")
                if found is not None:
                    abstract_text = found
                    break

        # Fail loudly rather than asking the model to summarize nothing.
        if not abstract_text.strip():
            raise ValueError("No abstract could be found in the provided PDF.")

        # Summarize the extracted abstract with a fixed maximum output length.
        result = summarizer(abstract_text, max_length=81)

        if result and isinstance(result, list):
            summary = result[0].get("summary_text", "Summary not available.")
            first_sentence = _first_sentence(summary)
        else:
            first_sentence = "Summary not available."

        # Render the one-sentence summary to audio, held in memory.
        speech = gTTS(text=first_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        return first_sentence, speech_bytes.getvalue(), abstract_text.strip()
    except Exception as e:
        # Same outward contract as before (a plain Exception carrying the
        # message), but chained so the original traceback is preserved.
        raise Exception(str(e)) from e
# Gradio UI: one PDF upload in; summary text, spoken summary, and the
# extracted abstract out. extract_abstract_and_summarize returns three values
# (summary sentence, audio bytes, abstract text), so three output components
# are declared to receive them in that order.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Extracted Abstract"),
    ],
    title="PDF Summarization & Audio Generation Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it using the 'pszemraj/led-large-book-summary' model into one sentence summary, and generates an audio of it. Only upload PDFs with abstracts. Example
PDF's are given below, and please click on them to see the summarized text and audio generated. Please read the README.MD for more information about the app.""",
    examples=[
        ["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"],
        ["Article 4 Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence.pdf"],
    ],
    cache_examples=True,
)
interface.launch()