"""Gradio app: extract a PDF's abstract, summarize it to one sentence, and speak it."""

import os
import re
from io import BytesIO

import gradio as gr
from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Load the LED-large model for summarization (done once, at import time).
model_name = "pszemraj/led-large-book-summary"
summarizer = pipeline("summarization", model=model_name, tokenizer=model_name)

# Section headings that delimit the abstract; matched case-insensitively.
_ABSTRACT_RE = re.compile(r"\bAbstract\b", re.IGNORECASE)
_INTRODUCTION_RE = re.compile(r"\bIntroduction\b", re.IGNORECASE)


def _resolve_path(pdf_file):
    """Return something ``open()`` accepts for the uploaded *pdf_file*."""
    if isinstance(pdf_file, (str, os.PathLike)):
        return pdf_file
    # NOTE(review): some Gradio versions appear to pass the upload as a
    # temp-file wrapper whose ``.name`` is the path -- confirm against the
    # installed Gradio version.
    return getattr(pdf_file, "name", pdf_file)


def _find_abstract(pdf_path):
    """Return the raw text following the first "Abstract" heading in the PDF.

    Pages are scanned in order. On the first page containing the word
    "Abstract", the text from just after that word up to the next
    "Introduction" on the same page (or to the end of the page if there is
    none) is returned. An empty string is returned when no page matches.
    """
    with open(pdf_path, "rb") as file:
        for page in PdfReader(file).pages:
            # Guard against a page that yields no text at all.
            text = page.extract_text() or ""
            abstract_match = _ABSTRACT_RE.search(text)
            if abstract_match is None:
                continue
            start_index = abstract_match.end()
            # Search a slice (not ``search(text, pos)``) so the word-boundary
            # check treats the slice start as the start of the string.
            introduction_match = _INTRODUCTION_RE.search(text[start_index:])
            if introduction_match:
                end_index = start_index + introduction_match.start()
            else:
                end_index = None
            return text[start_index:end_index]
    return ""


def extract_abstract_and_summarize(pdf_file):
    """Summarize the abstract of *pdf_file* in one sentence and synthesize speech.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path or as a
            file-like wrapper exposing the path via ``.name``.

    Returns:
        A 3-tuple ``(first_sentence, audio_bytes, abstract_text)``: the first
        sentence of the model summary, the bytes gTTS wrote for that
        sentence, and the extracted abstract with surrounding whitespace
        stripped.

    Raises:
        gr.Error: If no file is given, no abstract is found, or any step
            (PDF parsing, summarization, speech synthesis) fails. The
            original exception is chained as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("PDF file is not provided.")

        abstract_text = _find_abstract(_resolve_path(pdf_file))
        if not abstract_text.strip():
            # Fail early with a clear message instead of feeding empty text
            # to the summarizer and the speech synthesizer.
            raise ValueError("No abstract could be found in the PDF.")

        # Summarize the extracted abstract with a specific max_length.
        result = summarizer(abstract_text, max_length=81)

        # Keep only the text up to the first period of the summary.
        if result and isinstance(result, list):
            summary = result[0].get("summary_text", "Summary not available.")
            first_sentence = summary.split(".")[0] + "."
        else:
            first_sentence = "Summary not available."

        # Generate audio in memory rather than writing a file to disk.
        speech = gTTS(text=first_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        return first_sentence, speech_bytes.getvalue(), abstract_text.strip()
    except Exception as e:
        # gr.Error is still an Exception subclass; chaining keeps the cause.
        raise gr.Error(str(e)) from e


interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    # One output component per value returned by the handler
    # (summary sentence, audio, extracted abstract).
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Extracted Abstract"),
    ],
    title="PDF Summarization & Audio Generation Tool",
    description="""PDF Summarization App. 
This app extracts the abstract from a PDF, summarizes it using the 'pszemraj/led-large-book-summary' model into one sentence summary, and generates an audio of it. Only upload PDFs with abstracts. Example PDF's are given below, and please click on them to see the summarized text and audio generated. Please read the README.MD for more information about the app.""",
    examples=[
        [os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],
        [os.path.join(os.path.dirname(__file__), "Article 4 Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence.pdf")],
    ],
    cache_examples=True,
)

interface.launch()