# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1n1zTe_HIqsQ1JvPcV2S3i8-kjq5V4xJo
"""

# https://huggingface.co/spaces/user2434/SummarizedAbstract

# Import necessary libraries
from functools import lru_cache
from io import BytesIO

import gradio as gr
import PyPDF2
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint used for summarization.
SUMMARY_MODEL_NAME = "pszemraj/led-base-book-summary"

# Text that marks the page (and position) where the abstract begins.
ABSTRACT_MARKER = "Abstract"

# Any of these on a later page is taken as the start of the next section.
SECTION_MARKERS = ("Introduction", "Background", "1.", "I.")

# Only this many characters of the extracted abstract are summarized.
MAX_ABSTRACT_CHARS = 1024


# Function to extract abstract from PDF
def extract_abstract(pdf_path):
    """Return the abstract text of the PDF at *pdf_path*, or None.

    The abstract starts at the first occurrence of ``"Abstract"`` in the
    document. Text from that point to the end of its page is kept, followed
    by every subsequent page up to (but not including) the first page that
    contains one of ``SECTION_MARKERS``. If no later page contains a marker,
    everything from the abstract onward is returned instead of giving up.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, or None when no page contains ``"Abstract"``.
    """
    collected = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Extract each page only once; defensively treat a missing
            # result as empty text.
            page_text = page.extract_text() or ""

            if not collected:
                # Still looking for the page on which the abstract starts.
                marker_pos = page_text.find(ABSTRACT_MARKER)
                if marker_pos != -1:
                    # Drop whatever precedes the abstract on its page
                    # (title, authors, ...) so the caller's length limit
                    # is spent on the abstract itself.
                    collected.append(page_text[marker_pos:])
                continue

            # Past the abstract page: stop at the first page that looks
            # like the start of the next section.
            if any(marker in page_text for marker in SECTION_MARKERS):
                break
            collected.append(page_text)

    if not collected:
        return None
    return ''.join(collected)


@lru_cache(maxsize=1)
def _load_summarizer():
    """Load the tokenizer and model once and reuse them on later calls."""
    tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARY_MODEL_NAME)
    return tokenizer, model


# Function to summarize abstract using a pre-trained model
def summarize_abstract(text):
    """Summarize *text* with the pre-trained model and keep one sentence.

    Args:
        text: The text to summarize; it is truncated to 1000 tokens.

    Returns:
        The generated summary cut after its first period. If the summary
        contains no period it is returned unchanged.
    """
    # Loading the checkpoint is expensive, so it is cached across requests.
    tokenizer, model = _load_summarizer()

    encoded = tokenizer(text, max_length=1000, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        encoded['input_ids'],
        max_length=40,
        min_length=20,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        num_beams=3,
        do_sample=True,
        early_stopping=False
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Keep only the first sentence so the spoken summary stays short.
    first_sentence, period, _ = summary.partition('.')
    if period:
        summary = first_sentence + '.'
    return summary


# Function to convert text to speech
def convert_to_speech(text):
    """Render *text* as English speech with gTTS and return the audio bytes.

    Args:
        text: The text to speak.

    Returns:
        The bytes that gTTS wrote to an in-memory buffer.
    """
    tts = gTTS(text, lang='en')
    buffer = BytesIO()
    tts.write_to_fp(buffer)
    return buffer.getvalue()


# Function to process PDF and generate summary
def process_pdf(pdf_path):
    """Summarize the abstract of an uploaded PDF as text and as audio.

    A two-item tuple is returned on every path so that both output
    components declared in the interface below receive a value.

    Args:
        pdf_path: Path of the uploaded PDF, or None when nothing was
            uploaded.

    Returns:
        ``(summary, audio_bytes)`` on success, otherwise
        ``(explanatory_message, None)``.
    """
    if not pdf_path:
        return "Please upload a PDF file.", None

    abstract_text = extract_abstract(pdf_path)
    if not abstract_text:
        return "No abstract could be found in the uploaded PDF.", None

    summary = summarize_abstract(abstract_text[:MAX_ABSTRACT_CHARS])
    if not summary:
        return "The abstract could not be summarized.", None

    return summary, convert_to_speech(summary)


# Define Gradio interface
inputs = gr.File(label="Upload a PDF with an abstract")  # Add a label to the file input
summary_text = gr.Text(label="Written summary of the abstract")
audio_summary = gr.Audio(label="Audio summary of abstract")

# Build the Gradio interface with an example PDF
iface = gr.Interface(
    fn=process_pdf,
    inputs=inputs,
    outputs=[summary_text, audio_summary],
    title="Summarized Abstract",
    description="The app will summarize the abstract of a PDF and read it to the user.",
    examples=["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"
             ]
)

# Launch the Gradio interface
iface.launch()