"""SpeechAbstractor: summarize the abstract of a PDF article and read it aloud.

The module exposes a Gradio interface (``iface``) whose callback,
``main_function``, extracts the text of an uploaded PDF with pdfminer, isolates
the abstract, summarizes it with a Hugging Face summarization pipeline and
synthesizes the summary to a WAV file with a text-to-speech pipeline.
"""

import functools
import os
import re
import tempfile

import gradio as gr
import pdfplumber
import PyPDF2
import sentencepiece as spm
import soundfile as sf
import torch
import transformers
from datasets import load_dataset
from IPython.display import Audio, Markdown, display
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTChar, LTFigure, LTRect, LTTextContainer
from transformers import pipeline

# Model / dataset identifiers and tuning values used by main_function.
_SUMMARY_MODEL = "pszemraj/long-t5-tglobal-base-sci-simplify"
_SUMMARY_MAX_LENGTH = 65
_TTS_MODEL = "microsoft/speecht5_tts"
_XVECTOR_DATASET = "Matthijs/cmu-arctic-xvectors"
_SPEAKER_INDEX = 7306

# Headings that open an abstract, tried in this order.
_ABSTRACT_HEADINGS = ("Abstract", "abstract", "ABSTRACT")
# Section titles that close an abstract, tried in this order (first hit wins).
_ABSTRACT_END_MARKERS = (
    "Introduction",
    "INTRODUCTION",
    "Background",
    "Contents",
    "Keywords",
)

# The description must be a plain string: display() returns None when no
# display_id is given, so assigning its result would hand Gradio an empty
# description.
description = (
    "##**SpeechAbstractor**\n"
    " This app enables users to upload academic articles in PDF format,"
    " specifically focusing on abstracts. It efficiently summarizes the"
    " abstract and provides an audio playback of the summarized content."
    " Below are some example PDFs for you to experiment with. Feel free to"
    " explore the functionality of SpeechAbstractor! (Please note: it works"
    " only with articles with an Abstract)."
)
# Kept so the text is still rendered when the module is run in a notebook.
display(Markdown(description))

examples = [
    ["Article_7.pdf"],
    ["Article_11.pdf"],
]


def text_extraction(element):
    """Return the text of a pdfminer text element and the formats it uses.

    Args:
        element: An iterable layout element exposing ``get_text()``
            (callers in this file pass ``LTTextContainer`` instances).

    Returns:
        A ``(line_text, format_per_line)`` tuple where ``line_text`` is
        ``element.get_text()`` and ``format_per_line`` is a de-duplicated list
        (in no particular order) of the font names and font sizes of every
        ``LTChar`` found in the element's text lines.
    """
    line_text = element.get_text()
    formats = set()
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for character in text_line:
            if isinstance(character, LTChar):
                formats.add(character.fontname)
                formats.add(character.size)
    return (line_text, list(formats))


def read_pdf(pdf_pathy):
    """Extract the text blocks of every page of a PDF.

    Args:
        pdf_pathy: Path of the PDF file, passed to pdfminer's
            ``extract_pages``.

    Returns:
        A dict mapping ``'Page_<n>'`` (``n`` starting at 0) to a list
        ``[page_text, line_format, page_content]``: the text of each text
        element from top to bottom of the page, the formats reported by
        ``text_extraction`` for each of them, and a second list holding the
        same texts as ``page_text``.
    """
    text_per_pagy = {}
    for pagenum, page in enumerate(extract_pages(pdf_pathy)):
        print(f"Elaborating Page_{pagenum}")
        page_text = []
        line_format = []
        page_content = []

        # Highest y1 first, i.e. reading order from the top of the page down.
        elements = sorted(page, key=lambda item: item.y1, reverse=True)
        for element in elements:
            if isinstance(element, LTTextContainer):
                line_text, format_per_line = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)
                page_content.append(line_text)

        text_per_pagy[f"Page_{pagenum}"] = [page_text, line_format, page_content]
    return text_per_pagy


def clean_text(text):
    """Collapse every run of whitespace in *text* to one space and trim it."""
    return re.sub(r'\s+', ' ', text).strip()


def extract_abstract(text_per_pagy):
    """Return the abstract found on the first page that contains one.

    Args:
        text_per_pagy: Dict mapping a page key to that page's text as a
            single string.

    Returns:
        A space followed by the text between the first matching abstract
        heading and the first matching end marker (or the end of the page
        when no marker follows), or ``""`` when no page has a heading.
    """
    for page_text in text_per_pagy.values():
        if not page_text:
            continue
        # Re-join words that were hyphenated across a line break.
        page_text = page_text.replace("- ", "")

        start_index = -1
        for variant in _ABSTRACT_HEADINGS:
            position = page_text.find(variant)
            if position != -1:
                # Skip the heading plus the single character following it.
                start_index = position + len(variant) + 1
                break
        if start_index == -1:
            continue

        end_index = len(page_text)
        for marker in _ABSTRACT_END_MARKERS:
            position = page_text.find(marker, start_index)
            if position != -1:
                end_index = position
                break

        return " " + page_text[start_index:end_index].strip()
    return ""


@functools.lru_cache(maxsize=None)
def _load_summarizer():
    """Build the summarization pipeline once and reuse it across requests."""
    return pipeline("summarization", model=_SUMMARY_MODEL)


@functools.lru_cache(maxsize=None)
def _load_synthesiser():
    """Build the text-to-speech pipeline once and reuse it across requests."""
    return pipeline("text-to-speech", model=_TTS_MODEL)


@functools.lru_cache(maxsize=None)
def _load_speaker_embedding():
    """Load the speaker x-vector once, as a tensor with a leading batch axis."""
    embeddings_dataset = load_dataset(_XVECTOR_DATASET, split="validation")
    xvector = embeddings_dataset[_SPEAKER_INDEX]["xvector"]
    return torch.tensor(xvector).unsqueeze(0)


def main_function(uploaded_filepath):
    """Summarize the abstract of an uploaded PDF and synthesize it to audio.

    Args:
        uploaded_filepath: Path of the uploaded PDF, or ``None`` when nothing
            was uploaded.

    Returns:
        A ``(text, audio_path)`` tuple. ``text`` is the summary and
        ``audio_path`` the path of a WAV file with the spoken summary. When no
        file was uploaded or no abstract could be located, ``text`` is an
        explanatory message and ``audio_path`` is ``None``.
    """
    if uploaded_filepath is None:
        return "No file loaded", None

    # One cleaned string per page, built from the page_text entry of read_pdf.
    cleaned_pages = {
        key: clean_text(' '.join(value[0]))
        for key, value in read_pdf(uploaded_filepath).items()
    }
    abstract_text = extract_abstract(cleaned_pages)
    if not abstract_text.strip():
        # Avoid running the models on empty input.
        return "No abstract found in the uploaded file", None

    summarizer = _load_summarizer()
    summary = summarizer(
        abstract_text, max_length=_SUMMARY_MAX_LENGTH, do_sample=False
    )[0]['summary_text']

    synthesiser = _load_synthesiser()
    speech = synthesiser(
        summary,
        forward_params={"speaker_embeddings": _load_speaker_embedding()},
    )

    # A unique temporary file keeps concurrent requests from overwriting each
    # other's audio. It is left on disk because the caller needs the path
    # after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
        audio_file_path = audio_file.name
    sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"])

    return summary, audio_file_path


# Gradio interface: one PDF upload in, summary text and summary audio out.
iface = gr.Interface(
    fn=main_function,
    inputs=gr.File(type="filepath"),
    outputs=[
        gr.Textbox(label="Summary Text"),
        gr.Audio(label="Summary Audio", type="filepath"),
    ],
    description=description,
    examples=examples,
)

if __name__ == "__main__":
    iface.launch()