import re

import torch
import soundfile as sf
import gradio as gr
from transformers import pipeline
from datasets import load_dataset
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar

description = """**SpeechAbstractor**\n
This app allows you to upload a .pdf article that contains an Abstract, summarizes the Abstract, and speaks the summary out loud.
Some examples are given below. Please, help yourself!"""

examples = [["Article_7.pdf"], ["Article_9.pdf"]]


# functions created for part 1: PDF parsing and abstract extraction

def text_extraction(element):
    # collect the text of the element plus the font name and size of every character
    line_text = element.get_text()
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            for character in text_line:
                if isinstance(character, LTChar):
                    line_formats.append(character.fontname)
                    line_formats.append(character.size)
    format_per_line = list(set(line_formats))
    return (line_text, format_per_line)


def read_pdf(pdf_pathy):
    # walk every page top to bottom and gather the text of each text container
    text_per_pagy = {}
    for pagenum, page in enumerate(extract_pages(pdf_pathy)):
        print("Elaborating Page_" + str(pagenum))
        page_text = []
        line_format = []
        page_content = []
        page_elements = [(element.y1, element) for element in page._objs]
        page_elements.sort(key=lambda a: a[0], reverse=True)
        for _, element in page_elements:
            if isinstance(element, LTTextContainer):
                (line_text, format_per_line) = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)
                page_content.append(line_text)
        dctkey = 'Page_' + str(pagenum)
        text_per_pagy[dctkey] = [page_text, line_format, page_content]
    return text_per_pagy


def clean_text(text):
    # collapse runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def extract_abstract(text_per_pagy):
    # take the text between the "Abstract" heading and the next section heading
    abstract_text = ""
    for page_text in text_per_pagy.values():
        if page_text:
            page_text = page_text.replace("- ", "")  # mend words hyphenated across line breaks
            start_index = page_text.find("Abstract")
            if start_index != -1:
                start_index += len("Abstract") + 1
                end_markers = ["Introduction", "Summary", "Overview", "Background"]
                end_index = -1
                for marker in end_markers:
                    temp_index = page_text.find(marker, start_index)
                    if temp_index != -1:
                        end_index = temp_index
                        break
                if end_index == -1:
                    end_index = len(page_text)
                abstract = page_text[start_index:end_index].strip()
                abstract_text += " " + abstract
                break
    return abstract_text


# main function: gets the filepath of the uploaded .pdf and does the whole job

def main_function(uploaded_filepath):
    # check that a file was actually uploaded
    if uploaded_filepath is None:
        return "No file loaded", None

    # read and process the file with read_pdf
    text_per_pagy = read_pdf(uploaded_filepath)

    # clean the text of every page, then extract the abstract
    for key, value in text_per_pagy.items():
        cleaned_text = clean_text(' '.join(value[0]))
        text_per_pagy[key] = cleaned_text

    abstract_text = extract_abstract(text_per_pagy)
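    # added safeguard (a sketch, not part of the original flow): if no
    # "Abstract" heading was found, fail gracefully instead of feeding an
    # empty string to the summarizer
    if not abstract_text.strip():
        return "No abstract found in this PDF.", None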
    # summarize the abstract with my pipeline and model, bounding the output length
    summarizer = pipeline("summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    summary = summarizer(abstract_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']

    # generate the audio from the summary; the x-vector picks the voice
    # (entry 7306 of the CMU Arctic speaker embeddings, as in the SpeechT5 examples)
    synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    speech = synthesiser(summary, forward_params={"speaker_embeddings": speaker_embedding})

    # save the audio to a .wav file next to the script
    audio_file_path = "summary.wav"
    sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"])

    # the function returns the 2 pieces we need
    return summary, audio_file_path


# tell gradio what the interface has to show
iface = gr.Interface(
    fn=main_function,
    inputs=gr.File(type="filepath"),
    outputs=[gr.Textbox(label="Summary Text"),
             gr.Audio(label="Summary Audio", type="filepath")],
    description=description,
    examples=examples,
)

# launch the app
if __name__ == "__main__":
    iface.launch()
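# optional smoke test, assuming one of the example PDFs above sits next to
# this script; handy for exercising the whole pipeline without the UI:
#
#   summary, audio_path = main_function("Article_7.pdf")
#   print(summary)
#   print("audio written to", audio_path)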