# https://huggingface.co/spaces/Alioth86/SpeechAbstractor
# Please consider that I have recombined the functions I created for part 1
# of the assessment.
# I have added a main function to connect them (for this main function I got
# some help from ChatGPT-4).
# I have created the input/output parts, the title and the description, and
# all the Gradio features according to the Gradio website instructions.
# Please note that I have uploaded it all through git and git LFS.

# Here are the imports
import functools
import os
import re
import tempfile

import PyPDF2
import pdfplumber
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import torch
import transformers
from transformers import pipeline
import nltk
from datasets import load_dataset
import soundfile as sf
from IPython.display import Audio
import sentencepiece as spm
import gradio as gr

# Here is the code
title = "SpeechAbstractor"

# Adjacent literals keep the text shown in the UI byte-identical while
# keeping the source lines readable.
description = (
    " This app enables users to upload academic articles in PDF format, "
    "specifically focusing on abstracts. It efficiently summarizes the "
    "abstract and provides an audio playback of the summarized content. "
    "Below are some example PDFs for you to experiment with. Feel free to "
    "explore the functionality of SpeechAbstractor! \n"
    "(Please note: it works only with articles with an abstract)."
)

examples = [["Article_7.pdf"], ["Article_11.pdf"]]

# Models and data used by main_function.
SUMMARY_MODEL = "pszemraj/long-t5-tglobal-base-sci-simplify"
TTS_MODEL = "microsoft/speecht5_tts"
SPEAKER_DATASET = "Matthijs/cmu-arctic-xvectors"
SPEAKER_INDEX = 7306  # row of the x-vector dataset used as the voice


# reporting the functions created for the part 1
def text_extraction(element):
    """Return the text of a layout element and the formats found in it.

    Args:
        element: an iterable pdfminer layout element exposing ``get_text()``.

    Returns:
        A ``(line_text, format_per_line)`` tuple, where ``format_per_line`` is
        a list (in no particular order) of the distinct font names and font
        sizes of the ``LTChar`` objects found inside the element's text lines.
    """
    line_text = element.get_text()
    formats = set()
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for character in text_line:
            if isinstance(character, LTChar):
                formats.add(character.fontname)
                formats.add(character.size)
    return (line_text, list(formats))


def read_pdf(pdf_pathy):
    """Extract the text of every page of a PDF.

    The previous version also opened the file with PyPDF2 and, once per page,
    with pdfplumber, without ever using (or, for pdfplumber, closing) those
    handles. They are gone: pdfminer's ``extract_pages`` is the only reader
    the result depends on.

    Args:
        pdf_pathy: path of the PDF file to read.

    Returns:
        A dict mapping ``'Page_<n>'`` (n starts at 0) to
        ``[page_text, line_format, page_content]``: the text of each text
        element, the formats of each text element, and a second list holding
        the same texts as ``page_text``.
    """
    text_per_pagy = {}
    for pagenum, page in enumerate(extract_pages(pdf_pathy)):
        print("Elaborating Page_" + str(pagenum))
        page_text = []
        line_format = []
        page_content = []
        # Visit elements by descending y1 (presumably top of the page first);
        # sorted() is stable, so ties keep their original order.
        for element in sorted(page, key=lambda item: item.y1, reverse=True):
            if isinstance(element, LTTextContainer):
                line_text, format_per_line = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)
                page_content.append(line_text)
        text_per_pagy["Page_" + str(pagenum)] = [
            page_text,
            line_format,
            page_content,
        ]
    return text_per_pagy


def clean_text(text):
    """Collapse every run of whitespace in *text* to one space and strip it."""
    return re.sub(r'\s+', ' ', text).strip()


def extract_abstract(text_per_pagy):
    """Return the abstract found on the first page that contains one.

    Args:
        text_per_pagy: dict mapping page keys to the page text as a string.

    Returns:
        A string made of one leading space followed by the stripped abstract,
        or ``""`` when no page contains an "Abstract" heading.
    """
    start_variants = ["Abstract", "abstract", "ABSTRACT"]
    # Checked in this order: the first marker that occurs after the heading
    # wins, even if a later marker in the list occurs earlier in the text.
    end_markers = ["Introduction", "INTRODUCTION", "Background", "Contents",
                   "Keywords"]

    for page_text in text_per_pagy.values():
        if not page_text:
            continue
        # Drop "- " sequences (presumably line-break hyphenation leftovers).
        page_text = page_text.replace("- ", "")

        start_index = -1
        for variant in start_variants:
            found_at = page_text.find(variant)
            if found_at != -1:
                # Skip the heading itself plus the character that follows it.
                start_index = found_at + len(variant) + 1
                break
        if start_index == -1:
            continue

        end_index = len(page_text)
        for marker in end_markers:
            marker_at = page_text.find(marker, start_index)
            if marker_at != -1:
                end_index = marker_at
                break

        # Only the first page with an abstract heading is used.
        return " " + page_text[start_index:end_index].strip()
    return ""


@functools.lru_cache(maxsize=None)
def _get_summarizer():
    """Load the summarization pipeline once and reuse it across requests."""
    return pipeline("summarization", model=SUMMARY_MODEL)


@functools.lru_cache(maxsize=None)
def _get_synthesiser():
    """Load the text-to-speech pipeline once and reuse it across requests."""
    return pipeline("text-to-speech", model=TTS_MODEL)


@functools.lru_cache(maxsize=None)
def _get_speaker_embedding():
    """Load the speaker x-vector once, with a leading dimension of size 1."""
    embeddings_dataset = load_dataset(SPEAKER_DATASET, split="validation")
    return torch.tensor(
        embeddings_dataset[SPEAKER_INDEX]["xvector"]
    ).unsqueeze(0)


@functools.lru_cache(maxsize=None)
def _ensure_punkt():
    """Download the NLTK 'punkt' tokenizer data once per process."""
    return nltk.download('punkt')


def _first_sentence(text):
    """Return the first sentence of *text*, or ``""`` if it has none."""
    _ensure_punkt()
    try:
        sentences = nltk.tokenize.sent_tokenize(text)
    except LookupError:
        # NOTE(review): newer NLTK releases appear to look for "punkt_tab"
        # instead of "punkt"; fetch it and retry rather than failing.
        nltk.download('punkt_tab')
        sentences = nltk.tokenize.sent_tokenize(text)
    return sentences[0] if sentences else ""


# let's define a main function that gets the uploaded file (pdf) to do the job
def main_function(uploaded_filepath):
    """Summarise the abstract of an uploaded PDF and read the summary aloud.

    Args:
        uploaded_filepath: path of the uploaded PDF, or ``None`` when nothing
            was uploaded.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the first
        sentence of the summary and ``audio_path`` is a WAV file of it. When
        there is no file, no abstract, or no usable summary, ``text`` is a
        short message and ``audio_path`` is ``None``.
    """
    # put a control to see if there is a file uploaded
    if uploaded_filepath is None:
        return "No file loaded", None

    # read and process the file according to read_pdf
    text_per_pagy = read_pdf(uploaded_filepath)

    # cleaning the text and getting the abstract using the 2 other functions
    cleaned_pages = {
        key: clean_text(' '.join(value[0]))
        for key, value in text_per_pagy.items()
    }
    abstract_text = extract_abstract(cleaned_pages)
    if not abstract_text.strip():
        # Nothing to summarise: do not feed an empty string to the model.
        return "No abstract found in the uploaded PDF", None

    # abstract the summary with my pipeline and model, deciding the length
    summary = _get_summarizer()(
        abstract_text, max_length=100, min_length=50, do_sample=False
    )[0]['summary_text']

    # keeping just the first sentence, to be sure.
    first_sentence = _first_sentence(summary)
    if not first_sentence:
        return "The summary could not be generated", None

    # generating the audio from the text, with my pipeline and model
    speech = _get_synthesiser()(
        first_sentence,
        forward_params={"speaker_embeddings": _get_speaker_embedding()},
    )

    # saving the audio in a temporary file; a unique name per request keeps
    # concurrent users from overwriting each other's audio. The file is
    # closed before writing and kept on disk (delete=False) for Gradio.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_file_path = tmp.name
    sf.write(audio_file_path, speech["audio"],
             samplerate=speech["sampling_rate"])

    # the function returns the 2 pieces we need
    return first_sentence, audio_file_path


# let's communicate with gradio what it has to put in
iface = gr.Interface(
    fn=main_function,
    inputs=gr.File(type="filepath"),
    outputs=[
        gr.Textbox(label="Abstract Summary"),
        gr.Audio(label="Abstract Summary Audio", type="filepath"),
    ],
    title=title,
    description=description,
    examples=examples,
)

# launching the app
if __name__ == "__main__":
    iface.launch()