Spaces:

Alioth86
/

SpeechAbstractor

Runtime error

App Files Files Community

SpeechAbstractor / app.py

Alioth86

Add application file

842c3de over 1 year ago

raw

history blame contribute delete

6.32 kB

	# https://huggingface.co/spaces/Alioth86/SpeechAbstractor
	#Please note that I have recombined the functions I created for part 1 of the assessment.
	#I have added a main function to connect them (for this main function I got some help from ChatGPT-4).
	#I have created the input/output parts, the title and the description,
	#and all the Gradio features according to the Gradio website instructions.
	#Please note that I have uploaded it all through git and git LFS.

	#Here are the imports
	import PyPDF2
	import pdfplumber
	from pdfminer.high_level import extract_pages, extract_text
	from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
	import re
	import torch
	import transformers
	from transformers import pipeline
	import nltk
	from datasets import load_dataset
	import soundfile as sf
	from IPython.display import Audio
	import sentencepiece as spm
	import os
	import tempfile
	import gradio as gr

	#Here is the code
	# Example inputs offered by the interface: one inner list per example,
	# holding the value for the single file input.
	examples = [
	    ["Article_7.pdf"],
	    ["Article_11.pdf"],
	]

	# Heading passed to the Gradio interface.
	title = "SpeechAbstractor"

	# Introductory text passed to the Gradio interface.
	description = """
	This app enables users to upload academic articles in PDF format, specifically focusing on abstracts.
	It efficiently summarizes the abstract and provides an audio playback of the summarized content.
	Below are some example PDFs for you to experiment with. Feel free to explore the functionality of SpeechAbstractor!
	(Please note: it works only with articles with an abstract)."""

	#Reusing the functions created for part 1
	def text_extraction(element):
	    """Return an element's text together with the formats used in it.

	    Args:
	        element: A layout object exposing ``get_text()`` and iterable over
	            its text lines.

	    Returns:
	        A tuple ``(text, formats)`` where ``formats`` lists the distinct
	        font names and font sizes of the characters found in the element's
	        text lines. The list comes from a set, so its order is unspecified.
	    """
	    text = element.get_text()

	    # Flatten (fontname, size) of every character of every text line.
	    attributes = [
	        attribute
	        for text_line in element
	        if isinstance(text_line, LTTextContainer)
	        for character in text_line
	        if isinstance(character, LTChar)
	        for attribute in (character.fontname, character.size)
	    ]

	    return (text, list(set(attributes)))

	def read_pdf(pdf_pathy):
	    """Extract the text blocks of every page of a PDF.

	    Args:
	        pdf_pathy: Path of the PDF file to read.

	    Returns:
	        A dict keyed ``"Page_<n>"`` (``n`` starting at 0). Each value is the
	        list ``[page_text, line_format, page_content]``: ``page_text`` and
	        ``page_content`` both hold the text of each text element of the
	        page, and ``line_format`` holds the matching format list returned
	        by ``text_extraction``.
	    """
	    text_per_pagy = {}
	    for pagenum, page in enumerate(extract_pages(pdf_pathy)):
	        print("Elaborating Page_" + str(pagenum))
	        page_text = []
	        line_format = []

	        # Visit elements by descending upper edge (largest y1 first), i.e.
	        # from the top of the page down in PDF coordinates. The sort is
	        # stable, so elements sharing a y1 keep their layout order.
	        for element in sorted(page, key=lambda item: item.y1, reverse=True):
	            if isinstance(element, LTTextContainer):
	                line_text, format_per_line = text_extraction(element)
	                page_text.append(line_text)
	                line_format.append(format_per_line)

	        # page_content mirrors page_text but stays a separate list object.
	        text_per_pagy['Page_' + str(pagenum)] = [page_text, line_format, list(page_text)]

	    return text_per_pagy


	# Matches any run of one or more whitespace characters.
	_WHITESPACE_RUN = re.compile(r'\s+')


	def clean_text(text):
	    """Collapse each whitespace run in ``text`` to one space and trim the ends."""
	    return _WHITESPACE_RUN.sub(' ', text).strip()


	def extract_abstract(text_per_pagy):
	    """Return the abstract found on the first page that contains one.

	    Args:
	        text_per_pagy: Mapping whose values are the text of each page;
	            pages are scanned in the mapping's iteration order.

	    Returns:
	        A single space followed by the stripped abstract text, or ``""``
	        when no page contains an abstract heading.
	    """
	    headings = ("Abstract", "abstract", "ABSTRACT")
	    end_markers = ("Introduction", "INTRODUCTION", "Background", "Contents", "Keywords")

	    for raw_page in text_per_pagy.values():
	        if not raw_page:
	            continue

	        # Re-join words that were hyphenated across a line break.
	        page = raw_page.replace("- ", "")

	        # Headings are tried in priority order, not by position in the page.
	        heading = next((name for name in headings if name in page), None)
	        if heading is None:
	            continue

	        # Skip the heading itself plus the one character that follows it.
	        begin = page.find(heading) + len(heading) + 1

	        # The first marker (in list order) present after the heading ends
	        # the abstract; without any marker it runs to the end of the page.
	        positions = (page.find(marker, begin) for marker in end_markers)
	        finish = next((pos for pos in positions if pos != -1), len(page))

	        return " " + page[begin:finish].strip()

	    return ""

	#let's define a main function that gets the uploaded file (pdf) to do the job
	# Heavy objects (models, speaker embedding) kept between requests.
	_RESOURCE_CACHE = {}


	def _cached(key, loader):
	    """Return the object cached under ``key``, building it with ``loader`` on first use."""
	    if key not in _RESOURCE_CACHE:
	        _RESOURCE_CACHE[key] = loader()
	    return _RESOURCE_CACHE[key]


	def main_function(uploaded_filepath):
	    """Summarise the abstract of an uploaded PDF and read the summary aloud.

	    Args:
	        uploaded_filepath: Path of the uploaded PDF, or ``None`` when
	            nothing was uploaded.

	    Returns:
	        A tuple ``(text, audio_path)``. On success ``text`` is the first
	        sentence of the generated summary and ``audio_path`` is a WAV file
	        containing it as speech. When there is no file, no abstract or no
	        summary, ``text`` is an explanatory message and ``audio_path`` is
	        ``None``.
	    """
	    if uploaded_filepath is None:
	        return "No file loaded", None

	    # Read the PDF and flatten every page to one cleaned string.
	    text_per_pagy = read_pdf(uploaded_filepath)
	    cleaned_pages = {
	        key: clean_text(' '.join(value[0]))
	        for key, value in text_per_pagy.items()
	    }
	    abstract_text = extract_abstract(cleaned_pages)
	    if not abstract_text.strip():
	        # Nothing to summarise; avoid feeding empty text to the models.
	        return "No abstract found", None

	    # Fetch the sentence tokenizer data only when it is not installed yet.
	    try:
	        nltk.data.find('tokenizers/punkt')
	    except LookupError:
	        nltk.download('punkt')

	    # Summarise the abstract, bounding the summary length.
	    summarizer = _cached(
	        "summarizer",
	        lambda: pipeline("summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify"),
	    )
	    summary = summarizer(abstract_text, max_length=100, min_length=50, do_sample=False)[0]['summary_text']

	    # Keep only the first sentence of the summary.
	    sentences = nltk.tokenize.sent_tokenize(summary)
	    if not sentences:
	        return "No summary could be generated", None
	    first_sentence = sentences[0]

	    # Turn the sentence into speech with a fixed speaker embedding.
	    synthesiser = _cached(
	        "synthesiser",
	        lambda: pipeline("text-to-speech", model="microsoft/speecht5_tts"),
	    )
	    speaker_embedding = _cached(
	        "speaker_embedding",
	        lambda: torch.tensor(
	            load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
	        ).unsqueeze(0),
	    )
	    speech = synthesiser(first_sentence, forward_params={"speaker_embeddings": speaker_embedding})

	    # Write to a uniquely named temporary file so simultaneous requests do
	    # not overwrite each other's audio. The file is kept (delete=False)
	    # because it has to outlive this call for the audio to be served.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
	        audio_file_path = audio_file.name
	    sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"])

	    return first_sentence, audio_file_path

	#let's communicate with gradio what it has to put in
	# Input: an uploaded file, handed to main_function as a path.
	upload_input = gr.File(type="filepath")

	# Outputs: the summary text and the audio file holding its spoken version.
	summary_outputs = [
	    gr.Textbox(label="Abstract Summary"),
	    gr.Audio(label="Abstract Summary Audio", type="filepath"),
	]

	iface = gr.Interface(
	    fn=main_function,
	    inputs=upload_input,
	    outputs=summary_outputs,
	    title=title,
	    description=description,
	    examples=examples,
	)

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()