# Source: HuggingFace Space by gianb, commit c755122 (web-page scrape residue removed)
#https://huggingface.co/spaces/gianb/PDF_Summarized_TTS
# Here are the imports
import gradio as gr
import PyPDF2
from transformers import pipeline, AutoProcessor, AutoModel, AutoTokenizer
from PyPDF2 import PdfReader
import torch
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
from pdfminer.high_level import extract_pages, extract_text
from io import BytesIO
# Load both models once at module import time so every request reuses them
# (model loading is slow; per-call loading would dominate latency).
summarization = pipeline('summarization', model='pszemraj/long-t5-tglobal-base-16384-book-summary')
synthesiser = pipeline("text-to-speech", model='facebook/mms-tts-eng')
def abstract_extract(uploaded_file):
    """Extract the Abstract section text from a PDF given as raw bytes.

    Scans pages in order; on the first page whose text contains
    "abstract" (case-insensitive), returns the span from "abstract" up to
    the following "introduction" heading, or to the end of that page's
    text when no "introduction" follows.

    Args:
        uploaded_file: The PDF file content as bytes.

    Returns:
        The extracted abstract text, or "" if no page mentions "abstract".
    """
    pdf_reader = PyPDF2.PdfReader(BytesIO(uploaded_file))
    for page in pdf_reader.pages:
        text = page.extract_text()
        lowered = text.lower()
        start_index = lowered.find("abstract")
        if start_index == -1:
            continue
        # Search for "introduction" only AFTER the abstract starts: a hit
        # earlier on the page (e.g. in a header) previously produced an
        # empty/reversed slice, and a miss (-1) silently dropped the last
        # character via text[start:-1].
        end_index = lowered.find("introduction", start_index)
        if end_index == -1:
            end_index = len(text)  # no Introduction heading: take the rest of the page
        return text[start_index:end_index]
    return ""
def summarize_and_speech(pdf_file):
    """Summarize a PDF's Abstract section and synthesize the summary as speech.

    Args:
        pdf_file: PDF content as bytes (from gr.File with type="binary").

    Returns:
        (summary, audio_bytes): the short summary string and an in-memory
        WAV file as bytes (None when no abstract was found).
    """
    abstract_text = abstract_extract(pdf_file)
    if not abstract_text:
        # Avoid feeding an empty string to the summarizer, which errors out.
        return "No Abstract section found in the PDF.", None
    summary = summarization(abstract_text, max_length=15, min_length=10)[0]['summary_text']
    tts_output = synthesiser(summary)
    # The text-to-speech pipeline returns {"audio": ndarray, "sampling_rate": int};
    # use the model's own rate instead of hard-coding 16000 Hz, falling back to
    # the original constant if the key is ever missing.
    audio_data = tts_output["audio"][0]
    sampling_rate = tts_output.get("sampling_rate", 16000)
    with BytesIO() as buffer:
        sf.write(buffer, audio_data, sampling_rate, format='wav')
        audio_bytes = buffer.getvalue()
    return summary, audio_bytes
# Gradio UI: binary PDF upload in -> (summary text, speech audio) out.
iface = gr.Interface(
    fn=summarize_and_speech,
    inputs=gr.File(label="Upload PDF", type="binary"),  # type="binary" delivers raw bytes
    outputs=[gr.Textbox(label="Abstract Summary:"), gr.Audio(label="Summary Speech")],
    live=True,  # re-run automatically whenever the uploaded file changes
    title="Abstract Research Paper Summarizer",
    description="Upload a Research Paper PDF File. The model will generate a one line summary of the Abstract section and a speech audio."
)
iface.launch()