Spaces:

Elrmnd
/

Vocal-PDF-Summarizer

Sleeping

App Files Files Community

Vocal-PDF-Summarizer / app.py

Elrmnd

Update app.py

dda7485 10 months ago

raw

history blame contribute delete

No virus

3.69 kB

	# https://elrmnd-vocal-pdf-summarizer.hf.space

	# Import libraries

	import gradio as gr
	import PyPDF2
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	from gtts import gTTS
	from io import BytesIO

	# Function to extract text from PDF
	# Extracts the raw text of the first page of a PDF file
	def extract_text(pdf_file):
	    """Return the text of the first page of a PDF, or "" when there is none.

	    Only page 0 is read; later pages are ignored.

	    Args:
	        pdf_file: The uploaded PDF, passed straight to ``PyPDF2.PdfReader``
	            (presumably a path or a binary file object -- whatever the
	            Gradio file input delivers).

	    Returns:
	        The text extracted from the first page. An empty string is returned
	        when the document has no pages or the page yields no text, so the
	        caller never has to deal with ``IndexError`` or ``None``.
	    """
	    reader = PyPDF2.PdfReader(pdf_file)
	    pages = reader.pages
	    # A page-less PDF used to crash on ``pages[0]``.
	    if len(pages) == 0:
	        return ""
	    return pages[0].extract_text() or ""

	# Function to summarize text
	# Defines a function to summarize the abstract found in the extracted text using pszemraj/led-base-book-summary
	_SUMMARY_MODEL_NAME = "pszemraj/led-base-book-summary"
	_ABSTRACT_NOT_FOUND = "Abstract not found in the document."
	# How many ". "-separated chunks are taken after the chunk holding the heading.
	_ABSTRACT_SENTENCES = 7
	# Filled on first use so the checkpoint is loaded once per process
	# instead of once per request.
	_summarizer_cache = {}


	def _load_summarizer():
	    """Return the cached (tokenizer, model) pair, loading it on first use."""
	    if "pair" not in _summarizer_cache:
	        tokenizer = AutoTokenizer.from_pretrained(_SUMMARY_MODEL_NAME)
	        model = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARY_MODEL_NAME)
	        # Store only after both loads succeeded so a failure is retried.
	        _summarizer_cache["pair"] = (tokenizer, model)
	    return _summarizer_cache["pair"]


	def _extract_abstract(text):
	    """Return the text following the first "Abstract" heading, or ""."""
	    sentences = text.split(". ")
	    for i, sentence in enumerate(sentences):
	        if "Abstract" in sentence:
	            # Splitting on ". " leaves the heading and the first sentence of
	            # the abstract in the same chunk, so keep what follows the
	            # heading instead of discarding the whole chunk.
	            opening = sentence.partition("Abstract")[2].strip(" \t\r\n:.-\u2013\u2014")
	            following = sentences[i + 1:i + 1 + _ABSTRACT_SENTENCES]
	            parts = ([opening] if opening else []) + following
	            return ". ".join(parts).strip()
	    return ""


	def summarize_text(text):
	    """Summarize the abstract contained in ``text``.

	    The text is searched (case-sensitively) for the first "Abstract"
	    heading; the passage after it is summarized with the
	    ``pszemraj/led-base-book-summary`` checkpoint.

	    Args:
	        text: Raw text of the document (may be empty or ``None``).

	    Returns:
	        The generated summary, cut after its last full stop when it
	        contains one, or "Abstract not found in the document." when no
	        heading -- or no text after the heading -- is present. Sampling is
	        enabled in generation, so repeated calls may give different
	        summaries.
	    """
	    abstract = _extract_abstract(text) if text else ""
	    if not abstract:
	        # Also covers a heading with nothing after it, which previously sent
	        # an empty string to the model.
	        return _ABSTRACT_NOT_FOUND

	    tokenizer, model = _load_summarizer()

	    # Tokenize abstract
	    inputs = tokenizer(abstract,
	                       max_length=1024,
	                       return_tensors="pt",
	                       truncation=True)

	    # Generate summary
	    summary_ids = model.generate(inputs['input_ids'],
	                                 attention_mask=inputs['attention_mask'],
	                                 max_length=50,
	                                 min_length=30,
	                                 no_repeat_ngram_size=3,
	                                 encoder_no_repeat_ngram_size=3,
	                                 repetition_penalty=3.5,
	                                 num_beams=4,
	                                 do_sample=True,
	                                 early_stopping=False)

	    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

	    # Drop a trailing sentence fragment left by the max_length cut-off.
	    last_stop = summary.rfind('.')
	    if last_stop != -1:
	        summary = summary[:last_stop + 1]

	    return summary

	# Function to convert text to audio
	# Defines a function to convert text to an audio file using Google Text-to-Speech
	def text_to_audio(text):
	    """Convert ``text`` to English speech with gTTS and return the raw bytes.

	    Args:
	        text: The text to be spoken.

	    Returns:
	        The bytes gTTS wrote for the synthesized audio.
	    """
	    speech = gTTS(text, lang='en')
	    audio_stream = BytesIO()
	    speech.write_to_fp(audio_stream)
	    # getvalue() yields the whole buffer regardless of the stream position.
	    return audio_stream.getvalue()

	### Main function
	### The main function that ties everything together:
	### extracts text, summarizes, and converts to audio.
	def audio_pdf(pdf_file):
	    """Run the full pipeline: PDF -> text -> summary -> spoken summary.

	    Args:
	        pdf_file: The uploaded PDF as delivered by the Gradio file input.

	    Returns:
	        A ``(summary, audio)`` tuple: the summary text and the audio
	        produced from it by ``text_to_audio``.
	    """
	    written_summary = summarize_text(extract_text(pdf_file))
	    spoken_summary = text_to_audio(written_summary)
	    return written_summary, spoken_summary

	# Define Gradio interface
	# Gradio web interface with a file input, a text output that displays the summary,
	# and an audio output that plays the spoken summary. The interface is launched below.
	# Text shown under the title on the demo page.
	_DESCRIPTION = "I will summarize PDFs that have an abstract and transform them into audio. If an abstract is not present in the document, a message will be displayed."

	# Sample PDFs offered as one-click examples.
	_EXAMPLE_PDFS = [
	    "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf",
	    "Article 6 BloombergGPT_ A Large Language Model for Finance.pdf",
	    "Article 5 A Comprehensive Survey on Applications of Transformers for Deep Learning Tasks.pdf",
	    "Article 8 Llama 2_ Open Foundation and Fine-Tuned Chat Models.pdf",
	]

	# Widgets: one file upload in, summary text and audio out.
	inputs = gr.File()
	summary_text = gr.Text()
	audio_summary = gr.Audio()

	# audio_pdf returns (summary, audio), matching the order of the outputs.
	iface = gr.Interface(
	    fn=audio_pdf,
	    inputs=inputs,
	    outputs=[summary_text, audio_summary],
	    title="The Vocal PDF Summarizer",
	    description=_DESCRIPTION,
	    examples=_EXAMPLE_PDFS,
	)

	# Launch the interface
	iface.launch()