Spaces:

hannahisrael03
/

designing_ai_assignment

Sleeping

App Files Files Community

designing_ai_assignment / app.py

hannahisrael03

Update app.py

e39bb72 verified 5 months ago

raw

history blame contribute delete

2.56 kB

	# Set Up

	import torch
	from transformers import pipeline
	from datasets import load_dataset
	import soundfile as sf
	from transformers import AutoTokenizer
	from transformers import AutoModelForSeq2SeqLM
	from datasets import load_dataset
	import gradio as gr

	# Load the "multi_news" dataset at start-up.
	# NOTE(review): `dataset` is not referenced anywhere else in the visible file - confirm it is still needed.
	dataset = load_dataset("multi_news", trust_remote_code=True)

	# AUDIO: text-to-speech pipeline used to read the summary aloud.
	speech_name = "microsoft/speecht5_tts"
	synthesiser = pipeline(task="text-to-speech", model=speech_name)

	# Speaker x-vectors; entry 7306 of the validation split is the voice passed to the synthesiser.
	embeddings_dataset = load_dataset(
	    path="Matthijs/cmu-arctic-xvectors",
	    split="validation",
	)
	speaker_xvector = embeddings_dataset[7306]["xvector"]
	# unsqueeze(0) adds a leading dimension of size 1 in front of the vector.
	speaker_embedding = torch.tensor(speaker_xvector).unsqueeze(0)


	# Web Scraping
	from goose3 import Goose

	# GRADIO: NEWS SUMMARIZER + AUDIO READER (URL + TEXT INPUT)

	# Fine-tuned T5 news summarizer: the tokenizer and the seq2seq model are
	# loaded from the same checkpoint name.
	summarizer_checkpoint = "hannahisrael03/t5_news_summarizer"
	tokenizer = AutoTokenizer.from_pretrained(summarizer_checkpoint)
	model = AutoModelForSeq2SeqLM.from_pretrained(summarizer_checkpoint)

	def summarize_and_synthesize(input_type, input_value):
	    """Summarize a news article and synthesize the summary as speech.

	    Args:
	        input_type: "URL" to scrape the article from a web page, or "Text"
	            to summarize ``input_value`` directly.
	        input_value: The article URL or the raw article text.

	    Returns:
	        A ``(summary_text, audio_path)`` tuple. When the input cannot be
	        used (no input type selected, empty input, or a scraped article
	        shorter than the minimum length), the first element is a
	        user-facing message and the second is ``None``.
	    """
	    ARTICLE_MINIMUM_LENGTH = 600

	    # The radio button may be left unselected, in which case no valid type arrives here.
	    if input_type not in ("URL", "Text"):
	        return "Please select an input type: 'URL' or 'Text'.", None

	    cleaned_input = (input_value or "").strip()
	    if not cleaned_input:
	        return "Please enter a URL or paste the article text.", None

	    if input_type == "URL":
	        # The context manager closes the scraper once extraction is done.
	        with Goose() as g:
	            article = g.extract(url=cleaned_input)

	        article_body = article.cleaned_text or ""
	        if len(article_body) < ARTICLE_MINIMUM_LENGTH:
	            return "The article is too short or could not be properly scraped.", None
	    else:
	        article_body = cleaned_input

	    # Summarize the news article
	    inputs = tokenizer(
	        article_body,
	        return_tensors="pt",
	        max_length=512,
	        truncation=True,
	        padding="max_length",
	    )
	    summary_ids = model.generate(
	        inputs["input_ids"],
	        # The input is padded to max_length, so tell the model which positions are padding.
	        attention_mask=inputs["attention_mask"],
	        min_length=30,
	        max_length=100,
	        length_penalty=2.0,
	        num_beams=4,
	        early_stopping=True,
	    )
	    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

	    # Synthesize the summary into audio
	    speech = synthesiser(summary_text, forward_params={"speaker_embeddings": speaker_embedding})
	    audio_path = "summary_speech.wav"
	    sf.write(audio_path, speech["audio"], samplerate=speech["sampling_rate"])

	    return summary_text, audio_path


	# Input widgets: a selector for the input type and a box for the URL or article text.
	input_components = [
	    gr.Radio(["URL", "Text"], label="Input Type"),
	    gr.Textbox(label="Input Value"),
	]

	# Output widgets: the summary text and the audio file (passed as a file path).
	output_components = [
	    gr.Textbox(label="Summary Text"),
	    gr.Audio(label="Summary Audio", type="filepath"),
	]

	# Wire the summarize-and-read function to the widgets above.
	iface = gr.Interface(
	    fn=summarize_and_synthesize,
	    inputs=input_components,
	    outputs=output_components,
	    title="News Article Summarizer and Reader",
	    description="Select 'URL' to enter the URL of a news article, or select 'Text' to paste the article text directly. You will get a summary and hear the summary read aloud.",
	)

	# Start the Gradio app.
	iface.launch()