Spaces:

remzicam
/

ted_talks_summarizer

Running

App Files Files Community

ted_talks_summarizer / app.py

remzicam

Upload 2 files

f0396e5 about 2 years ago

raw

history blame

1.81 kB

	"""TED Talks Summarizer App."""

	from re import sub

	from gradio import Interface, Series, Textbox
	from requests import get


	def clean_text(text):
	    """Clean the subtitle text of a TED talk.

	    The first line of the input is always dropped (presumably the ``WEBVTT``
	    header of the subtitle file), parenthesised annotations such as
	    "(Applause)" are removed, cue timing lines are discarded and the
	    remaining text is joined into a single space-separated string.

	    Args:
	        text (str): subtitle of ted talk

	    Returns:
	        cleaned_text (str): cleaned version of subtitle text
	    """
	    # Remove parenthesised annotations (e.g. applause). The match stops at the
	    # first ")" on the same line: a greedy ".*" would also delete the speech
	    # between two annotations, as in "(Applause) Thank you. (Laughter)".
	    text = sub(r"\([^)\n]*\)", "", text)
	    # Skip the first line, then drop timestamp lines, which contain "-->".
	    lines = [line for line in text.split("\n")[1:] if "-->" not in line]
	    # str.split() without arguments discards empty lines, the "\r" left over
	    # from CRLF line endings and the gaps left behind by removed annotations.
	    cleaned_text = " ".join(" ".join(lines).split())
	    return cleaned_text


	def ted_talk_transcriber(link):
	    """Create the transcription of a TED talk from its url.

	    The talk page is downloaded and searched for the "project_masters/<id>/"
	    fragment; that id is then used to fetch the English subtitle file, which
	    is cleaned with ``clean_text``.

	    Args:
	        link (str): url link of ted talks

	    Returns:
	        cleaned_transcript (str): transcription of the ted talk

	    Raises:
	        ValueError: if the page does not contain a "project_masters/" id.
	        requests.RequestException: if a request fails, times out or returns
	            an HTTP error status.
	    """
	    # Without a timeout a stalled connection would block the app indefinitely.
	    timeout = 30
	    # request link of the talk
	    page = get(link, timeout=timeout)
	    page.raise_for_status()
	    # extract unique talk id to reach subtitle file; search the decoded text
	    # rather than the repr of the raw bytes
	    _, marker, remainder = page.text.partition("project_masters/")
	    if not marker:
	        raise ValueError(
	            f"Could not find a talk id on the page: {link!r}. "
	            "Is this a TED talk link?"
	        )
	    talk_id = remainder.split("/", 1)[0]
	    subtitles = get(
	        f"https://hls.ted.com/project_masters/{talk_id}/subtitles/en/full.vtt",
	        timeout=timeout,
	    )
	    # Fail loudly instead of cleaning and summarising an error page.
	    subtitles.raise_for_status()
	    cleaned_transcript = clean_text(subtitles.text)
	    return cleaned_transcript


	# Stage 1: text-in/text-out interface that turns a TED talk link into its
	# cleaned transcript.
	transcriber = Interface(
	    fn=ted_talk_transcriber,
	    inputs="text",
	    outputs="text",
	)

	# Stage 2: summarization model loaded by its "huggingface/..." name.
	summarizer = Interface.load(
	    "huggingface/pszemraj/long-t5-tglobal-base-16384-book-summary"
	)

	# HTML snippet used as the app description; it shows the TED logo image.
	logo = "<center><img src='file/TED.png' width=180px></center>"

	# Combine both stages with Series (transcriber first, then summarizer) and
	# launch the app. The single text box takes the talk link; flagging is off.
	Series(
	    transcriber,
	    summarizer,
	    inputs=Textbox(label="Type the TED Talks link"),
	    examples=[
	        "https://www.ted.com/talks/jen_gunter_the_truth_about_yeast_in_your_body"
	    ],
	    allow_flagging="never",
	    description=logo,
	).launch()