# remzicam's picture
# Upload 2 files
# c27e127
"""TED Talks Summarizer App."""
from re import sub
from gradio import Interface, Textbox
from requests import get
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
# identifier of the pretrained checkpoint handed to both from_pretrained calls
repo_id = "pszemraj/led-base-book-summary"
# tokenizer and model are loaded from the same checkpoint identifier
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, low_cpu_mem_usage=True)
# single shared summarization pipeline, built once at import time
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
def clean_text(text: str) -> str:
    """Cleans subtitle text of ted talks.

    The first line of the input (the subtitle file header) is always
    discarded, parenthesized annotations are removed, timestamp lines are
    dropped and the remaining lines are joined into one string.

    Args:
        text (str): subtitle of ted talk
    Returns:
        cleaned_text (str): cleaned version of subtitle text
    """
    # remove strings inside parentheses (e.g. "(Applause)"); the match is
    # non-greedy so that spoken words between two annotations on the same
    # line are kept
    text = sub(r"\(.*?\)", "", text)
    # split into lines and skip the first one (header line of the file)
    lines = text.split("\n")[1:]
    # drop timestamp lines, they contain the pattern "-->"
    spoken_lines = (line for line in lines if "-->" not in line)
    # split() without arguments collapses every run of whitespace, which
    # removes blank/whitespace-only lines and the gaps left by removed
    # annotations in a single pass
    cleaned_text = " ".join(" ".join(spoken_lines).split())
    return cleaned_text
def ted_talk_transcriber(link: str, timeout: float = 30.0) -> str:
    """Creates transcription of ted talks from url.

    Downloads the talk page, reads the talk id that follows the
    "project_masters/" marker in the page and uses it to download the
    English subtitle file.

    Args:
        link (str): url link of ted talks
        timeout (float): seconds to wait for each HTTP request
    Returns:
        raw_text (str): raw transcription of the ted talk
    Raises:
        requests.HTTPError: if the talk page or the subtitle file answers
            with an error status
        ValueError: if no talk id can be found in the talk page
    """
    # request link of the talk; the timeout prevents hanging forever on an
    # unresponsive server
    page = get(link, timeout=timeout)
    page.raise_for_status()
    # extract unique talk id to reach subtitle file
    _, marker, remainder = page.text.partition("project_masters/")
    talk_id = remainder.split("/")[0]
    if not marker or not talk_id:
        raise ValueError(f"Could not find a talk id in the page at {link!r}")
    subtitles = get(
        f"https://hls.ted.com/project_masters/{talk_id}/subtitles/en/full.vtt",
        timeout=timeout,
    )
    # fail loudly instead of returning an error page as if it were subtitles
    subtitles.raise_for_status()
    # WebVTT files are UTF-8; do not rely on the charset guessed from headers
    subtitles.encoding = "utf-8"
    raw_text = subtitles.text
    return raw_text
def text_summarizer(text: str) -> str:
    """Produces a summary of the given text with the shared pipeline.

    Args:
        text (str): ted talks transcription
    Returns:
        str: summary
    """
    # generation settings forwarded unchanged to the summarization pipeline
    generation_options = {
        "min_length": 8,
        "max_length": 256,
        "no_repeat_ngram_size": 3,
        "encoder_no_repeat_ngram_size": 3,
        "repetition_penalty": 3.5,
        "num_beams": 4,
        "do_sample": False,
        "early_stopping": True,
    }
    summaries = summarizer(text, **generation_options)
    # one input text was given, so the summary is read from the first entry
    first_summary = summaries[0]
    return first_summary["summary_text"]
def main(link: str) -> str:
    """Runs the whole pipeline: download subtitles, clean them, summarize.

    Args:
        link (str): url link of ted talks
    Returns:
        str: summary
    """
    # transcript -> cleaned transcript -> summary
    return text_summarizer(clean_text(ted_talk_transcriber(link)))
# HTML snippet passed to the interface as its description
logo = "<center><img src='file/TED.png' width=180px></center>"
# build the interface around main() and start it right away
Interface(
    fn=main,
    inputs=Textbox(label="Type the TED Talks link"),
    outputs=Textbox(label="Summary"),
    examples=[
        "https://www.ted.com/talks/jen_gunter_the_truth_about_yeast_in_your_body"
    ],
    description=logo,
    allow_flagging="never",
).launch()