"""TED Talks Summarizer App.""" from re import sub from gradio import Interface, Textbox from requests import get from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline repo_id = "pszemraj/led-base-book-summary" model = AutoModelForSeq2SeqLM.from_pretrained( repo_id, low_cpu_mem_usage=True, ) tokenizer = AutoTokenizer.from_pretrained(repo_id) summarizer = pipeline("summarization", model=model, tokenizer=tokenizer) def clean_text(text: str) -> str: """Cleans subtitle text of ted talks. Args: text (str): subtitle of ted talk Returns: cleaned_text (str): cleaned version of subtitle text """ # remove string inside parantheses (i.e appluse) text = sub(r"\(.*\)", "", text) # format text by splitting/removing new lines text = text.split("\n")[1:] # remove empty strings text = list(filter(None, text)) # remove timestamps as they contains pattern of "-->" cleaned_text = " ".join([x.strip() for x in text if "-->" not in x]) return cleaned_text def ted_talk_transcriber(link: str) -> str: """Creates transcription of ted talks from url. Args: link (str): url link of ted talks Returns: raw_text (str): raw transcription of the ted talk """ # request link of the talk page = get(link) # extract unique talk id to reach subtitle file talk_id = str(page.content).split("project_masters/")[1].split("/")[0] raw_text = get( f"https://hls.ted.com/project_masters/{talk_id}/subtitles/en/full.vtt" ).text return raw_text def text_summarizer(text: str) -> str: """Summarizes given text. Args: text (str): ted talks transcription Returns: str: summary """ result = summarizer( text, min_length=8, max_length=256, no_repeat_ngram_size=3, encoder_no_repeat_ngram_size=3, repetition_penalty=3.5, num_beams=4, do_sample=False, early_stopping=True, ) return result[0]["summary_text"] def main(link: str) -> str: """Summarizes ted talks given link. Args: link (str): url link of ted talks Returns: str: summary """ raw_text = ted_talk_transcriber(link) cleaned_transcript = clean_text(raw_text) return text_summarizer(cleaned_transcript) logo = "
" Interface( main, inputs=Textbox(label="Type the TED Talks link"), examples=[ "https://www.ted.com/talks/jen_gunter_the_truth_about_yeast_in_your_body" ], outputs=Textbox(label="Summary"), allow_flagging="never", description=logo, ).launch()