Spaces:

Oshchepkov
/

youtube_summurize_subtitles

Runtime error

File size: 3,125 Bytes

e2a3db7
 
f5c350f
a88932d
 
 
 
 
e2a3db7
 
 
 
 
a88932d
e2a3db7
 
 
 
 
 
 
 
 
 
 
 
a88932d
 
 
e2a3db7
a88932d
 
 
e2a3db7
a88932d
e2a3db7
 
a88932d
e2a3db7
a88932d
f5c350f
 
 
 
 
 
 
 
 
a88932d
f5c350f
 
a88932d
f5c350f
 
 
 
 
 
a88932d
f5c350f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e2a3db7
f5c350f

from typing import Optional
from urllib.parse import urlparse, parse_qs

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# https://pypi.org/project/youtube-transcript-api/
from youtube_transcript_api import YouTubeTranscriptApi

def get_video_id(url: str) -> Optional[str]:
    """Extract the video ID from a YouTube URL.

    Recognised URL shapes:
    - http://youtu.be/SA2iWivDJiE
    - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
    - http://www.youtube.com/embed/SA2iWivDJiE
    - http://www.youtube.com/v/SA2iWivDJiE?version=3&amp;hl=en_US
    - http://www.youtube.com/shorts/SA2iWivDJiE

    The ``m.youtube.com`` host is accepted as well, and a URL pasted without
    a scheme (``youtube.com/watch?v=...``) is treated as ``https``.

    Args:
        url: The URL as typed by the user; surrounding whitespace is ignored.

    Returns:
        The video ID, or ``None`` when the URL is not a recognised YouTube
        link or carries no (or an empty) ID.
    """
    url = url.strip()
    # Without a scheme urlparse() puts the host into ``path`` and leaves
    # ``hostname`` empty, so a bare "youtube.com/..." would never match.
    if '://' not in url:
        url = 'https://' + url

    parsed = urlparse(url)
    host = parsed.hostname
    path = parsed.path

    if host == 'youtu.be':
        # The ID is the first path segment; an empty path means no ID.
        return path.lstrip('/').split('/')[0] or None

    if host in ('www.youtube.com', 'youtube.com', 'm.youtube.com'):
        if path == '/watch':
            # parse_qs() omits blank values, so a missing or empty ``v``
            # yields None here instead of raising KeyError.
            values = parse_qs(parsed.query).get('v')
            return values[0] if values else None
        if path.startswith(('/embed/', '/v/', '/shorts/')):
            # "/embed/<id>" splits into ['', 'embed', '<id>', ...].
            return path.split('/')[2] or None

    return None


def get_youtube_subtitle(video_id: str) -> Optional[str]:
    """Fetch the Russian transcript of a video as one block of text.

    Fragments that consist solely of a bracketed annotation (text that
    starts with ``[`` and ends with ``]``) and empty fragments are dropped.
    The remaining fragments are joined with single spaces and the first
    character is upper-cased.

    Args:
        video_id: YouTube video ID, as returned by ``get_video_id``.

    Returns:
        The cleaned transcript text, or ``None`` when the transcript could
        not be fetched or contains no usable text.
    """
    import logging

    try:
        # NOTE(review): ``get_transcript`` is the classic static entry point
        # of youtube-transcript-api; confirm it still exists in the version
        # pinned for this app.
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['ru'])
    except Exception:
        # Deliberately best-effort: any failure to obtain a transcript is
        # reported to the caller as "no subtitles". Unlike a bare ``except``
        # this lets KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).warning(
            "Could not fetch transcript for video %r", video_id, exc_info=True
        )
        return None

    fragments = []
    for entry in transcript:
        text = (entry.get('text') or '').strip()
        if not text:
            # An empty fragment used to raise IndexError and discard the
            # whole transcript; skip it instead.
            continue
        if text.startswith('[') and text.endswith(']'):
            continue
        fragments.append(text)

    if not fragments:
        return None

    joined = ' '.join(fragments)
    return joined[0].upper() + joined[1:]

        
if __name__ == "__main__":
    # Streamlit page: ask for a YouTube URL, show the video's Russian
    # subtitles and a machine-generated summary of them.
    st.header("Annotation of subtitles from YouTube")
    m_name = "csebuetnlp/mT5_multilingual_XLSum"

    url = st.text_input('Enter the URL of the Youtube video', 'https://www.youtube.com/watch?v=HGSVsK32rKA')
    video_id = get_video_id(url)

    if video_id is not None:
        subtitle = get_youtube_subtitle(video_id)
        if subtitle is not None:
            st.subheader('Subtitles')
            st.text(subtitle)
            st.text('Compute summary...')

            # Use the first GPU only when one is actually available; a
            # hard-coded ``device=0`` fails on a CPU-only host.
            try:
                import torch
                device = 0 if torch.cuda.is_available() else -1
            except ImportError:
                device = -1

            # NOTE(review): the pipeline is rebuilt every time this branch
            # runs; consider caching it with Streamlit's resource cache once
            # the deployed Streamlit version is confirmed.
            summarizer = pipeline("summarization", model=m_name,
                                  tokenizer=m_name, max_length=100, device=device
                                  )

            # Transcripts can be far longer than the model's input window,
            # so let the tokenizer truncate. The pipeline returns a list
            # with one dict per input; show only the summary string.
            summary = summarizer(subtitle, truncation=True)[0]['summary_text']

            st.subheader('Summary')
            st.text(summary)
        else:
            st.write('Subtitles are disabled for this video')
    else:
        st.write('Video clip is not detected')