File size: 3,125 Bytes
e2a3db7
 
f5c350f
a88932d
 
 
 
 
e2a3db7
 
 
 
 
a88932d
e2a3db7
 
 
 
 
 
 
 
 
 
 
 
a88932d
 
 
e2a3db7
a88932d
 
 
e2a3db7
a88932d
e2a3db7
 
a88932d
e2a3db7
a88932d
f5c350f
 
 
 
 
 
 
 
 
a88932d
f5c350f
 
a88932d
f5c350f
 
 
 
 
 
a88932d
f5c350f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e2a3db7
f5c350f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from typing import Optional
from urllib.parse import urlparse, parse_qs

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# https://pypi.org/project/youtube-transcript-api/
from youtube_transcript_api import YouTubeTranscriptApi

def get_video_id(url: str) -> Optional[str]:
    """Extract the YouTube video ID from a video URL.

    Recognised URL shapes:
    - http://youtu.be/SA2iWivDJiE
    - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
    - http://www.youtube.com/embed/SA2iWivDJiE
    - http://www.youtube.com/v/SA2iWivDJiE?version=3&hl=en_US
    - http://www.youtube.com/shorts/SA2iWivDJiE
    - http://www.youtube.com/live/SA2iWivDJiE

    Args:
        url: The URL text, typically as typed by the user.

    Returns:
        The video ID, or None when the URL is malformed, is not a
        recognised YouTube link, or does not carry a (non-empty) ID.
    """
    if not url:
        return None
    try:
        parsed = urlparse(url.strip())
        host = parsed.hostname
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken "[" host).
        return None

    if host == 'youtu.be':
        # Short links carry the ID as the first path segment.
        return parsed.path.strip('/').split('/')[0] or None

    if host in ('www.youtube.com', 'youtube.com',
                'm.youtube.com', 'music.youtube.com'):
        if parsed.path == '/watch':
            # parse_qs omits blank values, so a missing or empty "v"
            # yields no entry instead of raising KeyError.
            ids = parse_qs(parsed.query).get('v')
            return ids[0] if ids else None
        # "/embed/<id>" splits into ['', 'embed', '<id>', ...].
        segments = parsed.path.split('/')
        if len(segments) >= 3 and segments[1] in ('embed', 'v', 'shorts', 'live'):
            return segments[2] or None
    return None


def get_youtube_subtitle(video_id: str, languages=('ru',)) -> Optional[str]:
    """Fetch a video's transcript and flatten it into a single string.

    Captions that consist entirely of a bracketed marker (text starting
    with "[" and ending with "]") are dropped, as are empty captions. The
    remaining captions are joined with single spaces and the first
    character of the result is upper-cased.

    Args:
        video_id: The YouTube video ID to look up.
        languages: Language codes passed to the transcript API, in order
            of preference. Defaults to Russian only.

    Returns:
        The flattened transcript text, or None when the transcript cannot
        be retrieved or contains no usable text.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(
            video_id, languages=list(languages)
        )
    except Exception:
        # Best effort: any retrieval failure (no transcript, disabled
        # subtitles, network error) is reported to the caller as None.
        # Unlike a bare "except:", this lets KeyboardInterrupt/SystemExit
        # propagate.
        return None

    parts = []
    for entry in transcript:
        text = (entry.get('text') or '').strip()
        if not text:
            # An empty caption must not discard the whole transcript.
            continue
        if text.startswith('[') and text.endswith(']'):
            continue
        parts.append(text)

    result = ' '.join(parts)
    if not result:
        return None
    return result[0].upper() + result[1:]

        
def _pick_device() -> int:
    """Return the pipeline device index: GPU 0 if CUDA is usable, else -1 (CPU)."""
    try:
        import torch
    except ImportError:
        return -1
    return 0 if torch.cuda.is_available() else -1


def main() -> None:
    """Render the page: read a YouTube URL, show its subtitles and a summary."""
    st.header("Annotation of subtitles from YouTube")
    m_name = "csebuetnlp/mT5_multilingual_XLSum"

    url = st.text_input('Enter the URL of the Youtube video', 'https://www.youtube.com/watch?v=HGSVsK32rKA')
    video_id = get_video_id(url)
    if video_id is None:
        st.write('Video clip is not detected')
        return

    subtitle = get_youtube_subtitle(video_id)
    if subtitle is None:
        st.write('Subtitles are disabled for this video')
        return

    st.subheader('Subtitles')
    st.text(subtitle)
    st.text('Compute summary...')

    # NOTE(review): the pipeline is rebuilt on every Streamlit rerun;
    # consider caching it (e.g. st.cache_resource) if the installed
    # Streamlit version provides it.
    summarizer = pipeline("summarization", model=m_name,
                          tokenizer=m_name, max_length=100,
                          # Hard-coding device=0 fails on CPU-only hosts.
                          device=_pick_device(),
                          )

    st.subheader('Summary')
    # The pipeline returns a list with one dict per input; show only the
    # summary text rather than the raw list.
    st.text(summarizer(subtitle)[0]['summary_text'])


if __name__ == "__main__":
    main()