Oshchepkov committed
Commit: f5c350f
1 Parent(s): 4a35a01

Upload app.py

Files changed (1):
  1. app.py +38 -25
app.py CHANGED
@@ -1,6 +1,6 @@
 import streamlit as st
 from urllib.parse import urlparse, parse_qs
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 # https://pypi.org/project/youtube-transcript-api/
 from youtube_transcript_api import YouTubeTranscriptApi
 
@@ -23,7 +23,6 @@ def get_video_id(url: str) -> str:
             return query.path.split('/')[2]
         if query.path[:3] == '/v/':
             return query.path.split('/')[2]
-    # fail?
     return None
 
 
@@ -39,30 +38,44 @@ def get_youtube_subtitle(video_id: str) -> str:
     except:
         return None
 
-st.header("Annotation of subtitles from YouTube")
-st.text('Load model...')
-m_name = 'summarize1'
-tokenizer = AutoTokenizer.from_pretrained(m_name)
-model = AutoModelForSeq2SeqLM.from_pretrained(m_name)
-st.text('Model is loaded')
+
+if __name__ == "__main__":
+    st.header("Annotation of subtitles from YouTube")
+    # st.text('Load model...')
+    # m_name = '/content/drive/MyDrive/Colab Notebooks/Netology/diplom_neto/summarize1'
+    m_name = "csebuetnlp/mT5_multilingual_XLSum"
+    # tokenizer = AutoTokenizer.from_pretrained(m_name)
+    # model = AutoModelForSeq2SeqLM.from_pretrained(m_name)
+    # st.text('Model is loaded')
 
-url = st.text_input('Enter the URL of the Youtube video', 'https://www.youtube.com/watch?v=HGSVsK32rKA')
-video_id = get_video_id(url)
+    url = st.text_input('Enter the URL of the Youtube video', 'https://www.youtube.com/watch?v=HGSVsK32rKA')
+    video_id = get_video_id(url)
 
-if video_id is not None:
-    subtitle = get_youtube_subtitle(video_id)
-    if subtitle is not None:
-        st.subheader('Subtitles')
-        st.text(subtitle)
-        st.text('Compute summary...')
+    if video_id is not None:
+        subtitle = get_youtube_subtitle(video_id)
+        if subtitle is not None:
+            st.subheader('Subtitles')
+            st.text(subtitle)
+            st.text('Compute summary...')
 
-        inputs = tokenizer(subtitle[:1024], return_tensors="pt").input_ids
-        outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
-        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        st.subheader('Summary')
-        st.text(summary)
+            # inputs = tokenizer(
+            #     [subtitle],
+            #     max_length=600,
+            #     padding="max_length",
+            #     truncation=True,
+            #     return_tensors="pt",
+            # )["input_ids"]
+
+            # # inputs = tokenizer(subtitle, return_tensors="pt").input_ids
+            # outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
+            # summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            translator = pipeline("summarization", model=m_name,
+                                  tokenizer=m_name, max_length=100, device=0
+                                  )
+
+            st.subheader('Summary')
+            st.text(translator(subtitle))
+        else:
+            st.write('Subtitles are disabled for this video')
     else:
-        st.write('Subtitles are disabled for this video')
-else:
-    st.write('Video clip is not detected')
+        st.write('Video clip is not detected')
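
Note on the new code path: the change swaps the manual tokenizer/model.generate calls for a transformers summarization pipeline. The pipeline returns a list of dicts (e.g. [{'summary_text': '...'}]), so st.text(translator(subtitle)) will display that raw structure rather than only the summary text. A minimal standalone sketch of the same call follows; it assumes the csebuetnlp/mT5_multilingual_XLSum checkpoint from the diff, CPU inference (device=-1 instead of the committed device=0), the 1024-character cut used by the old code, and a placeholder subtitle string standing in for the text fetched by get_youtube_subtitle().

from transformers import pipeline

# Sketch only: model name and max_length are taken from the diff above;
# device=-1 keeps inference on CPU (the committed code uses device=0, a GPU).
summarizer = pipeline(
    "summarization",
    model="csebuetnlp/mT5_multilingual_XLSum",
    tokenizer="csebuetnlp/mT5_multilingual_XLSum",
    device=-1,
)

subtitle = "..."  # placeholder for the transcript returned by get_youtube_subtitle()

# The pipeline returns a list of dicts such as [{'summary_text': '...'}],
# so take the 'summary_text' field before displaying it.
result = summarizer(subtitle[:1024], max_length=100, do_sample=False)
summary = result[0]["summary_text"]
print(summary)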