Instantaneous1 committed on
Commit e25787c
0 Parent(s):

first commit

Files changed (8)
  1. .gitignore +3 -0
  2. README.md +1 -0
  3. app.py +101 -0
  4. bert_solution.py +101 -0
  5. requirements.txt +6 -0
  6. spacy_solution.py +102 -0
  7. word2vec_solution.ipynb +475 -0
  8. word2vec_solution.py +170 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ env/
+ .ipynb_checkpoints/
+ *.csv
README.md ADDED
@@ -0,0 +1 @@
+ # BERT-Video-Search
app.py ADDED
@@ -0,0 +1,101 @@
+ import random
+ from urllib.parse import urlparse
+ import pandas as pd
+ from streamlit_player import st_player
+ from youtube_transcript_api import YouTubeTranscriptApi
+ import streamlit as st
+ from sentence_transformers import SentenceTransformer
+ import faiss, numpy as np
+
+ MODEL = None
+
+
+ @st.cache_data
+ def parse_subtitles(url):
+     url_data = urlparse(url)
+     print("Id:", url_data.query[2::])
+     subtitles = YouTubeTranscriptApi.get_transcript(url_data.query[2::])
+     return pd.DataFrame(subtitles)
+
+
+ def init():
+     global MODEL
+     MODEL = SentenceTransformer("msmarco-distilbert-base-dot-prod-v3")
+
+
+ def store_embeddings(subtitle_df):
+     encoded_data = MODEL.encode(subtitle_df.text.tolist())
+     encoded_data = np.asarray(encoded_data.astype("float32"))
+     index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
+     index.add_with_ids(encoded_data, np.array(range(0, len(subtitle_df))))
+     return index
+
+
+ def search(subtitle_df, query, top_k, index):
+     query_vector = MODEL.encode([query])
+     top_k = index.search(query_vector, top_k)
+     top_k_ids = top_k[1].tolist()[0]
+     top_k_ids = list(np.unique(top_k_ids))
+     return subtitle_df.iloc[top_k_ids]
+
+
+ @st.cache_data
+ def get_relevant_line(subtitle_df, searchphrase):
+     index = store_embeddings(subtitle_df)
+     return search(subtitle_df, searchphrase, 6, index)
+
+
+ if __name__ == "__main__":
+     init()
+     vid_url = st.text_input("Youtube video")
+     if vid_url:
+         vid_placeholder = st.empty()
+
+         with vid_placeholder.container():
+             st_player(vid_url, playing=True)
+         searchphrase = st.text_input(
+             "Search keywords relevant to section you are searching for in this video"
+         )
+         analysis_placeholder = st.empty()
+         analysis_placeholder.empty()
+         subtitle_df = parse_subtitles(vid_url)
+         subtitle_df.to_csv("subtitles.csv")
+
+         if searchphrase:
+             print("\n\n\n Searching", searchphrase)
+             search_results = get_relevant_line(subtitle_df, searchphrase)
+             # print(df)
+             with analysis_placeholder.container():
+                 if len(search_results):
+                     st.text("Relevant sections below: ")
+                     for cap, start in zip(
+                         search_results["text"].to_list(),
+                         search_results["start"].to_list(),
+                     ):
+                         col1, col2 = st.columns([1, 4])
+                         col1.button(
+                             "Jump to time: " + str(start),
+                             key=" ".join(
+                                 [
+                                     "Jump",
+                                     vid_url,
+                                     str(start),
+                                     str(random.randint(0, 9999999)),
+                                     cap,
+                                 ]
+                             ),
+                         )
+
+                         col2.markdown(cap)
+                 else:
+                     st.text("No relevant section found, try something else ...")
+
+         for k, v in st.session_state.items():
+             if k.startswith("Jump") and v is True:
+                 print(k.split(maxsplit=3))
+                 _, new_url, start, _ = k.split(maxsplit=3)
+                 vid_placeholder.empty()
+                 with vid_placeholder.container():
+                     st_player(
+                         vid_url + "&t={}s".format(round(float(start))), playing=True
+                     )
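The search core of `app.py` is a dense-retrieval loop: every caption line is embedded with the `msmarco-distilbert-base-dot-prod-v3` Sentence-Transformers checkpoint, the vectors go into a FAISS inner-product index keyed by row position, and the query embedding's top-k neighbours are mapped back to caption rows (and hence start timestamps). A minimal standalone sketch of that flow, with made-up placeholder captions and query:

```python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Same checkpoint app.py loads; it is tuned for dot-product retrieval.
model = SentenceTransformer("msmarco-distilbert-base-dot-prod-v3")

# Placeholder captions standing in for the transcript DataFrame's "text" column.
captions = ["intro and agenda", "installing the dependencies", "training the model"]

# Encode and index by row position, mirroring store_embeddings().
vectors = model.encode(captions).astype("float32")
index = faiss.IndexIDMap(faiss.IndexFlatIP(vectors.shape[1]))
index.add_with_ids(vectors, np.arange(len(captions), dtype=np.int64))

# Encode the query and take the top-k inner-product matches, mirroring search().
query = model.encode(["how do I install it"]).astype("float32")
scores, ids = index.search(query, 2)
print(ids[0], scores[0])  # row indices of the best-matching captions and their scores
```

`get_relevant_line()` then uses those row ids with `DataFrame.iloc` to pull the matching captions and their `start` times for the jump buttons.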
bert_solution.py ADDED
@@ -0,0 +1,101 @@
+ import random
+ from urllib.parse import urlparse
+ import pandas as pd
+ from streamlit_player import st_player
+ from youtube_transcript_api import YouTubeTranscriptApi
+ import streamlit as st
+ from sentence_transformers import SentenceTransformer
+ import faiss, numpy as np
+
+ MODEL = None
+
+
+ def init():
+     global MODEL
+     MODEL = SentenceTransformer("msmarco-distilbert-base-dot-prod-v3")
+
+
+ @st.cache_data
+ def parse_subtitles(url):
+     url_data = urlparse(url)
+     print("Id:", url_data.query[2::])
+     subtitles = YouTubeTranscriptApi.get_transcript(url_data.query[2::])
+     return pd.DataFrame(subtitles)
+
+
+ def store_embeddings(subtitle_df):
+     encoded_data = MODEL.encode(subtitle_df.text.tolist())
+     encoded_data = np.asarray(encoded_data.astype("float32"))
+     index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
+     index.add_with_ids(encoded_data, np.array(range(0, len(subtitle_df))))
+     return index
+
+
+ def search(subtitle_df, query, top_k, index):
+     query_vector = MODEL.encode([query])
+     top_k = index.search(query_vector, top_k)
+     top_k_ids = top_k[1].tolist()[0]
+     top_k_ids = list(np.unique(top_k_ids))
+     return subtitle_df.iloc[top_k_ids]
+
+
+ @st.cache_data
+ def get_relevant_line(subtitle_df, searchphrase):
+     index = store_embeddings(subtitle_df)
+     return search(subtitle_df, searchphrase, 6, index)
+
+
+ if __name__ == "__main__":
+     init()
+     vid_url = st.text_input("Youtube video")
+     if vid_url:
+         vid_placeholder = st.empty()
+
+         with vid_placeholder.container():
+             st_player(vid_url, playing=True)
+         searchphrase = st.text_input(
+             "Search keywords relevant to section you are searching for in this video"
+         )
+         analysis_placeholder = st.empty()
+         analysis_placeholder.empty()
+         subtitle_df = parse_subtitles(vid_url)
+         subtitle_df.to_csv("subtitles.csv")
+
+         if searchphrase:
+             print("\n\n\n Searching", searchphrase)
+             search_results = get_relevant_line(subtitle_df, searchphrase)
+             # print(df)
+             with analysis_placeholder.container():
+                 if len(search_results):
+                     st.text("Relevant sections below: ")
+                     for cap, start in zip(
+                         search_results["text"].to_list(),
+                         search_results["start"].to_list(),
+                     ):
+                         col1, col2 = st.columns([1, 4])
+                         col1.button(
+                             "Jump to time: " + str(start),
+                             key=" ".join(
+                                 [
+                                     "Jump",
+                                     vid_url,
+                                     str(start),
+                                     str(random.randint(0, 9999999)),
+                                     cap,
+                                 ]
+                             ),
+                         )
+
+                         col2.markdown(cap)
+                 else:
+                     st.text("No relevant section found, try something else ...")
+
+         for k, v in st.session_state.items():
+             if k.startswith("Jump") and v is True:
+                 print(k.split(maxsplit=3))
+                 _, new_url, start, _ = k.split(maxsplit=3)
+                 vid_placeholder.empty()
+                 with vid_placeholder.container():
+                     st_player(
+                         vid_url + "&t={}s".format(round(float(start))), playing=True
+                     )
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ sentence_transformers
+ youtube_transcript_api
+ streamlit_player
+ streamlit
+ pandas
+ faiss-cpu
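With these installed (for example via `pip install -r requirements.txt`), the Streamlit front end would presumably be launched with `streamlit run app.py`; `faiss-cpu` is the package that provides the `faiss` module imported by `app.py` and `bert_solution.py`, while the `spacy`, `gensim` and `nltk` dependencies used by the other solution files are not listed here.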
spacy_solution.py ADDED
@@ -0,0 +1,102 @@
+ from spacy.matcher import PhraseMatcher
+ from scipy import spatial
+
+ import spacy
+
+
+ # method for reading a pdf file
+ def readPdfFile():
+     return open("text.txt").read()
+
+
+ # custom sentence segmenter for creating spacy document object
+ def setCustomBoundaries(doc):
+     # traversing through tokens in document object
+     for token in doc[:-1]:
+         if token.text == ";":
+             doc[token.i + 1].is_sent_start = True
+         if token.text == ".":
+             doc[token.i + 1].is_sent_start = False
+     return doc
+
+
+ # create spacy document object from pdf text
+ def getSpacyDocument(pdf_text, nlp):
+     main_doc = nlp(pdf_text)  # create spacy document object
+
+     return main_doc
+
+
+ # method for searching keyword from the text
+ def search_for_keyword(keyword, doc_obj, nlp):
+     phrase_matcher = PhraseMatcher(nlp.vocab)
+     phrase_list = [nlp(keyword)]
+     phrase_matcher.add("Text Extractor", None, *phrase_list)
+
+     matched_items = phrase_matcher(doc_obj)
+
+     matched_text = []
+     for match_id, start, end in matched_items:
+         text = nlp.vocab.strings[match_id]
+         span = doc_obj[start:end]
+         matched_text.append(span.sent.text)
+     return matched_text
+
+
+ # convert keywords to vector
+ def createKeywordsVectors(keyword, nlp):
+     doc = nlp(keyword)  # convert to document object
+
+     return doc.vector
+
+
+ # method to find cosine similarity
+ def cosineSimilarity(vect1, vect2):
+     # return cosine distance
+     return 1 - spatial.distance.cosine(vect1, vect2)
+
+
+ # method to find similar words
+ def getSimilarWords(keyword, nlp):
+     similarity_list = []
+
+     keyword_vector = createKeywordsVectors(keyword, nlp)
+
+     for tokens in nlp.vocab:
+         if tokens.has_vector:
+             if tokens.is_lower:
+                 if tokens.is_alpha:
+                     similarity_list.append(
+                         (tokens, cosineSimilarity(keyword_vector, tokens.vector))
+                     )
+
+     similarity_list = sorted(similarity_list, key=lambda item: -item[1])
+     similarity_list = similarity_list[:30]
+
+     top_similar_words = [item[0].text for item in similarity_list]
+
+     top_similar_words = top_similar_words[:3]
+     top_similar_words.append(keyword)
+
+     for token in nlp(keyword):
+         top_similar_words.insert(0, token.lemma_)
+
+     for words in top_similar_words:
+         if words.endswith("s"):
+             top_similar_words.append(words[0 : len(words) - 1])
+
+     top_similar_words = list(set(top_similar_words))
+
+     top_similar_words = [words for words in top_similar_words]
+
+     return ", ".join(top_similar_words)
+
+
+ if __name__ == "__main__":
+     # spacy english model (large)
+     nlp = spacy.load("en_core_web_lg")
+     # nlp.add_pipe(setCustomBoundaries, before="parser")
+     keywords = "how"
+     main_doc = nlp(readPdfFile())
+     # similar_keywords = getSimilarWords(keywords, nlp)
+     print(search_for_keyword(keywords, main_doc, nlp))
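`spacy_solution.py` combines literal phrase matching (`search_for_keyword`, built on `PhraseMatcher`) with vector-based query expansion (`getSimilarWords`). The `__main__` block only exercises the phrase matcher; the snippet below is one plausible way to use the two together, assuming `en_core_web_lg` is installed, a local `text.txt` exists (as `readPdfFile` expects), and a spaCy version that accepts the older `PhraseMatcher.add(name, None, *phrases)` call used above:

```python
from spacy_solution import getSimilarWords, search_for_keyword
import spacy

nlp = spacy.load("en_core_web_lg")  # the large model ships the word vectors
doc = nlp(open("text.txt").read())  # same input readPdfFile() reads

query = "install"
# Expand the query into lemmas and nearest-vector words, then match each literally.
for term in getSimilarWords(query, nlp).split(", "):
    for sentence in search_for_keyword(term, doc, nlp):
        print(term, "->", sentence)
```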
word2vec_solution.ipynb ADDED
@@ -0,0 +1,475 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "8876bf48-386e-476f-bdae-c374120c6482",
+ "metadata": {},
+ "source": [
+ "<center><a target=\"_blank\" href=\"https://githubtocolab.com/sayan1999/YouTube-Video-Summarizer/blob/main/summary.ipynb\">\n",
+ " <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
+ "</a></center>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a697eb0d-013f-414c-8251-129f366be8c5",
+ "metadata": {},
+ "source": [
+ "## Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "E7yeksy_qR0l",
+ "metadata": {
+ "id": "E7yeksy_qR0l"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from yt_dlp import YoutubeDL\n",
+ "import glob\n",
+ "import webvtt\n",
+ "import pandas as pd\n",
+ "\n",
+ "from scipy import spatial\n",
+ "from gensim.models import word2vec\n",
+ "\n",
+ "from collections import namedtuple\n",
+ "import nltk\n",
+ "import pandas as pd\n",
+ "import gensim\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import WordNetLemmatizer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "2c3bcd80-765c-4d4e-9737-4eb08fc8ade0",
+ "metadata": {
+ "id": "2c3bcd80-765c-4d4e-9737-4eb08fc8ade0"
+ },
+ "outputs": [],
+ "source": [
+ "def dl_transcript(url):\n",
+ " with YoutubeDL({\"skip_download\":True, \"writeautomaticsub\":True, \"subtitleslangs\":[\"en\", \"en-us\", \"en-us\", \"en-uk\", \"en-in\", \"en-es\", \"en-fr\"]}) as ydl:\n",
+ " if ydl.download(url):\n",
+ " print(\"-----------------------------------------------------------------------------------------------------------LINK FAILED\")\n",
+ " return None, None\n",
+ " else:\n",
+ " # print(ydl.extract_info(url))\n",
+ " op = [f for f in os.listdir() if f.startswith(os.path.splitext(ydl.prepare_filename(ydl.extract_info(url)))[0]) and f.endswith('.vtt')]\n",
+ " if op:\n",
+ " return ydl.extract_info(url)['title'], op[0]\n",
+ " else:\n",
+ " print(\"-----------------------------------------------------------------------------------------------------------file download FAILED\")\n",
+ " return None, None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "id": "69123e12-8aac-4642-9bb0-4a7355edfa1a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def init():\n",
+ " tokenizer = nltk.RegexpTokenizer(r\"\w+\")\n",
+ " model = gensim.models.KeyedVectors.load_word2vec_format(\n",
+ " \"/home/instantinopaul/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz\",\n",
+ " binary=True,\n",
+ " )\n",
+ " # model = None\n",
+ " return tokenizer, model\n",
+ "\n",
+ "\n",
+ "def docsimilarity(model, keyword, doc):\n",
+ " cutoff = 0.4\n",
+ " score = 0\n",
+ " for w in doc:\n",
+ " sm = model.similarity(keyword, w) if w in model else 0\n",
+ " if sm >= cutoff:\n",
+ " score += sm\n",
+ " return score\n",
+ "\n",
+ "\n",
+ "def get_relevant_line(df, searchphrase, model):\n",
+ " wordnet_lemmatizer = WordNetLemmatizer()\n",
+ " stop_words = set(stopwords.words(\"english\"))\n",
+ " df = preprocess(df, wordnet_lemmatizer, stop_words)\n",
+ " \n",
+ " keywords = [ wordnet_lemmatizer.lemmatize(\n",
+ " wordnet_lemmatizer.lemmatize(wordnet_lemmatizer.lemmatize(kw.lower()), pos=\"v\"),\n",
+ " pos=(\"a\"),\n",
+ " ) for kw in tokenizer.tokenize(searchphrase)]\n",
+ " \n",
+ " df[\"similarity\"] = sum(\n",
+ " [\n",
+ " df[\"docs\"].apply(lambda doc: docsimilarity(model, keyword.lower(), doc))\n",
+ " for keyword in keywords\n",
+ " if keyword in model\n",
+ " ]\n",
+ " )\n",
+ " df[\"docs\"] = df[\"docs\"].apply(\" \".join)\n",
+ " df = df.sort_values(\"similarity\", ascending=False)\n",
+ " df.to_csv('result.csv', index=False)\n",
+ " return df[df[\"similarity\"]>1][['start', 'end']].values.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "jrXUUFDJr1_p",
+ "metadata": {
+ "id": "jrXUUFDJr1_p"
+ },
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "def parse_subtitles(url):\n",
+ " title, vttfile=dl_transcript(url)\n",
+ " if vttfile is None:\n",
+ " return \"SOME ISSUE WITH VIDEO LINK OR DOWNLOAINDING VIDEO CONTENTS\"\n",
+ " print(\"text file expected\", vttfile+'.txt')\n",
+ " os.system(f\"cat \\\"{vttfile}\\\" | grep : -v | awk '!seen[$0]++' > \\\"{vttfile}.txt\\\"\")\n",
+ " tscript = re.sub(r'[\\s|\\n]',' ',open(f'{vttfile}.txt').read().replace('WEBVTT', '', 1))\n",
+ " data = [[caption.start, caption.end, caption.text] for caption in webvtt.read(vttfile)]\n",
+ " df = pd.DataFrame(data, columns=['start', 'end', 'caps'])\n",
+ " return title, tscript, df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "5603d2f7-78d1-4b3a-9dc4-45e5109e876e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocess(df, wordnet_lemmatizer, stop_words):\n",
+ " orig_docs = [\n",
+ " [\n",
+ " word for word in tokenizer.tokenize(sent)\n",
+ " ]\n",
+ " for sent in df['caps']\n",
+ " ]\n",
+ " \n",
+ " df['docs'] = [\n",
+ " [\n",
+ " wordnet_lemmatizer.lemmatize(\n",
+ " wordnet_lemmatizer.lemmatize(wordnet_lemmatizer.lemmatize(word.lower()), pos=\"v\"),\n",
+ " pos=(\"a\"),\n",
+ " )\n",
+ " for word in sent\n",
+ " if word not in stop_words\n",
+ " ]\n",
+ " for sent in orig_docs\n",
+ " ]\n",
+ " print(df['docs'])\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a61e9abc-039a-49ec-99dd-d071dcf3da04",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d132c727-cba3-49ff-8e70-a7e07898061f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "830b2c51-9cb6-413a-a63e-305bf299f4a1",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "WnhasqdYjsug",
+ "outputId": "bf5c7abd-2ef3-4a3b-9212-6a9c349647ac",
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:01 at 9.16KiB/s \n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 134.88KiB in 00:00:01 at 107.24KiB/s\n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:02 at 7.84KiB/s \n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:01 at 9.09KiB/s \n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:01 at 9.34KiB/s \n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:02 at 7.25KiB/s \n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:02 at 6.87KiB/s \n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:01 at 9.08KiB/s \n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:01 at 8.72KiB/s \n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 134.88KiB in 00:00:01 at 110.64KiB/s\n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:01 at 9.11KiB/s \n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:01 at 9.28KiB/s \n",
+ "[youtube] Extracting URL: https://www.youtube.com/watch?v=6Ow2zHJQw2M\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading webpage\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading ios player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading android player API JSON\n",
+ "[youtube] 6Ow2zHJQw2M: Downloading m3u8 information\n",
+ "[info] 6Ow2zHJQw2M: Downloading subtitles: en\n",
+ "[info] 6Ow2zHJQw2M: Downloading 1 format(s): 616+251\n",
+ "Deleting existing file Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[info] Writing video subtitles to: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[hlsnative] Downloading m3u8 manifest\n",
+ "[hlsnative] Total fragments: 3\n",
+ "[download] Destination: Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt\n",
+ "[download] 100% of 17.29KiB in 00:00:03 at 5.47KiB/s \n",
+ "text file expected Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt.txt\n"
+ ]
+ }
+ ],
+ "source": [
+ "title, tscript, df = parse_subtitles('https://www.youtube.com/watch?v=6Ow2zHJQw2M')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "4cbd40ea-5c1c-4d06-9b79-1e3e44b8749c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer, model = init()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "id": "996e559a-3206-475e-9e67-a6a9ae0c6b27",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "WnhasqdYjsug",
+ "outputId": "bf5c7abd-2ef3-4a3b-9212-6a9c349647ac"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 [this, jabalia, refugee, camp, israel, drop, 6...\n",
+ "1 [accord, report, explosive, power, bomb, equal...\n",
+ "2 [the, entire, refugee, camp, destroy, more, 40...\n",
+ "3 [and, still, many, child, many, innocent, pale...\n",
+ "4 [israel, even, try, deny, time, they, say, yes...\n",
+ " ... \n",
+ "127 [but, know, youtube, might, restrict, video, w...\n",
+ "128 [so, please, ignore, video, comment, share]\n",
+ "129 [and, tell, people, raise, voice, not, ignore,...\n",
+ "130 [it, important, disappoint, genocide, gaza, te...\n",
+ "131 [otherwise, today, gaza, tomorrow, another, pl...\n",
+ "Name: docs, Length: 132, dtype: object\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[['00:10:35.477', '00:10:44.114'],\n",
+ " ['00:04:30.424', '00:04:41.642'],\n",
+ " ['00:08:08.720', '00:08:18.994'],\n",
+ " ['00:08:51.922', '00:08:58.454']]"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "searchphrase = \"children death\"\n",
+ "get_relevant_line(df, searchphrase, model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9442ff82-3777-4d58-8f05-69f7afbeeb60",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "search-in-vid",
+ "language": "python",
+ "name": "search-in-vid"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
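Both the notebook and `word2vec_solution.py` below normalise tokens by lemmatizing three times in a row, treating each word as a noun, then a verb, then an adjective, so that inflected forms collapse onto a single key before word2vec similarity scoring. A quick illustration of that chain with NLTK (the word list is arbitrary; the `wordnet` and `omw-1.4` corpora must be downloaded once):

```python
import nltk
from nltk.stem import WordNetLemmatizer

# One-time corpus downloads used by the lemmatizer.
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

lem = WordNetLemmatizer()

def normalize(word):
    # Noun pass (default), then verb, then adjective, mirroring the notebook's chain.
    return lem.lemmatize(lem.lemmatize(lem.lemmatize(word.lower()), pos="v"), pos="a")

print([normalize(w) for w in ["Bombed", "children", "watches", "refugees"]])
# expected: ['bomb', 'child', 'watch', 'refugee']
```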
word2vec_solution.py ADDED
@@ -0,0 +1,170 @@
+ import random, re
+ from urllib.parse import urlparse
+ from yt_dlp import YoutubeDL
+ import glob
+ import webvtt
+ import pandas as pd
+ from streamlit_player import st_player
+ from youtube_transcript_api import YouTubeTranscriptApi
+
+
+ from scipy import spatial
+ from gensim.models import word2vec
+
+ from collections import namedtuple
+ import nltk
+ import pandas as pd
+ import gensim
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+
+ import streamlit as st
+
+
+ @st.cache_data
+ def dl_transcript(url):
+     url_data = urlparse(url)
+     print("id", url_data.query[2::])
+     return YouTubeTranscriptApi.get_transcript(url_data.query[2::])
+
+
+ @st.cache_data
+ def init():
+     tokenizer = nltk.RegexpTokenizer(r"\w+")
+     model = gensim.models.KeyedVectors.load_word2vec_format(
+         "archive/GoogleNews-vectors-negative300-SLIM.bin",
+         binary=True,
+     )
+     # model = None
+     return tokenizer, model
+
+
+ # @st.cache_data
+ def docsimilarity(model, keyword, doc):
+     cutoff = 0.4
+     score = 0
+     for w in doc:
+         sm = model.similarity(keyword, w) if w in model else 0
+         if sm >= cutoff:
+             score += sm
+     return score
+
+
+ @st.cache_data
+ def get_relevant_line(df, searchphrase):
+     tokenizer, model = init()
+     wordnet_lemmatizer = WordNetLemmatizer()
+     stop_words = set(stopwords.words("english"))
+     df = preprocess(df, tokenizer, wordnet_lemmatizer, stop_words)
+
+     keywords = [
+         wordnet_lemmatizer.lemmatize(
+             wordnet_lemmatizer.lemmatize(
+                 wordnet_lemmatizer.lemmatize(kw.lower()), pos="v"
+             ),
+             pos=("a"),
+         )
+         for kw in tokenizer.tokenize(searchphrase)
+     ]
+     print("lemm keywords: ", keywords)
+     df["similarity"] = sum(
+         [
+             df["docs"].apply(lambda doc: docsimilarity(model, keyword.lower(), doc))
+             for keyword in keywords
+             if keyword in model
+         ]
+     )
+     df["docs"] = df["docs"].apply(" ".join)
+     df = df.sort_values("similarity", ascending=False)
+     df.to_csv("result.csv", index=False)
+     res_idx = df["similarity"] >= 1
+     print(
+         "Result length: ",
+         sum(res_idx),
+     )
+     return df[res_idx].reset_index().iloc[:4]
+
+
+ @st.cache_data
+ def parse_subtitles(url):
+     return pd.DataFrame(dl_transcript(url))
+
+
+ # @st.cache_data
+ def preprocess(df, tokenizer, wordnet_lemmatizer, stop_words):
+     orig_docs = [[word for word in tokenizer.tokenize(sent)] for sent in df["text"]]
+
+     df["docs"] = [
+         [
+             wordnet_lemmatizer.lemmatize(
+                 wordnet_lemmatizer.lemmatize(
+                     wordnet_lemmatizer.lemmatize(word.lower()), pos="v"
+                 ),
+                 pos=("a"),
+             )
+             for word in sent
+             if word not in stop_words
+         ]
+         for sent in orig_docs
+     ]
+     # print(df["docs"])
+     return df
+
+
+ def vidattstamp(vid_url, start, vid_placeholder):
+     vid_url = vid_url + "&t=400s"
+     print("Skipping to ", start, vid_url)
+     vid_placeholder.empty()
+     # with placeholder.container():
+     #     st_player(vid_url, playing=True, muted=True)
+
+
+ vid_url = st.text_input("Youtube video")
+ if vid_url:
+     # print(st.session_state)
+     placeholder = st.empty()
+     analysis_placeholder = st.empty()
+     with placeholder.container():
+         st_player(vid_url, playing=True)
+     analysis_placeholder.empty()
+     # st.video(vid_url)
+     df = parse_subtitles(vid_url)
+     df.to_csv("caps.csv")
+     searchphrase = st.text_input(
+         "Search keywords relevant to section you are searching for in this video"
+     )
+     if searchphrase:
+         print("\n\n\n Searching", searchphrase)
+         df = get_relevant_line(df, searchphrase)
+         # print(df)
+         with analysis_placeholder.container():
+             if len(df):
+                 st.text("Relevant sections below: ")
+                 # placeholder.empty()
+                 # st.dataframe(df)
+                 for cap, start in zip(df["text"].to_list(), df["start"].to_list()):
+                     col1, col2 = st.columns([1, 4])
+                     col1.button(
+                         "Jump to time: " + str(start),
+                         key=" ".join(
+                             [
+                                 "Jump",
+                                 vid_url,
+                                 str(start),
+                                 str(random.randint(0, 9999999)),
+                                 cap,
+                             ]
+                         ),
+                     )
+
+                     col2.markdown(cap)
+             else:
+                 st.text("No relevant section found, try something else ...")
+
+     for k, v in st.session_state.items():
+         if k.startswith("Jump") and v is True:
+             print(k.split(maxsplit=3))
+             _, new_url, start, _ = k.split(maxsplit=3)
+             placeholder.empty()
+             with placeholder.container():
+                 st_player(vid_url + "&t={}s".format(round(float(start))), playing=True)
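For context, the ranking in `word2vec_solution.py` is a bag-of-similarities score: each query keyword adds its word2vec cosine similarity to every caption word that clears a 0.4 cutoff, the per-keyword scores are summed per caption, and captions whose total reaches 1 are kept (top 4 after sorting). A compressed sketch of that scoring, loading the pretrained Google News vectors via `gensim.downloader` instead of the local `archive/GoogleNews-vectors-negative300-SLIM.bin` file the script points at:

```python
import gensim.downloader as api

# Roughly a 1.6 GB download on first use; returns a gensim KeyedVectors object.
kv = api.load("word2vec-google-news-300")

def docsimilarity(model, keyword, doc, cutoff=0.4):
    # Same rule as the script: sum the similarities that clear the cutoff.
    score = 0.0
    for w in doc:
        sim = model.similarity(keyword, w) if w in model else 0.0
        if sim >= cutoff:
            score += sim
    return score

caption = ["child", "casualty", "report", "rise"]  # a lemmatized caption, as preprocess() produces
for kw in ["child", "death"]:
    if kw in kv:
        print(kw, docsimilarity(kv, kw, caption))
```

The script then applies this per keyword across the whole caption DataFrame with `df["docs"].apply(...)`, sums the resulting columns into `df["similarity"]`, and maps the surviving rows back to their `start` times for the jump buttons.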