Commit e25787c by Instantaneous1: "first commit" (initial commit, no parents)
Files changed:
- .gitignore +3 -0
- README.md +1 -0
- app.py +101 -0
- bert_solution.py +101 -0
- requirements.txt +6 -0
- spacy_solution.py +102 -0
- word2vec_solution.ipynb +475 -0
- word2vec_solution.py +170 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
env/
.ipynb_checkpoints/
*.csv
README.md ADDED
@@ -0,0 +1 @@
# BERT-Video-Search
app.py ADDED
@@ -0,0 +1,101 @@
import random
from urllib.parse import urlparse
import pandas as pd
from streamlit_player import st_player
from youtube_transcript_api import YouTubeTranscriptApi
import streamlit as st
from sentence_transformers import SentenceTransformer
import faiss, numpy as np

MODEL = None


@st.cache_data
def parse_subtitles(url):
    url_data = urlparse(url)
    print("Id:", url_data.query[2::])
    subtitles = YouTubeTranscriptApi.get_transcript(url_data.query[2::])
    return pd.DataFrame(subtitles)


def init():
    global MODEL
    MODEL = SentenceTransformer("msmarco-distilbert-base-dot-prod-v3")


def store_embeddings(subtitle_df):
    encoded_data = MODEL.encode(subtitle_df.text.tolist())
    encoded_data = np.asarray(encoded_data.astype("float32"))
    index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
    index.add_with_ids(encoded_data, np.array(range(0, len(subtitle_df))))
    return index


def search(subtitle_df, query, top_k, index):
    query_vector = MODEL.encode([query])
    top_k = index.search(query_vector, top_k)
    top_k_ids = top_k[1].tolist()[0]
    top_k_ids = list(np.unique(top_k_ids))
    return subtitle_df.iloc[top_k_ids]


@st.cache_data
def get_relevant_line(subtitle_df, searchphrase):
    index = store_embeddings(subtitle_df)
    return search(subtitle_df, searchphrase, 6, index)


if __name__ == "__main__":
    init()
    vid_url = st.text_input("Youtube video")
    if vid_url:
        vid_placeholder = st.empty()

        with vid_placeholder.container():
            st_player(vid_url, playing=True)
        searchphrase = st.text_input(
            "Search keywords relevant to section you are searching for in this video"
        )
        analysis_placeholder = st.empty()
        analysis_placeholder.empty()
        subtitle_df = parse_subtitles(vid_url)
        subtitle_df.to_csv("subtitles.csv")

        if searchphrase:
            print("\n\n\n Searching", searchphrase)
            search_results = get_relevant_line(subtitle_df, searchphrase)
            # print(df)
            with analysis_placeholder.container():
                if len(search_results):
                    st.text("Relevant sections below: ")
                    for cap, start in zip(
                        search_results["text"].to_list(),
                        search_results["start"].to_list(),
                    ):
                        col1, col2 = st.columns([1, 4])
                        col1.button(
                            "Jump to time: " + str(start),
                            key=" ".join(
                                [
                                    "Jump",
                                    vid_url,
                                    str(start),
                                    str(random.randint(0, 9999999)),
                                    cap,
                                ]
                            ),
                        )

                        col2.markdown(cap)
                else:
                    st.text("No relevant section found, try something else ...")

        for k, v in st.session_state.items():
            if k.startswith("Jump") and v is True:
                print(k.split(maxsplit=3))
                _, new_url, start, _ = k.split(maxsplit=3)
                vid_placeholder.empty()
                with vid_placeholder.container():
                    st_player(
                        vid_url + "&t={}s".format(round(float(start))), playing=True
                    )
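app.py embeds every subtitle line with a dot-product-trained MS MARCO SentenceTransformer and retrieves matches from an exact inner-product FAISS index. A minimal standalone sketch of that embed-index-search core (the model name comes from the file; the captions and query are invented for illustration):

    # Sketch of the retrieval core used in app.py; captions/query are invented.
    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("msmarco-distilbert-base-dot-prod-v3")
    captions = ["we land on the moon", "budget talks resume", "rocket lifts off"]

    vecs = np.asarray(model.encode(captions), dtype="float32")  # shape (n, 768)
    index = faiss.IndexIDMap(faiss.IndexFlatIP(vecs.shape[1]))  # exact dot-product index
    index.add_with_ids(vecs, np.arange(len(captions)))          # row i <-> caption i

    query = np.asarray(model.encode(["moon landing"]), dtype="float32")
    scores, ids = index.search(query, 2)                        # top-2 by inner product
    print(ids[0], scores[0])

One caveat worth noting in search(): np.unique not only deduplicates the returned ids but also sorts them, so results come back in subtitle order rather than by relevance score.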
bert_solution.py ADDED
@@ -0,0 +1,101 @@
(The 101 lines of this file are byte-for-byte identical to app.py above: same imports, MODEL global, parse_subtitles(), init(), store_embeddings(), search(), get_relevant_line(), and the same __main__ Streamlit block.)
requirements.txt ADDED
@@ -0,0 +1,6 @@
sentence_transformers
youtube_transcript_api
streamlit_player
streamlit
pandas
faiss-cpu
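With the dependencies installed (pip install -r requirements.txt), the app would presumably be launched with streamlit run app.py; the commit itself does not state a run command, so treat these commands as an assumption.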
spacy_solution.py ADDED
@@ -0,0 +1,102 @@
from spacy.matcher import PhraseMatcher
from scipy import spatial

import spacy


# method for reading a pdf file
def readPdfFile():
    return open("text.txt").read()


# custom sentence segmenter for creating spacy document object
def setCustomBoundaries(doc):
    # traversing through tokens in document object
    for token in doc[:-1]:
        if token.text == ";":
            doc[token.i + 1].is_sent_start = True
        if token.text == ".":
            doc[token.i + 1].is_sent_start = False
    return doc


# create spacy document object from pdf text
def getSpacyDocument(pdf_text, nlp):
    main_doc = nlp(pdf_text)  # create spacy document object

    return main_doc


# method for searching keyword from the text
def search_for_keyword(keyword, doc_obj, nlp):
    phrase_matcher = PhraseMatcher(nlp.vocab)
    phrase_list = [nlp(keyword)]
    phrase_matcher.add("Text Extractor", None, *phrase_list)

    matched_items = phrase_matcher(doc_obj)

    matched_text = []
    for match_id, start, end in matched_items:
        text = nlp.vocab.strings[match_id]
        span = doc_obj[start:end]
        matched_text.append(span.sent.text)
    return matched_text


# convert keywords to vector
def createKeywordsVectors(keyword, nlp):
    doc = nlp(keyword)  # convert to document object

    return doc.vector


# method to find cosine similarity
def cosineSimilarity(vect1, vect2):
    # return cosine distance
    return 1 - spatial.distance.cosine(vect1, vect2)


# method to find similar words
def getSimilarWords(keyword, nlp):
    similarity_list = []

    keyword_vector = createKeywordsVectors(keyword, nlp)

    for tokens in nlp.vocab:
        if tokens.has_vector:
            if tokens.is_lower:
                if tokens.is_alpha:
                    similarity_list.append(
                        (tokens, cosineSimilarity(keyword_vector, tokens.vector))
                    )

    similarity_list = sorted(similarity_list, key=lambda item: -item[1])
    similarity_list = similarity_list[:30]

    top_similar_words = [item[0].text for item in similarity_list]

    top_similar_words = top_similar_words[:3]
    top_similar_words.append(keyword)

    for token in nlp(keyword):
        top_similar_words.insert(0, token.lemma_)

    for words in top_similar_words:
        if words.endswith("s"):
            top_similar_words.append(words[0 : len(words) - 1])

    top_similar_words = list(set(top_similar_words))

    top_similar_words = [words for words in top_similar_words]

    return ", ".join(top_similar_words)


if __name__ == "__main__":
    # spacy english model (large)
    nlp = spacy.load("en_core_web_lg")
    # nlp.add_pipe(setCustomBoundaries, before="parser")
    keywords = "how"
    main_doc = nlp(readPdfFile())
    # similar_keywords = getSimilarWords(keywords, nlp)
    print(search_for_keyword(keywords, main_doc, nlp))
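spacy_solution.py pairs exact phrase matching (PhraseMatcher) with vector-based query expansion. Note that phrase_matcher.add("Text Extractor", None, *phrase_list) uses the spaCy 2 calling convention; under spaCy 3 the equivalent looks like the following minimal sketch (the sample sentence is invented):

    # PhraseMatcher under the spaCy 3 API; the sample text is invented.
    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.load("en_core_web_lg")
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")       # case-insensitive matching
    matcher.add("Text Extractor", [nlp("refugee camp")])   # spaCy 3: patterns as a list

    doc = nlp("The refugee camp was rebuilt. Supplies reached the camp later.")
    for match_id, start, end in matcher(doc):
        print(doc[start:end].sent.text)                    # sentence containing each hit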
word2vec_solution.ipynb ADDED
@@ -0,0 +1,475 @@
The notebook is committed as 475 lines of JSON; its cells are listed below. Notebook metadata: Colab (GPU: T4), kernel "search-in-vid", Python 3.10.12.

[markdown cell] Colab badge linking to https://githubtocolab.com/sayan1999/YouTube-Video-Summarizer/blob/main/summary.ipynb

[markdown cell] ## Imports

[code cell, execution_count 14]
import os
from yt_dlp import YoutubeDL
import glob
import webvtt
import pandas as pd

from scipy import spatial
from gensim.models import word2vec

from collections import namedtuple
import nltk
import pandas as pd
import gensim
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[code cell, execution_count 15]
def dl_transcript(url):
    with YoutubeDL(
        {
            "skip_download": True,
            "writeautomaticsub": True,
            "subtitleslangs": ["en", "en-us", "en-us", "en-uk", "en-in", "en-es", "en-fr"],
        }
    ) as ydl:
        if ydl.download(url):
            print("-----------------------------------------------------------------------------------------------------------LINK FAILED")
            return None, None
        else:
            # print(ydl.extract_info(url))
            op = [
                f
                for f in os.listdir()
                if f.startswith(
                    os.path.splitext(ydl.prepare_filename(ydl.extract_info(url)))[0]
                )
                and f.endswith(".vtt")
            ]
            if op:
                return ydl.extract_info(url)["title"], op[0]
            else:
                print("-----------------------------------------------------------------------------------------------------------file download FAILED")
                return None, None

[code cell, execution_count 89]
def init():
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    model = gensim.models.KeyedVectors.load_word2vec_format(
        "/home/instantinopaul/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz",
        binary=True,
    )
    # model = None
    return tokenizer, model


def docsimilarity(model, keyword, doc):
    cutoff = 0.4
    score = 0
    for w in doc:
        sm = model.similarity(keyword, w) if w in model else 0
        if sm >= cutoff:
            score += sm
    return score


def get_relevant_line(df, searchphrase, model):
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    df = preprocess(df, wordnet_lemmatizer, stop_words)

    keywords = [
        wordnet_lemmatizer.lemmatize(
            wordnet_lemmatizer.lemmatize(wordnet_lemmatizer.lemmatize(kw.lower()), pos="v"),
            pos=("a"),
        )
        for kw in tokenizer.tokenize(searchphrase)
    ]

    df["similarity"] = sum(
        [
            df["docs"].apply(lambda doc: docsimilarity(model, keyword.lower(), doc))
            for keyword in keywords
            if keyword in model
        ]
    )
    df["docs"] = df["docs"].apply(" ".join)
    df = df.sort_values("similarity", ascending=False)
    df.to_csv("result.csv", index=False)
    return df[df["similarity"] > 1][["start", "end"]].values.tolist()

[code cell, execution_count 16]
import re


def parse_subtitles(url):
    title, vttfile = dl_transcript(url)
    if vttfile is None:
        return "SOME ISSUE WITH VIDEO LINK OR DOWNLOADING VIDEO CONTENTS"
    print("text file expected", vttfile + ".txt")
    os.system(f"cat \"{vttfile}\" | grep : -v | awk '!seen[$0]++' > \"{vttfile}.txt\"")
    tscript = re.sub(r"[\s|\n]", " ", open(f"{vttfile}.txt").read().replace("WEBVTT", "", 1))
    data = [[caption.start, caption.end, caption.text] for caption in webvtt.read(vttfile)]
    df = pd.DataFrame(data, columns=["start", "end", "caps"])
    return title, tscript, df

[code cell, execution_count 85]
def preprocess(df, wordnet_lemmatizer, stop_words):
    orig_docs = [[word for word in tokenizer.tokenize(sent)] for sent in df["caps"]]

    df["docs"] = [
        [
            wordnet_lemmatizer.lemmatize(
                wordnet_lemmatizer.lemmatize(wordnet_lemmatizer.lemmatize(word.lower()), pos="v"),
                pos=("a"),
            )
            for word in sent
            if word not in stop_words
        ]
        for sent in orig_docs
    ]
    print(df["docs"])
    return df

[empty code cell]

[empty code cell]

[code cell, execution_count 18]
title, tscript, df = parse_subtitles('https://www.youtube.com/watch?v=6Ow2zHJQw2M')

Output: 13 near-identical yt-dlp runs for video 6Ow2zHJQw2M ("Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee"), each extracting the URL, deleting the existing .en.vtt file, and re-downloading the English subtitle track (17.29-134.88 KiB); the log ends with:
text file expected Israel Continues To Bomb Palestinians - As the UN Watches Helplessly on Day 26 | Akash Banerjee [6Ow2zHJQw2M].en.vtt.txt

[code cell, execution_count 22]
tokenizer, model = init()

[code cell, execution_count 90]
searchphrase = "children death"
get_relevant_line(df, searchphrase, model)

Output (preview of the lemmatized docs column, then the matched [start, end] spans):
0      [this, jabalia, refugee, camp, israel, drop, 6...
1      [accord, report, explosive, power, bomb, equal...
2      [the, entire, refugee, camp, destroy, more, 40...
3      [and, still, many, child, many, innocent, pale...
4      [israel, even, try, deny, time, they, say, yes...
                             ...
127    [but, know, youtube, might, restrict, video, w...
128    [so, please, ignore, video, comment, share]
129    [and, tell, people, raise, voice, not, ignore,...
130    [it, important, disappoint, genocide, gaza, te...
131    [otherwise, today, gaza, tomorrow, another, pl...
Name: docs, Length: 132, dtype: object

[['00:10:35.477', '00:10:44.114'],
 ['00:04:30.424', '00:04:41.642'],
 ['00:08:08.720', '00:08:18.994'],
 ['00:08:51.922', '00:08:58.454']]

[empty code cell]
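The notebook's ranking boils down to docsimilarity(): sum each caption word's word2vec similarity to the query keyword, counting only similarities above a 0.4 cutoff. A self-contained sketch of that rule, with a toy vector table standing in for the 300-dimensional GoogleNews model so it runs without the download (all vectors invented):

    # Cutoff-sum scoring from the notebook, with invented 2-d vectors
    # standing in for the GoogleNews word2vec model.
    import numpy as np

    toy_vectors = {
        "child": np.array([1.0, 0.1]),
        "kid": np.array([0.9, 0.2]),
        "budget": np.array([0.0, 1.0]),
    }

    def cos(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

    def docsimilarity(keyword, doc, cutoff=0.4):
        # Only words clearing the cutoff contribute, so one strong hit
        # outweighs many weak ones -- mirroring the notebook's scoring.
        score = 0.0
        for w in doc:
            sm = cos(toy_vectors[keyword], toy_vectors[w]) if w in toy_vectors else 0.0
            if sm >= cutoff:
                score += sm
        return score

    print(docsimilarity("child", ["kid", "budget", "unknown"]))  # ~0.99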
word2vec_solution.py ADDED
@@ -0,0 +1,170 @@
import random, re
from urllib.parse import urlparse
from yt_dlp import YoutubeDL
import glob
import webvtt
import pandas as pd
from streamlit_player import st_player
from youtube_transcript_api import YouTubeTranscriptApi


from scipy import spatial
from gensim.models import word2vec

from collections import namedtuple
import nltk
import pandas as pd
import gensim
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import streamlit as st


@st.cache_data
def dl_transcript(url):
    url_data = urlparse(url)
    print("id", url_data.query[2::])
    return YouTubeTranscriptApi.get_transcript(url_data.query[2::])


@st.cache_data
def init():
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    model = gensim.models.KeyedVectors.load_word2vec_format(
        "archive/GoogleNews-vectors-negative300-SLIM.bin",
        binary=True,
    )
    # model = None
    return tokenizer, model


# @st.cache_data
def docsimilarity(model, keyword, doc):
    cutoff = 0.4
    score = 0
    for w in doc:
        sm = model.similarity(keyword, w) if w in model else 0
        if sm >= cutoff:
            score += sm
    return score


@st.cache_data
def get_relevant_line(df, searchphrase):
    tokenizer, model = init()
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    df = preprocess(df, tokenizer, wordnet_lemmatizer, stop_words)

    keywords = [
        wordnet_lemmatizer.lemmatize(
            wordnet_lemmatizer.lemmatize(
                wordnet_lemmatizer.lemmatize(kw.lower()), pos="v"
            ),
            pos=("a"),
        )
        for kw in tokenizer.tokenize(searchphrase)
    ]
    print("lemm keywords: ", keywords)
    df["similarity"] = sum(
        [
            df["docs"].apply(lambda doc: docsimilarity(model, keyword.lower(), doc))
            for keyword in keywords
            if keyword in model
        ]
    )
    df["docs"] = df["docs"].apply(" ".join)
    df = df.sort_values("similarity", ascending=False)
    df.to_csv("result.csv", index=False)
    res_idx = df["similarity"] >= 1
    print(
        "Result length: ",
        sum(res_idx),
    )
    return df[res_idx].reset_index().iloc[:4]


@st.cache_data
def parse_subtitles(url):
    return pd.DataFrame(dl_transcript(url))


# @st.cache_data
def preprocess(df, tokenizer, wordnet_lemmatizer, stop_words):
    orig_docs = [[word for word in tokenizer.tokenize(sent)] for sent in df["text"]]

    df["docs"] = [
        [
            wordnet_lemmatizer.lemmatize(
                wordnet_lemmatizer.lemmatize(
                    wordnet_lemmatizer.lemmatize(word.lower()), pos="v"
                ),
                pos=("a"),
            )
            for word in sent
            if word not in stop_words
        ]
        for sent in orig_docs
    ]
    # print(df["docs"])
    return df


def vidattstamp(vid_url, start, vid_placeholder):
    vid_url = vid_url + "&t=400s"
    print("Skipping to ", start, vid_url)
    vid_placeholder.empty()
    # with placeholder.container():
    #     st_player(vid_url, playing=True, muted=True)


vid_url = st.text_input("Youtube video")
if vid_url:
    # print(st.session_state)
    placeholder = st.empty()
    analysis_placeholder = st.empty()
    with placeholder.container():
        st_player(vid_url, playing=True)
    analysis_placeholder.empty()
    # st.video(vid_url)
    df = parse_subtitles(vid_url)
    df.to_csv("caps.csv")
    searchphrase = st.text_input(
        "Search keywords relevant to section you are searching for in this video"
    )
    if searchphrase:
        print("\n\n\n Searching", searchphrase)
        df = get_relevant_line(df, searchphrase)
        # print(df)
        with analysis_placeholder.container():
            if len(df):
                st.text("Relevant sections below: ")
                # placeholder.empty()
                # st.dataframe(df)
                for cap, start in zip(df["text"].to_list(), df["start"].to_list()):
                    col1, col2 = st.columns([1, 4])
                    col1.button(
                        "Jump to time: " + str(start),
                        key=" ".join(
                            [
                                "Jump",
                                vid_url,
                                str(start),
                                str(random.randint(0, 9999999)),
                                cap,
                            ]
                        ),
                    )

                    col2.markdown(cap)
            else:
                st.text("No relevant section found, try something else ...")

    for k, v in st.session_state.items():
        if k.startswith("Jump") and v is True:
            print(k.split(maxsplit=3))
            _, new_url, start, _ = k.split(maxsplit=3)
            placeholder.empty()
            with placeholder.container():
                st_player(vid_url + "&t={}s".format(round(float(start))), playing=True)
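Both Streamlit scripts smuggle the jump target through each button's key string and read it back from st.session_state on the next rerun. A minimal sketch of that round-trip outside Streamlit (all values invented):

    # The button key packs url/start/nonce/caption into one string;
    # maxsplit=3 keeps the caption (which may contain spaces) in one piece.
    vid_url = "https://www.youtube.com/watch?v=abc123"   # invented URL
    key = " ".join(["Jump", vid_url, "42.5", "7", "some caption text"])

    _, url, start, _rest = key.split(maxsplit=3)
    print(url, "&t={}s".format(round(float(start))))     # -> ...abc123 &t=42s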