diff --git "a/notebooks/youtube_transcripts.ipynb" "b/notebooks/youtube_transcripts.ipynb" new file mode 100644--- /dev/null +++ "b/notebooks/youtube_transcripts.ipynb" @@ -0,0 +1,153533 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "import whisper\n", + "from pytube import YouTube\n", + "import pickle\n", + "import pandas as pd\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████| 1.42G/1.42G [03:21<00:00, 7.57MiB/s]\n" + ] + } + ], + "source": [ + "model = whisper.load_model('medium')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "videos = ['https://www.youtube.com/watch?v=8mQZzlQXK1Q', \n", + "'https://www.youtube.com/watch?v=dJ1eDL15_Lw', \n", + "'https://www.youtube.com/watch?v=ikYCr-0GAfw', \n", + "'https://www.youtube.com/watch?v=2QO8tgSA6oQ',\n", + "'https://www.youtube.com/watch?v=-OM8RYUl_rg',\n", + "'https://www.youtube.com/watch?v=TGetyy-LKcY',\n", + "'https://www.youtube.com/watch?v=sDrakgSYvzc',\n", + "'https://www.youtube.com/watch?v=LmB3ZQ2F1MY',\n", + "'https://www.youtube.com/watch?v=CdyJ0iB_k00',\n", + "'https://www.youtube.com/watch?v=gGqbEH69ZaI',\n", + "'https://www.youtube.com/watch?v=2byXYrlDkZs',\n", + "'https://www.youtube.com/watch?v=QRQMPCs7m0E',\n", + "'https://www.youtube.com/watch?v=D1QKYLcvoU8',\n", + "'https://www.youtube.com/watch?v=5eyE20HpaCo',\n", + "'https://www.youtube.com/watch?v=XeeFp63L05k',\n", + "'https://www.youtube.com/watch?v=wKPPf9YNv5c',\n", + "'https://www.youtube.com/watch?v=xY2ftYAnUso',\n", + "'https://www.youtube.com/watch?v=gOt--6HPrIo',\n", + "'https://www.youtube.com/watch?v=L4zFpKpdub8',\n", + "'https://www.youtube.com/watch?v=4YhpWZCdiZc',\n", + "'https://www.youtube.com/watch?v=l2SNesXZoGM',\n", + "'https://www.youtube.com/watch?v=8-2WQF3SWwo',\n", + "'https://www.youtube.com/watch?v=bM0BeeA8RdY',\n", + "'https://www.youtube.com/watch?v=Zl7MbbgE4aU',\n", + "'https://www.youtube.com/watch?v=jQgkVKGqBCE',\n", + "'https://www.youtube.com/watch?v=8aDFvvjC6XM',\n", + "'https://www.youtube.com/watch?v=W3hMmZQAdhw']\n", + "\n", + "#select the v= part of the url\n", + "vids = [x.split('=')[1] for x in videos]" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/2 [00:00