whisper-youtube-2-hf_dataset

Running

App Files Files Community

juancopi81 committed on Jan 23, 2023

Commit

7288748

•

0 Parent(s):

Duplicate from Whispering-GPT/whisper-youtube-2-hf_dataset

Browse files

Files changed (47) hide show

.gitattributes +34 -0
.gitignore +132 -0
README.md +14 -0
app.py +186 -0
datapipeline.py +62 -0
dataset/__init__.py +0 -0
dataset/hf_dataset.py +48 -0
errors.py +4 -0
loading/__init__.py +0 -0
loading/loaderiterator.py +46 -0
loading/serialization.py +30 -0
preprocessing/__init__.py +0 -0
preprocessing/youtubevideopreprocessor.py +88 -0
requirements.txt +54 -0
storing/__init__.py +0 -0
storing/createdb.py +27 -0
storing/sqlitebatchvideostorer.py +28 -0
storing/sqlitecontextmanager.py +23 -0
test/__init__.py +0 -0
test/files/1.json +4 -0
test/files/2.json +4 -0
test/files/3.json +4 -0
test/files/4.json +4 -0
test/files/5.json +4 -0
test/files/6.json +4 -0
test/files/7.json +4 -0
test/test_adddescriptiontransform.py +19 -0
test/test_addtitletransform.py +19 -0
test/test_batchtransformer.py +54 -0
test/test_datapipeline.py +108 -0
test/test_hfdataset.py +21 -0
test/test_loaderiterator.py +54 -0
test/test_sqlitebatchvideostorer.py +53 -0
test/test_sqlitecontextmanager.py +13 -0
test/test_utils.py +34 -0
test/test_video.py +40 -0
test/test_whispertransform.py +31 -0
test/test_youtubevideopreprocessor.py +61 -0
threadeddatapipeline.py +24 -0
transforming/__init__.py +0 -0
transforming/adddescriptiontransform.py +32 -0
transforming/addtitletransform.py +31 -0
transforming/batchtransformer.py +26 -0
transforming/transform.py +11 -0
transforming/whispertransform.py +66 -0
utils.py +62 -0
video.py +20 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,132 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# vscode
+.vscode/

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Whisper-youtube-2-hf Dataset
+emoji: 📚
+colorFrom: purple
+colorTo: pink
+sdk: gradio
+sdk_version: 3.10.0
+app_file: app.py
+pinned: false
+license: openrail
+duplicated_from: Whispering-GPT/whisper-youtube-2-hf_dataset
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import math
+import os
+import argparse
+import sqlite3
+import shutil
+import uuid
+from datasets import Dataset, concatenate_datasets
+import gradio as gr
+import torch
+from storing.createdb import create_db
+from preprocessing.youtubevideopreprocessor import YoutubeVideoPreprocessor
+from loading.serialization import JsonSerializer
+from utils import nest_list, is_google_colab
+from datapipeline import create_hardcoded_data_pipeline
+from threadeddatapipeline import ThreadedDataPipeline
+from dataset.hf_dataset import HFDataset
+from huggingface_hub import DatasetCard
+# Number of data pipelines / worker threads started by transcribe().
+NUM_THREADS = 1
+# Result of the Colab detection helper; reused by the UI and launch code below.
+is_colab = is_google_colab()
+# HTML snippet linking to the Colab notebook; empty when already in Colab.
+if is_colab:
+    colab_instruction = ""
+else:
+    colab_instruction = """
+<p>You can skip the queue using Colab:
+<a href="https://colab.research.google.com/drive/1zNRnX1lXjlGtBMW8U8S9t4eY1cA0D6lm?usp=sharing">
+<img data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" src="https://colab.research.google.com/assets/colab-badge.svg"></a></p>"""
+# Human-readable device label shown in the page header.
+if torch.cuda.is_available():
+    device_print = "GPU 🔥"
+else:
+    device_print = "CPU 🥶"
+def numvideos_type(x):
+    """Argparse ``type`` callable validating the ``--num_videos`` value.
+    Args:
+        x: Raw command-line value; converted with ``int()``.
+    Returns:
+        The value as an ``int`` in the inclusive range 1..12.
+    Raises:
+        argparse.ArgumentTypeError: If the value is below 1 or above 12.
+        ValueError: If ``x`` cannot be converted to ``int``.
+    """
+    x = int(x)
+    if x > 12:
+        raise argparse.ArgumentTypeError("Maximum number of videos is 12")
+    if x < 1:
+        # The lower bound is 1; the message used to report 12 by mistake.
+        raise argparse.ArgumentTypeError("Minimum number of videos is 1")
+    return x
+def parse_args():
+    """Build the command-line parser and parse the process arguments.
+    All five options are required.
+    Returns:
+        The namespace returned by ``ArgumentParser.parse_args()``.
+    """
+    # (flag, keyword arguments) pairs, registered in this order.
+    option_specs = [
+        ("--channel_name",
+         dict(type=str,
+              help="Name of the channel from where the videos will be transcribed")),
+        ("--num_videos",
+         dict(type=numvideos_type,
+              help="Number of videos (min. 1 - max. 12) to transcribe from --channel_name")),
+        ("--hf_token",
+         dict(type=str,
+              help="Token of your HF account. You need a HF account to upload the dataset")),
+        ("--hf_dataset_identifier",
+         dict(type=str,
+              help="The ID of the repository to push to in the following format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.")),
+        ("--whisper_model",
+         dict(type=str,
+              help="Select one of the available whispers models",
+              choices=["tiny", "base", "small", "medium", "large"])),
+    ]
+    cli = argparse.ArgumentParser(usage="[arguments] --channel_name --num_videos",
+                                  description="Program to transcribe YouTube videos.")
+    for flag, options in option_specs:
+        cli.add_argument(flag, required=True, **options)
+    return cli.parse_args()
+def transcribe(mode: str,
+               channel_name: str,
+               num_videos: int,
+               hf_token: str,
+               hf_dataset_identifier: str,
+               whisper_model: str) -> str:
+    """Transcribe YouTube videos and push the result to a Hugging Face dataset.
+    The videos are listed by ``YoutubeVideoPreprocessor``, processed by
+    ``NUM_THREADS`` data pipelines that write into a temporary SQLite file,
+    and the rows of that file are pushed to ``hf_dataset_identifier``
+    (appended to its "train" split when the dataset already has content).
+    Args:
+        mode: Passed to ``YoutubeVideoPreprocessor`` ("channel_name" or "playlist").
+        channel_name: Channel name or playlist URL, depending on ``mode``.
+        num_videos: Maximum number of new videos to transcribe.
+        hf_token: Hugging Face token used for both uploads.
+        hf_dataset_identifier: Repository id of the target dataset.
+        whisper_model: Name of the Whisper model handed to the pipelines.
+    Returns:
+        A status message containing the number of rows that were added.
+    """
+    query = "SELECT CHANNEL_NAME, URL, TITLE, DESCRIPTION, TRANSCRIPTION, SEGMENTS FROM VIDEO"
+    # Create a unique name for the database
+    database_name = str(uuid.uuid4()) + ".db"
+    create_db(database_name)
+    connection = None
+    dataset_folder = None
+    try:
+        # Create necessary resources
+        yt_video_processor = YoutubeVideoPreprocessor(mode=mode,
+                                                      serializer=JsonSerializer()) # TODO: Let user select serializer
+        hf_dataset = HFDataset(hf_dataset_identifier)
+        videos_downloaded = hf_dataset.list_of_ids
+        paths, dataset_folder = yt_video_processor.preprocess(channel_name,
+                                                              num_videos,
+                                                              videos_downloaded)
+        nested_listed_length = math.ceil(len(paths) / NUM_THREADS)
+        nested_paths = nest_list(paths, nested_listed_length)
+        data_pipelines = [create_hardcoded_data_pipeline(database_name, whisper_model)
+                          for _ in range(NUM_THREADS)]
+        # Run pipelines in multiple threads
+        threads = [ThreadedDataPipeline(data_pipeline, thread_paths)
+                   for data_pipeline, thread_paths in zip(data_pipelines, nested_paths)]
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join()
+        # Read the stored rows once; the dataset itself gives the row count.
+        connection = sqlite3.connect(database_name)
+        dataset = Dataset.from_sql(query, connection)
+        num_new_videos = dataset.num_rows
+        if hf_dataset.exist and not hf_dataset.is_empty:
+            dataset_to_upload = concatenate_datasets([hf_dataset.dataset["train"], dataset])
+        else:
+            dataset_to_upload = dataset
+        dataset_to_upload.push_to_hub(hf_dataset_identifier, token=hf_token)
+        card = DatasetCard.load(hf_dataset_identifier)
+        card.data.tags = ["whisper", "whispering", whisper_model]
+        card.data.task_categories = ["automatic-speech-recognition"]
+        card.push_to_hub(hf_dataset_identifier, token=hf_token)
+    finally:
+        # Clean up even when a step above fails, so failed requests do not
+        # leave database files or JSON folders behind.
+        if connection is not None:
+            connection.close()
+        if os.path.exists(database_name):
+            os.remove(database_name)
+        if dataset_folder is not None:
+            try:
+                shutil.rmtree(dataset_folder)
+            except OSError as e:
+                print("Error: %s : %s" % (dataset_folder, e.strerror))
+    return f"Dataset created or updated at {hf_dataset_identifier}. {num_new_videos} samples were added"
+# Build the Gradio page: intro text, device banner, input form and output box.
+with gr.Blocks() as demo:
+    md = """# Use Whisper to create a HF dataset from YouTube videos
+    This space will let you create a HF dataset by transcribing videos from YouTube.
+    Enter the name of the YouTube channel or the URL of a YouTube playlist (in the form https://www.youtube.com/playlist?list=****),
+    and the repo_id of the dataset (you need a HuggingFace account).
+    If the dataset already exists, it will only transcribe videos that are not in the dataset.
+    If it does not exists, it will create the dataset. For using this demo, you need a
+    [Hugging Face token](https://huggingface.co/settings/tokens) with write role. Learn more about [tokens](https://huggingface.co/docs/hub/security-tokens).
+    """
+    gr.Markdown(md)
+    # Banner reporting the device label and whether the app runs in Colab.
+    gr.HTML(
+        f"""
+        <p style="margin-bottom: 10px; font-size: 94%">
+          Running on <b>{device_print}</b>{(" in a <b>Google Colab</b>." if is_colab else "")}
+        </p>
+        """
+    )
+    with gr.Row():
+        # Left column: every input that is forwarded to transcribe().
+        with gr.Column():
+            whisper_model = gr.Radio([
+                "tiny", "base", "small", "medium", "large"
+            ], label="Whisper model", value="base")
+            mode = gr.Radio([
+                "channel_name", "playlist"
+            ], label="Get the videos from:", value="channel_name")
+            channel_name = gr.Textbox(label="YouTube Channel or Playlist URL",
+                                      placeholder="Enter the name of the YouTube channel or the URL of the playlist")
+            # NOTE(review): the slider allows up to 20000 videos while the CLI
+            # validator numvideos_type caps --num_videos at 12 — confirm which
+            # limit is intended.
+            num_videos = gr.Slider(1, 20000, value=4, step=1, label="Number of videos")
+            hf_token = gr.Textbox(placeholder="Your HF write access token", type="password")
+            hf_dataset_identifier = gr.Textbox(label = 'Dataset Name',
+                                               placeholder = "Enter in the format <username>/<repo_name>")
+            submit_btn = gr.Button("Submit")
+        # Right column: status message returned by transcribe().
+        with gr.Column():
+            output = gr.Text()
+        # The inputs list must follow the positional parameter order of transcribe().
+        submit_btn.click(fn=transcribe, inputs=[mode,
+                                                channel_name,
+                                                num_videos,
+                                                hf_token,
+                                                hf_dataset_identifier,
+                                                whisper_model], outputs=[output])
+    gr.Markdown('''
+      ![visitors](https://visitor-badge.glitch.me/badge?page_id=juancopi81.whisper-youtube-2-hf_dataset)
+    ''')
+# Outside Colab, enable the request queue with a concurrency count of 1.
+if not is_colab:
+    demo.queue(concurrency_count=1)
+# Runs at import time; a public share link is requested only in Colab.
+demo.launch(debug=True, share=is_colab)

datapipeline.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from typing import Dict, List
+from pathlib import Path
+from sqlite3 import Cursor
+from utils import accepts_types, create_videos
+from preprocessing.youtubevideopreprocessor import YoutubeVideoPreprocessor
+from loading.loaderiterator import LoaderIterator
+from transforming.batchtransformer import BatchTransformer
+from storing.sqlitebatchvideostorer import SQLiteBatchVideoStorer
+from storing.sqlitecontextmanager import SQLiteContextManager
+from loading.serialization import JsonSerializer
+from transforming.addtitletransform import AddTitleTransform
+from transforming.adddescriptiontransform import AddDescriptionTransform
+from transforming.whispertransform import WhisperTransform
+class DataPipeline:
+    """Tie together the loader, the batch transformer and the storer.
+    Every batch of files goes through: load -> apply transform -> store.
+    """
+    def __init__(self,
+                 loader_iterator: LoaderIterator,
+                 batch_transformer: BatchTransformer,
+                 storer: SQLiteBatchVideoStorer,
+                 sqlite_context_manager: SQLiteContextManager) -> None:
+        # Collaborators are only stored here; the db is opened in process().
+        self.sqlite_context_manager = sqlite_context_manager
+        self.storer = storer
+        self.batch_transformer = batch_transformer
+        self.loader_iterator = loader_iterator
+    @accepts_types(list)
+    def process(self, load_paths: List[Path]) -> None:
+        """Load the given files batch by batch, transform them and store them.
+        Args:
+            load_paths: Paths of the files handed to the loader iterator.
+        """
+        loader = self.loader_iterator
+        loader.load_paths = load_paths
+        # One db session spans all batches of this call.
+        with self.sqlite_context_manager as cursor:
+            for batch in loader:
+                self._process_video_batch(cursor, batch)
+    def _process_video_batch(self,
+                             db_cursor: Cursor,
+                             video_data_batch: List[Dict]) -> None:
+        """Turn one batch of raw dicts into videos, transform and store them."""
+        self.storer.store(
+            db_cursor,
+            self.batch_transformer.apply(create_videos(video_data_batch)),
+        )
+def create_hardcoded_data_pipeline(db_path, whisper_model: str="base") -> DataPipeline:
+    """Factory that assembles a DataPipeline from fixed components.
+    The loader reads JSON files two at a time; the transforms add the title,
+    the description and the Whisper output, in that order.
+    TODO: Create DataPipeline so users can pass the args.
+    """
+    json_loader = LoaderIterator(serializer=JsonSerializer(),
+                                 num_files_per_iteration=2)
+    # Whisper transform using the requested model ("base" by default).
+    # TODO: Let user select this parameters.
+    transform_chain = [
+        AddTitleTransform(),
+        AddDescriptionTransform(),
+        WhisperTransform(model=whisper_model),
+    ]
+    transformer = BatchTransformer(transform_chain)
+    storer = SQLiteBatchVideoStorer()
+    db_session = SQLiteContextManager(db_path=db_path)
+    return DataPipeline(loader_iterator=json_loader,
+                        batch_transformer=transformer,
+                        storer=storer,
+                        sqlite_context_manager=db_session)

dataset/__init__.py ADDED Viewed

File without changes

dataset/hf_dataset.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# Adapted from Eduardo Matallanas
+from datasets import load_dataset, Dataset
+from datasets.data_files import EmptyDatasetError
+class HFDataset():
+  """
+  Wrapper around the Hugging Face dataset that stores the YouTube transcripts.
+  Attributes set by the constructor:
+    name: Repository id passed in.
+    dataset: What ``load_dataset(name)`` returned, or an empty ``Dataset``.
+    exist: False when the name is empty or the dataset was not found.
+    is_empty: True unless the dataset could be loaded.
+    list_of_ids: Video ids (text after the last "=" of each URL) already stored.
+  """
+  def __init__(self, name) -> None:
+    self.name = name
+    if name != "":
+      self._init_dataset()
+    else:
+      self._set_empty(exist=False)
+  def _set_empty(self, exist):
+    """Initialise every attribute for a dataset without content."""
+    self.dataset = Dataset.from_dict({})
+    self.exist = exist
+    self.is_empty = True
+    # Always defined, so callers can read it even when no name was given.
+    self.list_of_ids = []
+  def _init_dataset(self):
+    """Load the dataset; fall back to the empty state when it has no data or is missing."""
+    try:
+      self.dataset = load_dataset(self.name)
+    except EmptyDatasetError:
+      self._set_empty(exist=True)
+    except FileNotFoundError:
+      self._set_empty(exist=False)
+    else:
+      self.exist = True
+      self.is_empty = False
+      self.list_of_ids = self._get_list_of_id()
+  def upload(self):
+    """Push the wrapped dataset to the Hub under ``self.name``."""
+    self.dataset.push_to_hub(self.name)
+  def _get_list_of_id(self):
+    """Return the video id of every URL, across all splits, in split order."""
+    # Reading the column directly avoids building a mapped copy of each split.
+    return [url.split("=")[-1]
+            for split in self.dataset.values()
+            for url in split["URL"]]

errors.py ADDED Viewed

	@@ -0,0 +1,4 @@

+class DifferentNumberOfArgumentsError(Exception):
+    """Exception carrying a human-readable ``message``.
+    Args:
+        message: Description of the mismatch; also available via ``str()``.
+    """
+    def __init__(self, message: str) -> None:
+        # Forward to Exception so ``args`` and ``str()`` are populated even
+        # when the message is passed by keyword.
+        super().__init__(message)
+        self.message = message

loading/__init__.py ADDED Viewed

File without changes

loading/loaderiterator.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from pathlib import Path
+from typing import List, Dict, Optional
+from loading.serialization import Serializer
+class LoaderIterator:
+    """Iterator that loads data from multiple files in batches.
+    Args:
+        serializer: Object whose ``load(path)`` reads one file.
+        num_files_per_iteration: Maximum number of files per batch.
+        load_paths: Files to load; may be set later through ``load_paths``.
+            ``None`` is treated as "no files".
+    """
+    def __init__(self,
+                 serializer: "Serializer",
+                 num_files_per_iteration: int,
+                 load_paths: Optional[List[Path]] = None) -> None:
+        self.serializer = serializer
+        self.num_files_per_iteration = num_files_per_iteration
+        self._load_paths = load_paths
+        self._current_iteration = None
+    @property
+    def load_paths(self) -> Optional[List[Path]]:
+        return self._load_paths
+    @load_paths.setter
+    def load_paths(self, load_paths: List[Path]) -> None:
+        self._load_paths = load_paths
+    def __iter__(self):
+        # Restart from the first batch every time iteration begins.
+        self._current_iteration = 0
+        return self
+    def __next__(self) -> List[Dict]:
+        """Return the next batch; raise StopIteration when all files were visited."""
+        if self._current_iteration is None:
+            # next() was called without iter(); start from the first batch.
+            self._current_iteration = 0
+        if self._did_load_all_batches():
+            raise StopIteration
+        data_batch = self._load_data_batch()
+        self._current_iteration += 1
+        return data_batch
+    def _paths(self) -> List[Path]:
+        """Return the configured paths, with ``None`` mapped to an empty list."""
+        return self._load_paths if self._load_paths is not None else []
+    def _did_load_all_batches(self) -> bool:
+        return self._current_iteration >= len(self._paths()) / self.num_files_per_iteration
+    def _load_data_batch(self) -> List[Dict]:
+        """Load the files of the current batch, skipping paths that do not exist."""
+        start_index = self._current_iteration * self.num_files_per_iteration
+        stop_index = start_index + self.num_files_per_iteration
+        return [self.serializer.load(load_path) for load_path in
+                self._paths()[start_index:stop_index] if load_path.exists()]

loading/serialization.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import json
+from abc import ABC, abstractmethod
+from typing import Any
+from pathlib import Path
+class Serializer(ABC):
+    """Interface for objects that can write data to a path and read it back."""
+    @abstractmethod
+    def dump(self, obj: Any, save_path: Path) -> None:
+        """Persist ``obj`` at ``save_path``."""
+    @abstractmethod
+    def load(self, load_path: Path) -> Any:
+        """Read and return the object stored at ``load_path``."""
+class JsonSerializer(Serializer):
+    """Serializer that stores objects as JSON text files.
+    Args:
+        sort_keys: Forwarded to ``json.dump``.
+        indent: Forwarded to ``json.dump``.
+    """
+    def __init__(self,
+                 sort_keys: bool = True,
+                 indent: int = 4):
+        self.indent = indent
+        self.sort_keys = sort_keys
+    def dump(self, obj: Any, save_path: Path) -> None:
+        """Write ``obj`` as JSON to ``save_path``, replacing existing content."""
+        json_options = {"sort_keys": self.sort_keys, "indent": self.indent}
+        with open(save_path, "w") as handle:
+            json.dump(obj, handle, **json_options)
+    def load(self, load_path: Path) -> Any:
+        """Parse the JSON file at ``load_path`` and return the result."""
+        with open(load_path, "r") as handle:
+            parsed = json.load(handle)
+        return parsed

preprocessing/__init__.py ADDED Viewed

File without changes

preprocessing/youtubevideopreprocessor.py ADDED Viewed

	@@ -0,0 +1,88 @@

+from typing import List, Generator, Tuple
+from pathlib import Path
+from itertools import islice
+import scrapetube
+from youtubesearchpython import ChannelsSearch
+from pytube import Playlist
+from utils import accepts_types
+from loading.serialization import Serializer
+class YoutubeVideoPreprocessor:
+    """Create one JSON file per video of a YouTube channel or playlist.
+    Each JSON file has the following information:
+    - channel_name: The name of the YouTube channel (or the playlist title)
+    - url: The url of the video
+    Args:
+        mode (`str`):
+            "channel_name" to look a channel up by name, "playlist" to read a
+            playlist URL. Any other value makes `preprocess` return
+            placeholder test paths.
+        serializer (`Serializer`, *optional*):
+            Object whose `dump` writes each JSON file. Defaults to a
+            `JsonSerializer` instance.
+    TODO: Change it to accept also URL of video list, name of video list, etc.
+    """
+    def __init__(self,
+                 mode: str = "channel_name",
+                 serializer: "Optional[Serializer]" = None) -> None:
+        self.mode = mode
+        if serializer is None:
+            # The old default was the abstract `Serializer` class itself,
+            # whose `dump` cannot be called; fall back to a usable instance.
+            from loading.serialization import JsonSerializer
+            serializer = JsonSerializer()
+        self.serializer = serializer
+    def preprocess(self,
+                   name: str,
+                   num_videos: int,
+                   videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
+        """List the videos for `name` and write a JSON file for each new one.
+        Args:
+            name: Channel name or playlist URL, depending on `self.mode`.
+            num_videos: Maximum number of JSON files to create.
+            videos_in_ds: Video ids to skip.
+        Returns:
+            The paths of the created JSON files and the folder containing them.
+        """
+        if self.mode == "channel_name":
+            # TODO: Add credits
+            channels_search = ChannelsSearch(name, limit=1)
+            channel_id = channels_search.result()['result'][0]['id']
+            videos = scrapetube.get_channel(channel_id=channel_id)
+        elif self.mode == "playlist":
+            playlist_id = name.split("=")[-1]
+            # The playlist title replaces the URL as folder and channel name.
+            name = Playlist(name).title
+            videos = scrapetube.get_playlist(playlist_id)
+        else:
+            # TODO: implement this part
+            youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
+            test_files_folder = youtube_folder/"test/files"
+            return [Path("test.json"), Path("test1.json")], test_files_folder
+        return self._convert_videos_to_json_files(name,
+                                                  videos,
+                                                  num_videos,
+                                                  videos_in_ds)
+    def _convert_videos_to_json_files(self,
+                                      name:str,
+                                      videos: Generator,
+                                      num_videos: int,
+                                      videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
+        """Write `{i}.json` files for up to `num_videos` videos not yet stored."""
+        load_paths = []
+        youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
+        dataset_folder = youtube_folder/name
+        Path(dataset_folder).mkdir(parents=True, exist_ok=True)
+        # Set lookup keeps the skip test O(1) per video.
+        known_ids = set(videos_in_ds)
+        video_iterator = iter(videos)
+        while len(load_paths) < num_videos:
+            video = next(video_iterator, None)
+            if video is None:
+                # The channel / playlist has no more videos.
+                break
+            if video["videoId"] in known_ids:
+                continue
+            # Files are numbered by how many were written so far.
+            save_path = Path(dataset_folder, f"{len(load_paths)}.json")
+            save_path.touch(exist_ok=True)
+            video_dict = {"channel_name": name,
+                          "url":f"https://www.youtube.com/watch?v={video['videoId']}"}
+            self.serializer.dump(obj=video_dict, save_path=save_path)
+            load_paths.append(save_path)
+        return load_paths, dataset_folder

requirements.txt ADDED Viewed

	@@ -0,0 +1,54 @@

+aiohttp==3.8.3
+aiosignal==1.2.0
+anyio==3.6.2
+async-timeout==4.0.2
+attrs==22.1.0
+certifi==2022.9.24
+charset-normalizer==2.1.1
+datasets
+dill
+ffmpeg-python==0.2.0
+filelock==3.8.0
+frozenlist==1.3.1
+fsspec==2022.10.0
+future==0.18.2
+h11==0.12.0
+httpcore==0.15.0
+httpx==0.23.0
+huggingface-hub==0.11.0
+idna==3.4
+iniconfig==1.1.1
+more-itertools==9.0.0
+multidict==6.0.2
+multiprocess
+numpy==1.23.4
+packaging==21.3
+pandas==1.5.1
+pluggy==1.0.0
+py==1.11.0
+pyarrow==9.0.0
+pydantic==1.10.2
+pyparsing==3.0.9
+pytest==7.1.3
+python-dateutil==2.8.2
+pytube==12.1.0
+pytz==2022.5
+PyYAML==6.0
+regex==2022.9.13
+requests==2.28.1
+responses==0.18.0
+rfc3986==1.5.0
+scrapetube==2.3.1
+six==1.16.0
+sniffio==1.3.0
+tokenizers==0.13.1
+tomli==2.0.1
+torch==1.12.1
+tqdm==4.64.1
+transformers==4.23.1
+typing-extensions==4.4.0
+urllib3==1.26.12
+git+https://github.com/openai/whisper.git
+xxhash==3.1.0
+yarl==1.8.1
+youtube-search-python==1.6.6

storing/__init__.py ADDED Viewed

File without changes

storing/createdb.py ADDED Viewed

	@@ -0,0 +1,27 @@

+"""Simple script to create a sqlite db with a single table
+called 'VIDEO'.
+"""
+import sqlite3
+def create_db(db_path: str) -> None:
+    """Create an sqlite db with a single table called 'VIDEO'.
+    Args:
+        db_path: Path of the database file; created if it does not exist.
+    Raises:
+        sqlite3.OperationalError: If the table already exists in ``db_path``.
+    """
+    connection = sqlite3.connect(db_path)
+    try:
+        print(f"Created db successfully at '{db_path}'")
+        connection.execute(
+            '''
+            CREATE TABLE VIDEO
+            (ID INTEGER     PRIMARY KEY     AUTOINCREMENT,
+            CHANNEL_NAME    CHAR(30)        NOT NULL,
+            URL             TEXT            NOT NULL,
+            TITLE           CHAR(100),
+            DESCRIPTION     CHAR(5000),
+            TRANSCRIPTION   TEXT,
+            SEGMENTS        TEXT
+            )
+            '''
+        )
+        connection.commit()
+        print("'Video' table created successfully")
+    finally:
+        # Release the file handle; the connection used to be left open.
+        connection.close()
+if __name__ == "__main__":
+    create_db("video.db")

storing/sqlitebatchvideostorer.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import json
+import sqlite3
+from typing import List, Tuple
+from utils import accepts_types
+from video import YoutubeVideo
+class SQLiteBatchVideoStorer:
+    """Insert batches of video entries into a table of the db.
+    Args:
+        table: Name of the target table; interpolated into the INSERT
+            statement, so it must come from trusted code.
+    """
+    def __init__(self, table: str = "video"):
+        self.table = table
+    @accepts_types(sqlite3.Cursor, list)
+    def store(self,
+              db_cursor: sqlite3.Cursor,
+              videos: List[YoutubeVideo]) -> None:
+        """Batch insert list of videos in the configured table of the db.
+        Args:
+            db_cursor: Cursor used for the insert; committing is left to the caller.
+            videos: Videos to insert; they are not modified.
+        """
+        video_list = self._convert_videos_to_list(videos)
+        db_cursor.executemany(f"INSERT INTO {self.table}(channel_name, url, title, description, transcription, segments) VALUES(?, ?, ?, ?, ?, ?)",
+                              video_list)
+    @staticmethod
+    def _convert_videos_to_list(videos: List[YoutubeVideo]) -> List[Tuple[str, str, str, str, str, str]]:
+        """Return one tuple per video with `segments` serialized as JSON."""
+        import copy
+        rows = []
+        for video in videos:
+            # Serialize on a shallow copy so the caller's video keeps its
+            # original segments; storing the same video twice no longer
+            # double-encodes them.
+            serialized = copy.copy(video)
+            serialized.segments = json.dumps(video.segments)
+            rows.append(serialized.to_tuple())
+        return rows

storing/sqlitecontextmanager.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import sqlite3
+class SQLiteContextManager:
+    """Context manager for SQLite db, that handles
+    db open / closing connection.
+    Args:
+        db_path: Path of the database file to connect to.
+    """
+    def __init__(self, db_path: str) -> None:
+        self.db_path = db_path
+        self.connection = None
+    def __enter__(self):
+        """Establish connection with db and return cursor to be used
+        to execute queries.
+        """
+        self.connection = sqlite3.connect(self.db_path)
+        return self.connection.cursor()
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Commit queries and close db connection.
+        The commit also happens when the block raised, so statements
+        executed before the error are kept. Exceptions are not suppressed.
+        """
+        try:
+            self.connection.commit()
+        finally:
+            # Close even if the commit fails, so the connection never leaks.
+            self.connection.close()

test/__init__.py ADDED Viewed

File without changes

test/files/1.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "channel_name": "The verge",
+    "url": "https://www.youtube.com/watch?v=YMlTSmusEmA"
+}

test/files/2.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "channel_name": "The verge",
+    "url": "https://www.youtube.com/watch?v=Jzl0hHTc7Jw"
+}

test/files/3.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "channel_name": "The verge",
+    "url": "https://www.youtube.com/watch?v=gV50hpSKHFQ"
+}

test/files/4.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "channel_name": "The verge",
+    "url": "https://www.youtube.com/watch?v=N6ZyzoibXqg"
+}

test/files/5.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "channel_name": "The verge",
+    "url": "https://www.youtube.com/watch?v=q90v9FLXi1E"
+}

test/files/6.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "channel_name": "Tquotes",
+    "url": "https://www.youtube.com/watch?v=NSkoGZ8J1Ag"
+}

test/files/7.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "channel_name": "changminjen",
+    "url": "https://www.youtube.com/watch?v=Ak516vtDTEA"
+}

test/test_adddescriptiontransform.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from youtube_transcriber.transforming.adddescriptiontransform import AddDescriptionTransform
+from youtube_transcriber.video import YoutubeVideo
+def test_add_description_transform_init():
+    """A freshly built transform is exactly an AddDescriptionTransform."""
+    assert type(AddDescriptionTransform()) == AddDescriptionTransform
+def test_apply():
+    """Applying the transform fills in the description and keeps the other fields."""
+    source_video = YoutubeVideo(channel_name="changminjen",
+                                url="https://www.youtube.com/watch?v=Ak516vtDTEA")
+    result = AddDescriptionTransform().apply(source_video)
+    assert type(result) == YoutubeVideo
+    # Every field except the description must equal the input's value.
+    for field_name in ("channel_name", "url", "title", "transcription", "segments"):
+        assert getattr(result, field_name) == getattr(source_video, field_name)
+    assert result.description == "Anakin, my allegiance is to the Republic, to democracy! from Star Wars Episode III: Revenge of the Sith."

test/test_addtitletransform.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from youtube_transcriber.transforming.addtitletransform import AddTitleTransform
+from youtube_transcriber.video import YoutubeVideo
+def test_add_title_transform_init():
+    """A freshly built transform is exactly an AddTitleTransform."""
+    assert type(AddTitleTransform()) == AddTitleTransform
+def test_apply():
+    """Applying the transform fills in the title and keeps the other fields."""
+    source_video = YoutubeVideo(channel_name="Tquotes",
+                                url="https://www.youtube.com/watch?v=NSkoGZ8J1Ag")
+    result = AddTitleTransform().apply(source_video)
+    assert type(result) == YoutubeVideo
+    # Every field except the title must equal the input's value.
+    for field_name in ("channel_name", "url", "description", "transcription", "segments"):
+        assert getattr(result, field_name) == getattr(source_video, field_name)
+    assert result.title == "Steve Jobs quotes Bob Dylan"

test/test_batchtransformer.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import pytest
+from youtube_transcriber.transforming.addtitletransform import AddTitleTransform
+from youtube_transcriber.transforming.adddescriptiontransform import AddDescriptionTransform
+from youtube_transcriber.transforming.batchtransformer import BatchTransformer
+from youtube_transcriber.transforming.whispertransform import WhisperTransform
+from youtube_transcriber.video import YoutubeVideo
+@pytest.fixture
+def batch_transformer():
+    add_title_transform = AddTitleTransform()
+    add_description_transform = AddDescriptionTransform()
+    whisper_transform = WhisperTransform()
+    return BatchTransformer([add_title_transform,
+                             add_description_transform,
+                             whisper_transform])
+def test_batch_transform_init(batch_transformer):
+    assert type(batch_transformer) == BatchTransformer
+    assert len(batch_transformer.transforms) == 3
+    assert type(batch_transformer.transforms[2]) == WhisperTransform
+def test_apply_transforms(batch_transformer):
+    """Run two videos through every transform and compare each field with the literal expected values below."""
+    videos = [YoutubeVideo(channel_name="Tquotes",
+                           url="https://www.youtube.com/watch?v=NSkoGZ8J1Ag"),
+              YoutubeVideo(channel_name="changminjen",
+                           url="https://www.youtube.com/watch?v=Ak516vtDTEA")]
+    transformed_videos = batch_transformer.apply(videos)
+    assert len(transformed_videos) == 2
+    # First video: title and transcription are filled in, description is expected to be empty.
+    assert transformed_videos[0].channel_name == "Tquotes"
+    assert transformed_videos[0].url == "https://www.youtube.com/watch?v=NSkoGZ8J1Ag"
+    assert transformed_videos[0].title == "Steve Jobs quotes Bob Dylan"
+    assert transformed_videos[0].description == ""
+    assert transformed_videos[0].transcription == " Good morning. Good morning and welcome to Apple's 1984 annual shareholders meeting. I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan. That's Bob Dylan. Come writers and critics who prophesize with your pens and keep your eyes wide, the chance won't come again. And don't speak too soon for the wheels still in spin. And there's no telling who that it's naming. For the loser now will be later to win for the times they are a change in. Now."
+    assert transformed_videos[0].segments == [{'start': 0.0, 'end': 2.0, 'text': ' Good morning.'},
+                                              {'start': 2.0, 'end': 11.0, 'text': " Good morning and welcome to Apple's 1984 annual shareholders meeting."},
+                                              {'start': 11.0, 'end': 16.0, 'text': " I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan."},
+                                              {'start': 16.0, 'end': 18.0, 'text': " That's Bob Dylan."},
+                                              {'start': 18.0, 'end': 23.0, 'text': ' Come writers and critics who prophesize with your pens and keep your eyes wide,'},
+                                              {'start': 23.0, 'end': 25.0, 'text': " the chance won't come again."},
+                                              {'start': 25.0, 'end': 28.0, 'text': " And don't speak too soon for the wheels still in spin."},
+                                              {'start': 28.0, 'end': 30.0, 'text': " And there's no telling who that it's naming."},
+                                              {'start': 30.0, 'end': 36.0, 'text': ' For the loser now will be later to win for the times they are a change in.'},
+                                              {'start': 36.0, 'end': 51.0, 'text': ' Now.'}]
+    # Second video: title, description, transcription and segments are all populated.
+    assert transformed_videos[1].channel_name == "changminjen"
+    assert transformed_videos[1].url == "https://www.youtube.com/watch?v=Ak516vtDTEA"
+    assert transformed_videos[1].title == "My allegiance is to the Republic, to democracy!"
+    assert transformed_videos[1].description == "Anakin, my allegiance is to the Republic, to democracy! from Star Wars Episode III: Revenge of the Sith."
+    assert transformed_videos[1].transcription == " I have brought peace, freedom, justice and security to my new empire. Your new empire don't make me kill you. Anakin, my allegiance is to the Republic, to democracy! If you're not with me, then you're my enemy. Only a Sith deals an absolute."
+    assert transformed_videos[1].segments == [{'start': 0.0, 'end': 8.0, 'text': ' I have brought peace, freedom, justice and security to my new empire.'},
+                                              {'start': 8.0, 'end': 14.0, 'text': " Your new empire don't make me kill you."},
+                                              {'start': 14.0, 'end': 20.0, 'text': ' Anakin, my allegiance is to the Republic, to democracy!'},
+                                              {'start': 20.0, 'end': 26.0, 'text': " If you're not with me, then you're my enemy."},
+                                              {'start': 26.0, 'end': 31.0, 'text': ' Only a Sith deals an absolute.'}]

test/test_datapipeline.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import os
+import pytest
+import sqlite3
+from pathlib import Path
+from youtube_transcriber.datapipeline import DataPipeline
+from youtube_transcriber.datapipeline import create_hardcoded_data_pipeline
+from youtube_transcriber.preprocessing.youtubevideopreprocessor import YoutubeVideoPreprocessor
+from youtube_transcriber.loading.loaderiterator import LoaderIterator
+from youtube_transcriber.loading.serialization import JsonSerializer
+from youtube_transcriber.transforming.addtitletransform import AddTitleTransform
+from youtube_transcriber.transforming.adddescriptiontransform import AddDescriptionTransform
+from youtube_transcriber.transforming.whispertransform import WhisperTransform
+from youtube_transcriber.transforming.batchtransformer import BatchTransformer
+from youtube_transcriber.storing.sqlitebatchvideostorer import SQLiteBatchVideoStorer
+from youtube_transcriber.storing.sqlitecontextmanager import SQLiteContextManager
+from youtube_transcriber.storing.createdb import create_db
+@pytest.fixture
+def expected_db_output():
+    """Expected (channel_name, url, title, transcription) rows, in the column order the tests SELECT."""
+    return [
+        ("Tquotes",
+         "https://www.youtube.com/watch?v=NSkoGZ8J1Ag",
+         "Steve Jobs quotes Bob Dylan",
+         " Good morning. Good morning and welcome to Apple's 1984 annual shareholders meeting. I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan. That's Bob Dylan. Come writers and critics who prophesize with your pens and keep your eyes wide, the chance won't come again. And don't speak too soon for the wheels still in spin. And there's no telling who that it's naming. For the loser now will be later to win for the times they are a change in. Now."),
+        ("changminjen",
+         "https://www.youtube.com/watch?v=Ak516vtDTEA",
+         "My allegiance is to the Republic, to democracy!",
+         " I have brought peace, freedom, justice and security to my new empire. Your new empire don't make me kill you. Anakin, my allegiance is to the Republic, to democracy! If you're not with me, then you're my enemy. Only a Sith deals an absolute.")
+    ]
+@pytest.fixture
+def data_pipeline():
+    loader_iterator = LoaderIterator(JsonSerializer(), 2)
+    batch_transformer = BatchTransformer([AddTitleTransform(),
+                                          AddDescriptionTransform(),
+                                          WhisperTransform()])
+    video_storer = SQLiteBatchVideoStorer()
+    sqlite_context_manager = SQLiteContextManager("dummy.db")
+    return DataPipeline(loader_iterator,
+                        batch_transformer,
+                        video_storer,
+                        sqlite_context_manager)
+def test_datapipeline_init():
+    data_pipeline = DataPipeline("loader_iterator",
+                                 "transformer",
+                                 "storer",
+                                 "context")
+    assert type(data_pipeline) == DataPipeline
+    assert data_pipeline.loader_iterator == "loader_iterator"
+    assert data_pipeline.batch_transformer == "transformer"
+    assert data_pipeline.storer == "storer"
+    assert data_pipeline.sqlite_context_manager == "context"
+def test_process_files(data_pipeline, expected_db_output):
+    test_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber/test"
+    files = [Path(test_folder/"files/6.json"), Path(test_folder/"files/7.json")]
+    try:
+        create_db("dummy.db")
+        connection = sqlite3.connect("dummy.db")
+        cursor = connection.cursor()
+        data_pipeline.process(files)
+        cursor.execute("SELECT CHANNEL_NAME, URL, TITLE, TRANSCRIPTION FROM VIDEO")
+        videos = cursor.fetchall()
+        for i in range(len(videos)):
+            assert videos[i][0] == expected_db_output[i][0]
+            assert videos[i][1] == expected_db_output[i][1]
+            assert videos[i][2] == expected_db_output[i][2]
+            assert videos[i][3] == expected_db_output[i][3]
+    finally:
+        os.remove("dummy.db")
+def test_process_video_batch(data_pipeline, expected_db_output):
+    video_data = [
+        {
+            "channel_name": "Tquotes",
+            "url": "https://www.youtube.com/watch?v=NSkoGZ8J1Ag",
+        },
+        {
+            "channel_name": "changminjen",
+            "url": "https://www.youtube.com/watch?v=Ak516vtDTEA",
+        }
+    ]
+    try:
+        create_db("dummy.db")
+        connection = sqlite3.connect("dummy.db")
+        cursor = connection.cursor()
+        data_pipeline._process_video_batch(cursor, video_data)
+        cursor.execute("SELECT CHANNEL_NAME, URL, TITLE, TRANSCRIPTION FROM VIDEO")
+        videos = cursor.fetchall()
+        for i in range(len(videos)):
+            assert videos[i][0] == expected_db_output[i][0]
+            assert videos[i][1] == expected_db_output[i][1]
+            assert videos[i][2] == expected_db_output[i][2]
+            assert videos[i][3] == expected_db_output[i][3]
+    finally:
+        os.remove("dummy.db")
+def test_hardcoded_data_pipeline_is_instantiated():
+    data_pipeline = create_hardcoded_data_pipeline()
+    assert type(data_pipeline) == DataPipeline

test/test_hfdataset.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import pytest
+from youtube_transcriber.dataset.hf_dataset import HFDataset
+@pytest.fixture
+def hf_test_dataset():
+    hf_dataset = HFDataset("Whispering-GPT/test_whisper")
+    return hf_dataset
+def test_hf_dataset_init(hf_test_dataset):
+    assert hf_test_dataset.exist == True
+    assert hf_test_dataset.is_empty == False
+def test_get_list_of_ids(hf_test_dataset):
+    expected_list = ["oTUu82C9Fxo", "Rt1rj9uZPoc", "HFyV-bKlY64", "tXQoFOepbf0"]
+    list_of_ids = hf_test_dataset.list_of_ids
+    assert list_of_ids[0] == expected_list[0]
+    assert list_of_ids[1] == expected_list[1]
+    assert list_of_ids[2] == expected_list[2]
+    assert list_of_ids[3] == expected_list[3]

test/test_loaderiterator.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from pathlib import Path
+import pytest
+from youtube_transcriber.loading.loaderiterator import LoaderIterator
+from youtube_transcriber.loading.serialization import JsonSerializer
+@pytest.fixture
+def loader_iterator():
+    test_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber/test"
+    paths = [Path(test_folder/"files/1.json"), Path(test_folder/"files/2.json"),
+             Path("non-existing-path"), Path(test_folder/"files/3.json"),
+             Path(test_folder/"files/4.json"), Path(test_folder/"files/5.json")]
+    return LoaderIterator(JsonSerializer(), 2, paths)
+def test_loader_iterator_init():
+    loader_iterator = LoaderIterator(JsonSerializer(), 3, "dummy_paths")
+    assert type(loader_iterator) == LoaderIterator
+    assert type(loader_iterator.serializer) == JsonSerializer
+    assert loader_iterator.load_paths == "dummy_paths"
+    assert loader_iterator.num_files_per_iteration == 3
+def test_loop_through_loaded_data(loader_iterator):
+    expected_data = [
+        [
+            {
+                "channel_name": "The verge",
+                "url": "https://www.youtube.com/watch?v=YMlTSmusEmA"
+            },
+            {
+                "channel_name": "The verge",
+                "url": "https://www.youtube.com/watch?v=Jzl0hHTc7Jw"
+            }
+        ],
+        [
+            {
+                "channel_name": "The verge",
+                "url": "https://www.youtube.com/watch?v=gV50hpSKHFQ"
+            }
+        ],
+        [
+            {
+                "channel_name": "The verge",
+                "url": "https://www.youtube.com/watch?v=N6ZyzoibXqg"
+            },
+            {
+                "channel_name": "The verge",
+                "url": "https://www.youtube.com/watch?v=q90v9FLXi1E"
+            }
+        ]
+    ]
+    for i, data in enumerate(loader_iterator):
+        assert data == expected_data[i]

test/test_sqlitebatchvideostorer.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import os
+from collections import OrderedDict
+import sqlite3
+import pytest
+from youtube_transcriber.storing.createdb import create_db
+from youtube_transcriber.storing.sqlitebatchvideostorer import SQLiteBatchVideoStorer
+from youtube_transcriber.video import YoutubeVideo
+@pytest.fixture
+def videos():
+    """Two fully populated YoutubeVideo objects whose segments are given as OrderedDicts."""
+    return [YoutubeVideo(channel_name="Tquotes", url="https://www.youtube.com/watch?v=NSkoGZ8J1Ag",
+                         title="Steve Jobs quotes Bob Dylan", description="",
+                         transcription=" Good morning. Good morning and welcome to Apple's 1984 annual shareholders meeting. I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan. That's Bob Dylan. Come writers and critics who prophesize with your pens and keep your eyes wide, the chance won't come again. And don't speak too soon for the wheels still in spin. And there's no telling who that it's naming. For the loser now will be later to win for the times they are a change in. Now.",
+                         segments=[OrderedDict({'start': 0.0, 'end': 2.0, 'text': ' Good morning.'})]),
+            YoutubeVideo(channel_name="changminjen", url="https://www.youtube.com/watch?v=Ak516vtDTEA",
+                         title="My allegiance is to the Republic, to democracy!", description="Anakin, my allegiance is to the Republic, to democracy! from Star Wars Episode III: Revenge of the Sith.",
+                         transcription=" I have brought peace, freedom, justice and security to my new empire. Your new empire dont make me kill you. Anakin, my allegiance is to the Republic, to democracy! If you're not with me, then you're my enemy. Only a Sith deals an absolute.",
+                         segments=[OrderedDict({'start': 0.0, 'end': 8.0, 'text': ' I have brought peace, freedom, justice and security to my new empire.'}),
+                                   OrderedDict({'start': 8.0, 'end': 14.0, 'text': " Your new empire dont make me kill you."})])]
+@pytest.fixture
+def expected_video_list():
+    """Expected row tuples for the `videos` fixture; the last element of each tuple is the segments list as a JSON string."""
+    return [("Tquotes", "https://www.youtube.com/watch?v=NSkoGZ8J1Ag", "Steve Jobs quotes Bob Dylan", "",
+             " Good morning. Good morning and welcome to Apple's 1984 annual shareholders meeting. I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan. That's Bob Dylan. Come writers and critics who prophesize with your pens and keep your eyes wide, the chance won't come again. And don't speak too soon for the wheels still in spin. And there's no telling who that it's naming. For the loser now will be later to win for the times they are a change in. Now.",
+             '[{"start": 0.0, "end": 2.0, "text": " Good morning."}]'),
+            ("changminjen", "https://www.youtube.com/watch?v=Ak516vtDTEA", "My allegiance is to the Republic, to democracy!", "Anakin, my allegiance is to the Republic, to democracy! from Star Wars Episode III: Revenge of the Sith.",
+             " I have brought peace, freedom, justice and security to my new empire. Your new empire dont make me kill you. Anakin, my allegiance is to the Republic, to democracy! If you're not with me, then you're my enemy. Only a Sith deals an absolute.",
+             '[{"start": 0.0, "end": 8.0, "text": " I have brought peace, freedom, justice and security to my new empire."}, {"start": 8.0, "end": 14.0, "text": " Your new empire dont make me kill you."}]')]
+def test_sqlite_batch_video_storer_init():
+    video_storer = SQLiteBatchVideoStorer("table")
+    assert type(video_storer) == SQLiteBatchVideoStorer
+    assert video_storer.table == "table"
+def test_convert_videos_to_list(videos, expected_video_list):
+    videos_list = SQLiteBatchVideoStorer._convert_videos_to_list(videos)
+    assert videos_list == expected_video_list
+def test_videos_are_insterted_in_db(videos, expected_video_list):
+    try:
+        create_db("dummy.db")
+        video_storer = SQLiteBatchVideoStorer("video")
+        connection = sqlite3.connect("dummy.db")
+        cursor = connection.cursor()
+        video_storer.store(cursor, videos)
+        cursor.execute("SELECT CHANNEL_NAME, URL, TITLE, DESCRIPTION, TRANSCRIPTION, SEGMENTS FROM VIDEO")
+        videos = cursor.fetchall()
+        assert videos == expected_video_list
+    finally:
+        os.remove("dummy.db")

test/test_sqlitecontextmanager.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import os
+from sqlite3 import Cursor
+from youtube_transcriber.storing.sqlitecontextmanager import SQLiteContextManager
+def test_sqlite_context_manager_init():
+    sqlite_context_manager = SQLiteContextManager("dummyinit.db")
+    assert type(sqlite_context_manager) == SQLiteContextManager
+def test_enter_context_manager():
+    with SQLiteContextManager("dummy.db") as cursor:
+        assert type(cursor) == Cursor
+    os.remove("dummy.db")

test/test_utils.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from youtube_transcriber.video import YoutubeVideo
+from youtube_transcriber.utils import create_videos
+from youtube_transcriber.utils import nest_list
+def test_create_videos():
+    video_params = [
+        {'channel_name': 'MrBeast Shorts', 'url': 'https://www.youtube.com/watch?v=mJ4t7iNF86g'},
+        {'channel_name': 'MrBeast Shorts', 'url': 'https://www.youtube.com/watch?v=UPhxU9J46Qk'}
+    ]
+    videos = create_videos(video_params)
+    assert len(videos) == 2
+    assert type(videos[0]) == YoutubeVideo
+    assert videos[1].url == "https://www.youtube.com/watch?v=UPhxU9J46Qk"
+def test_nest_list():
+    l = [0, 1, 2, 3, 4, 5]
+    nested_l = nest_list(l, 6)
+    assert nested_l == [[0, 1, 2, 3, 4, 5]]
+    nested_l = nest_list(l, 5)
+    assert nested_l == [[0, 1, 2, 3, 4], [5]]
+    nested_l = nest_list(l, 4)
+    assert nested_l == [[0, 1, 2, 3], [4, 5]]
+    nested_l = nest_list(l, 3)
+    assert nested_l == [[0, 1, 2], [3, 4, 5]]
+    nested_l = nest_list(l, 2)
+    assert nested_l == [[0, 1], [2, 3], [4, 5]]
+    nested_l = nest_list(l, 1)
+    assert nested_l == [[0], [1], [2], [3], [4], [5]]

test/test_video.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import pytest
+from youtube_transcriber.video import YoutubeVideo
+def test_youtube_video_init():
+    video = YoutubeVideo(channel_name="The verge",
+                         url="https://www.youtube.com/watch?v=Jzl0hHTc7Jw",
+                         title="Pixel 7 Pro and 7 hands-on: more of the same",
+                         description="Google’s Pixel 7 and 7 Pro...",
+                         transcription=" Seven years ago, we set out...",
+                         segments=[{"start": 0.0, "end": 1.3, "text": " Seven years ago"},
+                                   {"start": 1.3, "end": 2.3, "text": " we set out..."}])
+    assert type(video) == YoutubeVideo
+    assert video.channel_name == "The verge"
+    assert video.url == "https://www.youtube.com/watch?v=Jzl0hHTc7Jw"
+    assert video.title == "Pixel 7 Pro and 7 hands-on: more of the same"
+    assert video.description == "Google’s Pixel 7 and 7 Pro..."
+    assert video.transcription == " Seven years ago, we set out..."
+    assert video.segments == [{"start": 0.0, "end": 1.3, "text": " Seven years ago"},
+                              {"start": 1.3, "end": 2.3, "text": " we set out..."}]
+def test_youtube_video_to_tuple():
+    video = YoutubeVideo(channel_name="The verge",
+                         url="https://www.youtube.com/watch?v=Jzl0hHTc7Jw",
+                         title="Pixel 7 Pro and 7 hands-on: more of the same",
+                         description="Google’s Pixel 7 and 7 Pro...",
+                         transcription=" Seven years ago, we set out...",
+                         segments=[{"start": 0.0, "end": 1.3, "text": " Seven years ago"},
+                                   {"start": 1.3, "end": 2.3, "text": " we set out..."}])
+    video_tuple = video.to_tuple()
+    assert len(video_tuple) == 6
+    assert type(video_tuple) == tuple
+    assert video_tuple[0] == "The verge"
+    assert video_tuple[1] == "https://www.youtube.com/watch?v=Jzl0hHTc7Jw"
+    assert video_tuple[2] == "Pixel 7 Pro and 7 hands-on: more of the same"
+    assert video_tuple[3] == "Google’s Pixel 7 and 7 Pro..."
+    assert video_tuple[4] == " Seven years ago, we set out..."
+    assert video_tuple[5] == [{"start": 0.0, "end": 1.3, "text": " Seven years ago"},
+                              {"start": 1.3, "end": 2.3, "text": " we set out..."}]

test/test_whispertransform.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from youtube_transcriber.transforming.whispertransform import WhisperTransform
+from youtube_transcriber.video import YoutubeVideo
+def test_whisper_transform_init():
+    transcriber = WhisperTransform()
+    assert type(transcriber) == WhisperTransform
+    # TODO: Check if loaded model is 'base'
+    assert transcriber.without_timestamps == False
+def test_apply():
+    """Transcribe one video and compare transcription and segments with the literal expected values below."""
+    transcriber = WhisperTransform()
+    raw_video = YoutubeVideo(channel_name="Tquotes",
+                             url="https://www.youtube.com/watch?v=NSkoGZ8J1Ag")
+    transcribed_video = transcriber.apply(raw_video)
+    assert type(transcribed_video) == YoutubeVideo
+    # Fields the Whisper transform does not produce must equal the input's values.
+    assert transcribed_video.channel_name == raw_video.channel_name
+    assert transcribed_video.url == raw_video.url
+    assert transcribed_video.title == raw_video.title
+    assert transcribed_video.description == raw_video.description
+    assert transcribed_video.transcription == " Good morning. Good morning and welcome to Apple's 1984 annual shareholders meeting. I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan. That's Bob Dylan. Come writers and critics who prophesize with your pens and keep your eyes wide, the chance won't come again. And don't speak too soon for the wheels still in spin. And there's no telling who that it's naming. For the loser now will be later to win for the times they are a change in. Now."
+    assert transcribed_video.segments == [{'start': 0.0, 'end': 2.0, 'text': ' Good morning.'},
+                                          {'start': 2.0, 'end': 11.0, 'text': " Good morning and welcome to Apple's 1984 annual shareholders meeting."},
+                                          {'start': 11.0, 'end': 16.0, 'text': " I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan."},
+                                          {'start': 16.0, 'end': 18.0, 'text': " That's Bob Dylan."},
+                                          {'start': 18.0, 'end': 23.0, 'text': ' Come writers and critics who prophesize with your pens and keep your eyes wide,'},
+                                          {'start': 23.0, 'end': 25.0, 'text': " the chance won't come again."},
+                                          {'start': 25.0, 'end': 28.0, 'text': " And don't speak too soon for the wheels still in spin."},
+                                          {'start': 28.0, 'end': 30.0, 'text': " And there's no telling who that it's naming."},
+                                          {'start': 30.0, 'end': 36.0, 'text': ' For the loser now will be later to win for the times they are a change in.'},
+                                          {'start': 36.0, 'end': 51.0, 'text': ' Now.'}]

test/test_youtubevideopreprocessor.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from pathlib import Path
+import pytest
+import os
+from youtube_transcriber.preprocessing.youtubevideopreprocessor import YoutubeVideoPreprocessor
+from youtube_transcriber.loading.serialization import JsonSerializer
+@pytest.fixture
+def youtube_video_preprocessor():
+    yt_video_preprocessor = YoutubeVideoPreprocessor(mode="channel_name",
+                                                     serializer=JsonSerializer())
+    load_paths, dataset_folder = yt_video_preprocessor.preprocess(name="Best Shorts Quotes",
+                                                                  num_videos=2,
+                                                                  videos_in_ds=["GU2_xlNCJrA"])
+    return load_paths, dataset_folder
+@pytest.fixture
+def expected_file_paths():
+    youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
+    expected_dir = youtube_folder/"Best Shorts Quotes"
+    return [expected_dir/"0.json", expected_dir/"1.json"]
+@pytest.fixture
+def expected_folder_path():
+    youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
+    expected_dir = youtube_folder/"Best Shorts Quotes"
+    return expected_dir
+def test_youtube_video_preprocessor_init():
+    yt_video_preprocessor = YoutubeVideoPreprocessor(mode="channel_name",
+                                                     serializer=JsonSerializer())
+    assert type(yt_video_preprocessor) == YoutubeVideoPreprocessor
+    assert type(yt_video_preprocessor.serializer) == JsonSerializer
+    assert yt_video_preprocessor.mode == "channel_name"
+def test_created_file(youtube_video_preprocessor, expected_file_paths):
+    paths, _ = youtube_video_preprocessor
+    for path in paths:
+        assert os.path.exists(expected_file_paths[0]) == True
+        assert os.path.exists(expected_file_paths[1]) == True
+def test_created_folder(youtube_video_preprocessor, expected_folder_path):
+    _, folder = youtube_video_preprocessor
+    assert folder == expected_folder_path
+def test_loop_through_created_files(youtube_video_preprocessor):
+    expected_data = [
+        {
+            "channel_name": "Best Shorts Quotes",
+            "url": "https://www.youtube.com/watch?v=GU2_xlNCJrA"
+        },
+        {
+            "channel_name": "Best Shorts Quotes",
+            "url": "https://www.youtube.com/watch?v=ttRI4EmmxkY"
+        }
+    ]
+    paths, folder = youtube_video_preprocessor
+    for i, path in enumerate(paths):
+        serializer = JsonSerializer()
+        assert serializer.load(path) == expected_data[i]

threadeddatapipeline.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import threading
+import logging
+from pathlib import Path
+from typing import List
+from datapipeline import DataPipeline
+logging.basicConfig(level=logging.INFO, format="(%(threadName)-5s) %(message)s")
+class ThreadedDataPipeline(threading.Thread):
+    """Class that wraps a data pipeline in a thread."""
+    def __init__(self,
+                 data_pipeline: DataPipeline,
+                 load_paths: List[Path],
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.data_pipeline = data_pipeline
+        self.load_paths = load_paths
+    def run(self) -> None:
+        logging.info("Started processing data.")
+        self.data_pipeline.process(self.load_paths)
+        logging.info("Finished processing data.")

transforming/__init__.py ADDED Viewed

File without changes

transforming/adddescriptiontransform.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from typing import Any
+from pytube import YouTube
+from video import YoutubeVideo
+from utils import accepts_types
+from transforming.transform import Transform
+class AddDescriptionTransform(Transform):
+    """
+    Transform a Video object using PyTube. Adds title to YouTube video DTO.
+    It's a concrete Transform.
+    """
+    @accepts_types(YoutubeVideo)
+    def apply(self, video: YoutubeVideo) -> YoutubeVideo:
+        yt = YouTube(video.url)
+        video_With_description_params = {
+            "channel_name": video.channel_name,
+            "url": video.url,
+            "title": video.title,
+            "description": self._get_video_description(yt),
+            "transcription": video.transcription,
+            "segments": video.segments
+        }
+        return YoutubeVideo(**video_With_description_params)
+    def _get_video_description(self, yt: Any) -> str:
+        return str(yt.description)

transforming/addtitletransform.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from typing import Any
+from pytube import YouTube
+from video import YoutubeVideo
+from utils import accepts_types
+from transforming.transform import Transform
+class AddTitleTransform(Transform):
+    """
+    Transform a Video object using PyTube. Adds title to YouTube video DTO.
+    It's a concrete Transform.
+    """
+    @accepts_types(YoutubeVideo)
+    def apply(self, video: YoutubeVideo) -> YoutubeVideo:
+        yt = YouTube(video.url)
+        video_With_title_params = {
+            "channel_name": video.channel_name,
+            "url": video.url,
+            "title": self._get_video_title(yt),
+            "description": video.description,
+            "transcription": video.transcription,
+            "segments": video.segments
+        }
+        return YoutubeVideo(**video_With_title_params)
+    def _get_video_title(self, yt: Any) -> str:
+            return str(yt.title)

transforming/batchtransformer.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from typing import List
+from video import YoutubeVideo
+from transforming.transform import Transform
+from utils import accepts_types
+class BatchTransformer:
+    """Class that applies multiple transforms to YouTube video object."""
+    def __init__(self, transforms: List[Transform]) -> None:
+        self._transforms = transforms
+    @property
+    def transforms(self) -> List[Transform]:
+        return self._transforms
+    @transforms.setter
+    def transforms(self, transforms: List[Transform]) -> None:
+        self._transforms = transforms
+    @accepts_types(list)
+    def apply(self, videos: List[YoutubeVideo]) -> List[YoutubeVideo]:
+        for transform in self._transforms:
+            videos = list(map(transform.apply, videos))
+        return videos

transforming/transform.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from abc import ABC, abstractmethod
+from video import YoutubeVideo
+class Transform(ABC):
+    """Interface for concrete Transforms, each of which transforms a video object.
+    `apply` is marked abstract, so a subclass has to override it before it can be instantiated."""
+    @abstractmethod
+    def apply(self, video: YoutubeVideo) -> YoutubeVideo:
+        """Apply a transform to a video. Method must be implemented by
+        concrete transforms."""

transforming/whispertransform.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import os
+from pathlib import Path
+from typing import Any
+from collections import OrderedDict
+from pytube import YouTube
+import whisper
+from transforming.transform import Transform
+from video import YoutubeVideo
+from utils import accepts_types
+class WhisperTransform(Transform):
+    """
+    Transform a Video object using Whisper model. It's a
+    concrete Transform.
+    Args:
+        model (`str`):
+            Size of Whisper model. Can be tiny, base (default), small, medium, and large.
+        without_timestamps (`bool`, defaults to `False`):
+            To add phrase-level timestamps.
+    """
+    def __init__(self, model: str="base", without_timestamps: bool=False) -> None:
+        self.model = whisper.load_model(model)
+        self.without_timestamps = without_timestamps
+    @accepts_types(YoutubeVideo)
+    def apply(self, video: YoutubeVideo) -> YoutubeVideo:
+        """Creates a new video with transcriptions created by Whisper.
+        """
+        # Create a YouTube object
+        yt = YouTube(video.url)
+        # Get audio from video
+        try:
+            audio_file = self._get_audio_from_video(yt)
+        except Exception as e:
+            print(f"Exception: {e}")
+        result = self.model.transcribe(audio_file,
+                                       without_timestamps=self.without_timestamps)
+        transcription = result["text"]
+        data = []
+        for seg in result['segments']:
+            data.append(OrderedDict({'start': seg['start'], 'end': seg['end'],'text': seg['text']}))
+        os.remove(audio_file)
+        return YoutubeVideo(channel_name = video.channel_name,
+                            url = video.url,
+                            title = video.title,
+                            description = video.description,
+                            transcription = transcription,
+                            segments = data)
+    def _get_audio_from_video(self, yt: Any) -> Path:
+        # TODO: Add credits
+        video = yt.streams.filter(only_audio=True).first()
+        out_file = video.download(output_path=".")
+        base, _ = os.path.splitext(out_file)
+        new_file = base + ".mp3"
+        os.rename(out_file, new_file)
+        return new_file

utils.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from typing import Dict, List
+from video import YoutubeVideo
+from errors import DifferentNumberOfArgumentsError
+def accepts_types(*expected_types):
+    """Decorator that checks that the arguments of a method are valid.
+    :raise TypeError: If type of argument isn't valid
+    :raise DifferentNumberOfArgumentsError: If number of arguments passed to the
+        decorator and to the method (minus self) aren't the same
+    """
+    def check_types(func):
+        def wrapper(*args, **kwargs):
+            args_without_self = args[1:]
+            _raise_error_if_number_of_passed_and_expected_arguments_dont_match(args_without_self, expected_types)
+            _raise_type_error_if_passed_and_expected_types_dont_match(args_without_self, expected_types)
+            return func(*args, **kwargs)
+        return wrapper
+    return check_types
+def _raise_error_if_number_of_passed_and_expected_arguments_dont_match(passed_args, expected_types):
+    if len(passed_args) != len(expected_types):
+        msg = "Number of arguments passed in decorator " \
+              f"{len(expected_types)} doesn't match with number of " \
+              f"arguments in method, i.e., {len(passed_args)}"
+        raise DifferentNumberOfArgumentsError(msg)
+def _raise_type_error_if_passed_and_expected_types_dont_match(passed_args, expected_types):
+    for (arg, expected_type) in zip(passed_args, expected_types):
+        if not isinstance(arg, expected_type):
+            raise TypeError(f"Argument '{arg}' is of type {type(arg)}. "
+                            f"'{expected_type}' expected instead")
+def create_videos(video_parameters: List[Dict]) -> List[YoutubeVideo]:
+    """Factory function that creates a list of YoutubeVideos from a list of
+    dictionaries representing video parameters
+    """
+    youtube_videos = []
+    for params in video_parameters:
+        youtube_video = YoutubeVideo(channel_name=params["channel_name"],
+                                     url=params["url"])
+        youtube_videos.append(youtube_video)
+    return youtube_videos
+def nest_list(list: list, nested_list_length: int) -> List[List]:
+    new_list = []
+    nested_list = []
+    for item in list:
+        nested_list.append(item)
+        if len(nested_list) == nested_list_length:
+            new_list.append(nested_list)
+            nested_list = []
+    if len(nested_list) != 0:
+        new_list.append(nested_list)
+    return new_list
+def is_google_colab():
+    try:
+        import google.colab
+        return True
+    except:
+        return False

video.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from typing import Dict, List, Optional, Tuple
+from pydantic import BaseModel
+class YoutubeVideo(BaseModel):
+    """This class represent a YouTube video entry
+    """
+    channel_name: str
+    url: str
+    title: Optional[str]
+    description: Optional[str]
+    transcription: Optional[str]
+    segments: Optional[List[Dict]] = None
+    def to_tuple(self) -> Tuple:
+        """Convert TranscribedVideo object to a tuple of the type:
+        (channel_name, url, title, description, transcription, segments).
+        """
+        return (self.channel_name, self.url, self.title,
+                self.description, self.transcription, self.segments)