juancopi81 committed on
Commit
7288748
0 Parent(s):

Duplicate from Whispering-GPT/whisper-youtube-2-hf_dataset

Browse files
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # vscode
132
+ .vscode/
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Whisper-youtube-2-hf Dataset
3
+ emoji: 📚
4
+ colorFrom: purple
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 3.10.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: openrail
11
+ duplicated_from: Whispering-GPT/whisper-youtube-2-hf_dataset
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import argparse
4
+ import sqlite3
5
+ import shutil
6
+ import uuid
7
+
8
+ from datasets import Dataset, concatenate_datasets
9
+ import gradio as gr
10
+ import torch
11
+
12
+ from storing.createdb import create_db
13
+ from preprocessing.youtubevideopreprocessor import YoutubeVideoPreprocessor
14
+ from loading.serialization import JsonSerializer
15
+ from utils import nest_list, is_google_colab
16
+ from datapipeline import create_hardcoded_data_pipeline
17
+ from threadeddatapipeline import ThreadedDataPipeline
18
+ from dataset.hf_dataset import HFDataset
19
+ from huggingface_hub import DatasetCard
20
+
21
# Number of data pipelines / worker threads created per transcription run.
NUM_THREADS = 1

# Detect if code is running in Colab
is_colab = is_google_colab()
# HTML snippet inviting users to open the Colab notebook; empty when the code
# is already running inside Colab.
colab_instruction = "" if is_colab else """
<p>You can skip the queue using Colab:
<a href="https://colab.research.google.com/drive/1zNRnX1lXjlGtBMW8U8S9t4eY1cA0D6lm?usp=sharing">
<img data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" src="https://colab.research.google.com/assets/colab-badge.svg"></a></p>"""
# Label of the compute device, rendered in the header of the Gradio page.
device_print = "GPU 🔥" if torch.cuda.is_available() else "CPU 🥶"
30
+
31
def numvideos_type(x):
    """Parse and validate the value given to ``--num_videos``.

    Intended to be used as an ``argparse`` ``type=`` callable.

    Args:
        x: Raw command-line value; anything accepted by ``int()``.

    Returns:
        The value converted to ``int``, guaranteed to lie in ``[1, 12]``.

    Raises:
        argparse.ArgumentTypeError: If the value is below 1 or above 12.
        ValueError: If the value cannot be converted to ``int``.
    """
    x = int(x)
    if x > 12:
        raise argparse.ArgumentTypeError("Maximum number of videos is 12")
    if x < 1:
        raise argparse.ArgumentTypeError("Minimum number of videos is 1")
    return x
38
+
39
def parse_args():
    """Parse the command-line arguments of the transcription program.

    All five options are mandatory.

    Returns:
        The ``argparse.Namespace`` with ``channel_name``, ``num_videos``,
        ``hf_token``, ``hf_dataset_identifier`` and ``whisper_model``.
    """
    parser = argparse.ArgumentParser(
        usage="[arguments] --channel_name --num_videos",
        description="Program to transcribe YouTube videos.",
    )

    # (flag, keyword arguments) pairs, registered in this exact order.
    argument_specs = [
        ("--channel_name", {
            "type": str,
            "help": "Name of the channel from where the videos will be transcribed",
        }),
        ("--num_videos", {
            "type": numvideos_type,
            "help": "Number of videos (min. 1 - max. 12) to transcribe from --channel_name",
        }),
        ("--hf_token", {
            "type": str,
            "help": "Token of your HF account. You need a HF account to upload the dataset",
        }),
        ("--hf_dataset_identifier", {
            "type": str,
            "help": "The ID of the repository to push to in the following format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.",
        }),
        ("--whisper_model", {
            "type": str,
            "help": "Select one of the available whispers models",
            "choices": ["tiny", "base", "small", "medium", "large"],
        }),
    ]
    for flag, options in argument_specs:
        parser.add_argument(flag, required=True, **options)

    return parser.parse_args()
66
+
67
def transcribe(mode: str,
               channel_name: str,
               num_videos: int,
               hf_token: str,
               hf_dataset_identifier: str,
               whisper_model: str) -> str:
    """Transcribe YouTube videos and push them to a Hugging Face dataset.

    The videos are listed by ``YoutubeVideoPreprocessor``, processed by
    ``NUM_THREADS`` data pipelines that write into a temporary SQLite
    database, and the resulting rows are appended to the dataset
    ``hf_dataset_identifier`` (or uploaded as a new dataset).

    Args:
        mode: Forwarded to ``YoutubeVideoPreprocessor`` to select the video source.
        channel_name: Channel name or playlist URL handed to the preprocessor.
        num_videos: Maximum number of new videos to process.
        hf_token: Hugging Face token used for the uploads.
        hf_dataset_identifier: Repository id of the dataset on the Hub.
        whisper_model: Name of the Whisper model used by the pipelines.

    Returns:
        A human-readable summary with the number of samples added.
    """
    query = "SELECT CHANNEL_NAME, URL, TITLE, DESCRIPTION, TRANSCRIPTION, SEGMENTS FROM VIDEO"

    # Create a unique name for the database
    database_name = str(uuid.uuid4()) + ".db"
    dataset_folder = None
    connection = None

    try:
        create_db(database_name)

        # Create necessary resources
        yt_video_processor = YoutubeVideoPreprocessor(mode=mode,
                                                      serializer=JsonSerializer())  # TODO: Let user select serializer

        hf_dataset = HFDataset(hf_dataset_identifier)
        videos_downloaded = hf_dataset.list_of_ids

        paths, dataset_folder = yt_video_processor.preprocess(channel_name,
                                                              num_videos,
                                                              videos_downloaded)
        nested_listed_length = math.ceil(len(paths) / NUM_THREADS)
        nested_paths = nest_list(paths, nested_listed_length)
        data_pipelines = [create_hardcoded_data_pipeline(database_name, whisper_model)
                          for _ in range(NUM_THREADS)]

        # Run pipelines in multiple threads
        threads = [ThreadedDataPipeline(data_pipeline, thread_paths)
                   for data_pipeline, thread_paths in zip(data_pipelines, nested_paths)]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        # Load everything the pipelines stored into a Dataset
        connection = sqlite3.connect(database_name)
        dataset = Dataset.from_sql(query, connection)
        # Counting the dataset rows avoids a second full read of the table.
        num_new_videos = dataset.num_rows

        if hf_dataset.exist and not hf_dataset.is_empty:
            dataset_to_upload = concatenate_datasets([hf_dataset.dataset["train"], dataset])
        else:
            dataset_to_upload = dataset

        dataset_to_upload.push_to_hub(hf_dataset_identifier, token=hf_token)
        card = DatasetCard.load(hf_dataset_identifier)
        card.data.tags = ["whisper", "whispering", whisper_model]
        card.data.task_categories = ["automatic-speech-recognition"]
        card.push_to_hub(hf_dataset_identifier, token=hf_token)
    finally:
        # Clean up the temporary artefacts even when a step above failed, so
        # a failed run does not leave db files or JSON folders behind.
        if connection is not None:
            connection.close()
        if os.path.exists(database_name):
            os.remove(database_name)
        if dataset_folder is not None:
            try:
                shutil.rmtree(dataset_folder)
            except OSError as e:
                print("Error: %s : %s" % (dataset_folder, e.strerror))

    return f"Dataset created or updated at {hf_dataset_identifier}. {num_new_videos} samples were added"
135
+
136
# Gradio front end: a form whose inputs are passed to `transcribe` and whose
# single text output shows the summary string it returns.
with gr.Blocks() as demo:
    md = """# Use Whisper to create a HF dataset from YouTube videos
    This space will let you create a HF dataset by transcribing videos from YouTube.
    Enter the name of the YouTube channel or the URL of a YouTube playlist (in the form https://www.youtube.com/playlist?list=****),
    and the repo_id of the dataset (you need a HuggingFace account).
    If the dataset already exists, it will only transcribe videos that are not in the dataset.
    If it does not exists, it will create the dataset. For using this demo, you need a
    [Hugging Face token](https://huggingface.co/settings/tokens) with write role. Learn more about [tokens](https://huggingface.co/docs/hub/security-tokens).
    """
    gr.Markdown(md)
    # Header line reporting the compute device and whether Colab was detected.
    gr.HTML(
        f"""
        <p style="margin-bottom: 10px; font-size: 94%">
        Running on <b>{device_print}</b>{(" in a <b>Google Colab</b>." if is_colab else "")}
        </p>
        """
    )

    with gr.Row():
        # Left column: all the inputs of `transcribe`.
        with gr.Column():
            whisper_model = gr.Radio([
                "tiny", "base", "small", "medium", "large"
            ], label="Whisper model", value="base")

            # The two choices are the modes handled explicitly by
            # YoutubeVideoPreprocessor.preprocess.
            mode = gr.Radio([
                "channel_name", "playlist"
            ], label="Get the videos from:", value="channel_name")
            channel_name = gr.Textbox(label="YouTube Channel or Playlist URL",
                                      placeholder="Enter the name of the YouTube channel or the URL of the playlist")
            # NOTE(review): the slider allows up to 20000 videos while the
            # command-line validator `numvideos_type` caps the value at 12 -
            # confirm the difference is intended.
            num_videos = gr.Slider(1, 20000, value=4, step=1, label="Number of videos")
            hf_token = gr.Textbox(placeholder="Your HF write access token", type="password")
            hf_dataset_identifier = gr.Textbox(label = 'Dataset Name',
                                               placeholder = "Enter in the format <username>/<repo_name>")
            submit_btn = gr.Button("Submit")

        # Right column: the message returned by `transcribe`.
        with gr.Column():
            output = gr.Text()

    # The order of `inputs` must match the positional parameters of `transcribe`.
    submit_btn.click(fn=transcribe, inputs=[mode,
                                            channel_name,
                                            num_videos,
                                            hf_token,
                                            hf_dataset_identifier,
                                            whisper_model], outputs=[output])
    gr.Markdown('''
    ![visitors](https://visitor-badge.glitch.me/badge?page_id=juancopi81.whisper-youtube-2-hf_dataset)
    ''')

# Outside Colab, requests go through a queue with a concurrency of one.
if not is_colab:
    demo.queue(concurrency_count=1)
# A public share link is requested only when running in Colab.
demo.launch(debug=True, share=is_colab)
datapipeline.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+ from pathlib import Path
3
+ from sqlite3 import Cursor
4
+
5
+ from utils import accepts_types, create_videos
6
+ from preprocessing.youtubevideopreprocessor import YoutubeVideoPreprocessor
7
+ from loading.loaderiterator import LoaderIterator
8
+ from transforming.batchtransformer import BatchTransformer
9
+ from storing.sqlitebatchvideostorer import SQLiteBatchVideoStorer
10
+ from storing.sqlitecontextmanager import SQLiteContextManager
11
+ from loading.serialization import JsonSerializer
12
+ from transforming.addtitletransform import AddTitleTransform
13
+ from transforming.adddescriptiontransform import AddDescriptionTransform
14
+ from transforming.whispertransform import WhisperTransform
15
+
16
class DataPipeline:
    """Glue object chaining the loading, transforming and storing components.

    Every batch of files goes through: load -> apply transform -> store.
    """

    def __init__(self,
                 loader_iterator: LoaderIterator,
                 batch_transformer: BatchTransformer,
                 storer: SQLiteBatchVideoStorer,
                 sqlite_context_manager: SQLiteContextManager) -> None:
        (self.loader_iterator,
         self.batch_transformer,
         self.storer,
         self.sqlite_context_manager) = (loader_iterator,
                                         batch_transformer,
                                         storer,
                                         sqlite_context_manager)

    @accepts_types(list)
    def process(self, load_paths: List[Path]) -> None:
        """Load the given files batch by batch, transform each batch and store it in the db."""
        loader = self.loader_iterator
        loader.load_paths = load_paths
        # A single cursor from the context manager serves all the batches.
        with self.sqlite_context_manager as cursor:
            for batch in loader:
                self._process_video_batch(cursor, batch)

    def _process_video_batch(self,
                             db_cursor: Cursor,
                             video_data_batch: List[Dict]) -> None:
        """Turn one batch of raw dicts into videos, transform them and store them."""
        transformed = self.batch_transformer.apply(create_videos(video_data_batch))
        self.storer.store(db_cursor, transformed)
45
+
46
def create_hardcoded_data_pipeline(db_path, whisper_model: str = "base") -> DataPipeline:
    """Build a DataPipeline wired with the default components.

    The loader reads JSON files two at a time, and the transforms add the
    title, the description and the Whisper transcription, in that order.

    TODO: Create DataPipeline so users can pass the args.
    """
    loader = LoaderIterator(JsonSerializer(), 2)
    # Whisper transform using the selected model.
    # TODO: Let user select these parameters.
    transforms = [AddTitleTransform(),
                  AddDescriptionTransform(),
                  WhisperTransform(model=whisper_model)]
    return DataPipeline(loader_iterator=loader,
                        batch_transformer=BatchTransformer(transforms),
                        storer=SQLiteBatchVideoStorer(),
                        sqlite_context_manager=SQLiteContextManager(db_path))
dataset/__init__.py ADDED
File without changes
dataset/hf_dataset.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from Eduardo Matallanas
2
+ from datasets import load_dataset, Dataset
3
+ from datasets.data_files import EmptyDatasetError
4
+
5
class HFDataset():
    """
    Create a dataset to save the transcripts from Youtube.

    After construction the instance always exposes:
    - ``dataset``: what ``load_dataset`` returned, or an empty ``Dataset``.
    - ``exist``: whether loading found the dataset.
    - ``is_empty``: whether the dataset has no data.
    - ``list_of_ids``: video ids taken from the ``URL`` column (``[]`` when empty).
    """
    def __init__(self, name) -> None:
        self.name = name
        if name != "":
            self._init_dataset()
        else:
            # No name given: behave like a dataset that does not exist yet.
            self._set_empty(exist=False)

    def _set_empty(self, exist):
        """Initialise every attribute for a dataset without data."""
        self.dataset = Dataset.from_dict({})
        self.exist = exist
        self.is_empty = True
        self.list_of_ids = []

    def _init_dataset(self):
        """Load the dataset called ``self.name`` and fill in the attributes."""
        try:
            self.dataset = load_dataset(self.name)
        except EmptyDatasetError:
            self._set_empty(exist=True)
        except FileNotFoundError:
            self._set_empty(exist=False)
        else:
            # Kept out of the ``try`` so that a failure while extracting the
            # ids is not mistaken for a missing or empty dataset.
            self.exist = True
            self.is_empty = False
            self.list_of_ids = self._get_list_of_id()

    def upload(self):
        """Push ``self.dataset`` to the Hub under ``self.name``."""
        self.dataset.push_to_hub(self.name)

    def _get_list_of_id(self):
        """Return the video ids of all splits as one flat list.

        The id is the text after the last ``=`` of each value in ``URL``.
        """
        new_ds = self.dataset.map(
            lambda x: {"ID": [url.split("=")[-1] for url in x["URL"]]}, batched=True
        )
        return [video_id for split in new_ds for video_id in new_ds[split]["ID"]]
errors.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
class DifferentNumberOfArgumentsError(Exception):
    """Error carrying a message about a mismatch in the number of arguments.

    Args:
        message: Human-readable description, also available as ``message``.
    """

    def __init__(self, message: str) -> None:
        # Forward the message to Exception so that str(), args and tracebacks
        # show it even when it is passed as a keyword argument.
        super().__init__(message)
        self.message = message
loading/__init__.py ADDED
File without changes
loading/loaderiterator.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import List, Dict, Optional
3
+
4
+ from loading.serialization import Serializer
5
+
6
class LoaderIterator:
    """Iterator that loads data from multiple files in batches.

    Each iteration loads up to ``num_files_per_iteration`` files with the
    serializer. Paths that do not exist on disk are skipped, so a batch may
    be shorter than requested. With no load paths set, the iterator is empty.
    """

    def __init__(self,
                 serializer: Serializer,
                 num_files_per_iteration: int,
                 load_paths: Optional[List[Path]] = None) -> None:
        self.serializer = serializer
        self.num_files_per_iteration = num_files_per_iteration
        self._load_paths = load_paths
        # Starts at 0 so that next() also works before iter() was called.
        self._current_iteration = 0

    @property
    def load_paths(self) -> Optional[List[Path]]:
        return self._load_paths

    @load_paths.setter
    def load_paths(self, load_paths: List[Path]) -> None:
        self._load_paths = load_paths

    def __iter__(self):
        # Restart from the first batch every time iteration begins.
        self._current_iteration = 0
        return self

    def __next__(self) -> List[Dict]:
        if self._did_load_all_batches():
            raise StopIteration
        data_batch = self._load_data_batch()
        self._current_iteration += 1
        return data_batch

    def _paths(self) -> List[Path]:
        """Return the load paths, treating an unset value (None) as empty."""
        return self._load_paths if self._load_paths is not None else []

    def _did_load_all_batches(self) -> bool:
        """Tell whether every batch has already been returned."""
        number_of_batches = len(self._paths()) / self.num_files_per_iteration
        return self._current_iteration >= number_of_batches

    def _load_data_batch(self) -> List[Dict]:
        """Load the files of the current batch, skipping missing ones."""
        start_index = self._current_iteration * self.num_files_per_iteration
        stop_index = start_index + self.num_files_per_iteration
        return [self.serializer.load(load_path) for load_path in
                self._paths()[start_index:stop_index] if load_path.exists()]
loading/serialization.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from abc import ABC, abstractmethod
3
+ from typing import Any
4
+ from pathlib import Path
5
+
6
class Serializer(ABC):
    """Interface of objects able to save an object to a file and load it back."""

    @abstractmethod
    def dump(self, obj: Any, save_path: Path) -> None:
        """Write ``obj`` to the file at ``save_path``."""

    @abstractmethod
    def load(self, load_path: Path) -> Any:
        """Return the object stored in the file at ``load_path``."""


class JsonSerializer(Serializer):
    """Serializer that stores objects as JSON text files.

    Args:
        sort_keys: Passed to ``json.dump``; write dictionary keys sorted.
        indent: Passed to ``json.dump``; indentation of the output.
    """

    def __init__(self,
                 sort_keys: bool = True,
                 indent: int = 4):
        self.sort_keys = sort_keys
        self.indent = indent

    def dump(self, obj: Any, save_path: Path) -> None:
        """Write ``obj`` as JSON to ``save_path``, replacing any existing content."""
        # Explicit UTF-8 keeps the files independent of the platform locale.
        with open(save_path, "w", encoding="utf-8") as file:
            json.dump(obj, file, sort_keys=self.sort_keys, indent=self.indent)

    def load(self, load_path: Path) -> Any:
        """Parse and return the JSON document stored at ``load_path``."""
        with open(load_path, "r", encoding="utf-8") as file:
            return json.load(file)
preprocessing/__init__.py ADDED
File without changes
preprocessing/youtubevideopreprocessor.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Generator, Tuple
2
+ from pathlib import Path
3
+ from itertools import islice
4
+
5
+ import scrapetube
6
+ from youtubesearchpython import ChannelsSearch
7
+ from pytube import Playlist
8
+
9
+ from utils import accepts_types
10
+ from loading.serialization import Serializer
11
+
12
class YoutubeVideoPreprocessor:
    """This class is responsible for creating the JSON files later loaded as
    YoutubeVideo objects, taking a channel name or a playlist URL as input.
    Each JSON file has the following information:
    - channel_name: The name of the YouTube channel (or the playlist title)
    - url: The url of the video
    Args:
        mode (`str`):
            "channel_name" to search a channel by name, "playlist" to read
            the videos of a playlist URL.
        serializer:
            Object whose `dump(obj=..., save_path=...)` writes the files.
    TODO: Change it to accept also URL of video list, name of video list, etc.
    """
    def __init__(self,
                 mode: str = "channel_name",
                 serializer = Serializer) -> None:
        # NOTE(review): the default is the abstract Serializer class itself,
        # not an instance; callers are presumably expected to always pass a
        # concrete serializer - confirm.
        self.mode = mode
        self.serializer = serializer

    def preprocess(self,
                   name: str,
                   num_videos: int,
                   videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Write one JSON file per new video and return their paths.

        Args:
            name: Channel name (mode "channel_name") or playlist URL (mode "playlist").
            num_videos: Maximum number of files to create.
            videos_in_ds: Ids of the videos to skip.

        Returns:
            The paths of the JSON files and the folder that contains them.
        """
        if self.mode == "channel_name":
            # TODO: Add credits
            channels_search = ChannelsSearch(name, limit=1)
            channel_id = channels_search.result()['result'][0]['id']
            videos = scrapetube.get_channel(channel_id=channel_id)
        elif self.mode == "playlist":
            playlist_id = self._extract_playlist_id(name)
            # From here on the files are labelled with the playlist title.
            name = Playlist(name).title
            videos = scrapetube.get_playlist(playlist_id)
        else:
            # TODO: implement this part
            youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
            test_files_folder = youtube_folder/"test/files"
            return [Path("test.json"), Path("test1.json")], test_files_folder

        return self._convert_videos_to_json_files(name,
                                                  videos,
                                                  num_videos,
                                                  videos_in_ds)

    @staticmethod
    def _extract_playlist_id(url: str) -> str:
        """Return the value of the `list` query parameter of a playlist URL.

        Falls back to the text after the last "=" when the URL has no
        `list` parameter.
        """
        from urllib.parse import parse_qs, urlparse

        list_values = parse_qs(urlparse(url).query).get("list")
        if list_values:
            # Extra parameters (e.g. "&si=...") no longer corrupt the id.
            return list_values[0]
        return url.split("=")[-1]

    def _convert_videos_to_json_files(self,
                                      name:str,
                                      videos: Generator,
                                      num_videos: int,
                                      videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Dump up to `num_videos` videos that are not in `videos_in_ds`.

        Files are named "0.json", "1.json", ... inside a folder named after
        `name` under the user's home directory.
        """
        load_paths = []
        youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
        dataset_folder = youtube_folder/name
        dataset_folder.mkdir(parents=True, exist_ok=True)

        # A set makes the "already in the dataset" check constant time.
        known_ids = set(videos_in_ds)
        exhausted = object()
        while len(load_paths) < num_videos:
            video = next(videos, exhausted)
            if video is exhausted:
                break
            if video["videoId"] in known_ids:
                continue
            save_path = dataset_folder / f"{len(load_paths)}.json"
            save_path.touch(exist_ok=True)
            video_dict = {"channel_name": name,
                          "url":f"https://www.youtube.com/watch?v={video['videoId']}"}
            self.serializer.dump(obj=video_dict, save_path=save_path)
            load_paths.append(save_path)
        return load_paths, dataset_folder
requirements.txt ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.3
2
+ aiosignal==1.2.0
3
+ anyio==3.6.2
4
+ async-timeout==4.0.2
5
+ attrs==22.1.0
6
+ certifi==2022.9.24
7
+ charset-normalizer==2.1.1
8
+ datasets
9
+ dill
10
+ ffmpeg-python==0.2.0
11
+ filelock==3.8.0
12
+ frozenlist==1.3.1
13
+ fsspec==2022.10.0
14
+ future==0.18.2
15
+ h11==0.12.0
16
+ httpcore==0.15.0
17
+ httpx==0.23.0
18
+ huggingface-hub==0.11.0
19
+ idna==3.4
20
+ iniconfig==1.1.1
21
+ more-itertools==9.0.0
22
+ multidict==6.0.2
23
+ multiprocess
24
+ numpy==1.23.4
25
+ packaging==21.3
26
+ pandas==1.5.1
27
+ pluggy==1.0.0
28
+ py==1.11.0
29
+ pyarrow==9.0.0
30
+ pydantic==1.10.2
31
+ pyparsing==3.0.9
32
+ pytest==7.1.3
33
+ python-dateutil==2.8.2
34
+ pytube==12.1.0
35
+ pytz==2022.5
36
+ PyYAML==6.0
37
+ regex==2022.9.13
38
+ requests==2.28.1
39
+ responses==0.18.0
40
+ rfc3986==1.5.0
41
+ scrapetube==2.3.1
42
+ six==1.16.0
43
+ sniffio==1.3.0
44
+ tokenizers==0.13.1
45
+ tomli==2.0.1
46
+ torch==1.12.1
47
+ tqdm==4.64.1
48
+ transformers==4.23.1
49
+ typing-extensions==4.4.0
50
+ urllib3==1.26.12
51
+ git+https://github.com/openai/whisper.git
52
+ xxhash==3.1.0
53
+ yarl==1.8.1
54
+ youtube-search-python==1.6.6
storing/__init__.py ADDED
File without changes
storing/createdb.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Simple script to create a sqlite db with a single table
2
+ called 'VIDEO'.
3
+ """
4
+
5
+ import sqlite3
6
+
7
def create_db(db_path: str) -> None:
    """Create an sqlite db with a single table called 'VIDEO'.

    Args:
        db_path: Path of the database file; created if it does not exist.

    Raises:
        sqlite3.OperationalError: If the table already exists in the db.
    """
    connection = sqlite3.connect(db_path)
    try:
        print(f"Created db successfully at '{db_path}'")
        connection.execute(
            '''
            CREATE TABLE VIDEO
            (ID INTEGER PRIMARY KEY AUTOINCREMENT,
            CHANNEL_NAME CHAR(30) NOT NULL,
            URL TEXT NOT NULL,
            TITLE CHAR(100),
            DESCRIPTION CHAR(5000),
            TRANSCRIPTION TEXT,
            SEGMENTS TEXT
            )
            '''
        )
        connection.commit()
        print("'Video' table created successfully")
    finally:
        # Release the file handle; callers delete the db file afterwards.
        connection.close()


if __name__ == "__main__":
    create_db("video.db")
storing/sqlitebatchvideostorer.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sqlite3
3
+ from typing import List, Tuple
4
+
5
+ from utils import accepts_types
6
+ from video import YoutubeVideo
7
+
8
class SQLiteBatchVideoStorer:
    """Inserts batches of video entries into a table of the db."""

    def __init__(self, table: str = "video"):
        self.table = table

    @accepts_types(sqlite3.Cursor, list)
    def store(self,
              db_cursor: sqlite3.Cursor,
              videos: List[YoutubeVideo]) -> None:
        """Insert all the given videos into ``self.table`` with one batch statement."""
        rows = self._convert_videos_to_list(videos)
        insert_statement = f"INSERT INTO {self.table}(channel_name, url, title, description, transcription, segments) VALUES(?, ?, ?, ?, ?, ?)"
        db_cursor.executemany(insert_statement, rows)

    @staticmethod
    def _convert_videos_to_list(videos: List[YoutubeVideo]) -> List[Tuple[str, str, str, str, str, str]]:
        """Return one tuple per video, with the segments encoded as JSON text.

        The ``segments`` attribute of every video is overwritten in place
        with its JSON encoding before the tuples are built.
        """
        # TODO: Find better way to solve this
        for entry in videos:
            entry.segments = json.dumps(entry.segments)
        rows = []
        for entry in videos:
            rows.append(entry.to_tuple())
        return rows
storing/sqlitecontextmanager.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+
3
class SQLiteContextManager:
    """Context manager for SQLite db, that handles
    db open / closing connection.

    Entering opens a connection to ``db_path`` and yields a cursor; leaving
    commits the pending changes and closes the connection, also when the
    body raised (the exception is then propagated).
    """

    def __init__(self, db_path: str) -> None:
        self.db_path = db_path
        self.connection = None

    def __enter__(self):
        """Establish connection with db and return cursor to be used
        to execute queries.
        """
        self.connection = sqlite3.connect(self.db_path)
        return self.connection.cursor()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Commit queries and close db connection.
        """
        try:
            self.connection.commit()
        finally:
            # Close even if the commit fails, so the connection never leaks.
            self.connection.close()
test/__init__.py ADDED
File without changes
test/files/1.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "channel_name": "The verge",
3
+ "url": "https://www.youtube.com/watch?v=YMlTSmusEmA"
4
+ }
test/files/2.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "channel_name": "The verge",
3
+ "url": "https://www.youtube.com/watch?v=Jzl0hHTc7Jw"
4
+ }
test/files/3.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "channel_name": "The verge",
3
+ "url": "https://www.youtube.com/watch?v=gV50hpSKHFQ"
4
+ }
test/files/4.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "channel_name": "The verge",
3
+ "url": "https://www.youtube.com/watch?v=N6ZyzoibXqg"
4
+ }
test/files/5.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "channel_name": "The verge",
3
+ "url": "https://www.youtube.com/watch?v=q90v9FLXi1E"
4
+ }
test/files/6.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "channel_name": "Tquotes",
3
+ "url": "https://www.youtube.com/watch?v=NSkoGZ8J1Ag"
4
+ }
test/files/7.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "channel_name": "changminjen",
3
+ "url": "https://www.youtube.com/watch?v=Ak516vtDTEA"
4
+ }
test/test_adddescriptiontransform.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from youtube_transcriber.transforming.adddescriptiontransform import AddDescriptionTransform
2
+ from youtube_transcriber.video import YoutubeVideo
3
+
4
def test_add_description_transform_init():
    """The constructor yields an object whose type is exactly AddDescriptionTransform."""
    assert type(AddDescriptionTransform()) == AddDescriptionTransform
7
+
8
def test_apply():
    """Applying the transform sets the description and keeps the other fields equal."""
    description_transform = AddDescriptionTransform()
    source_video = YoutubeVideo(channel_name="changminjen",
                                url="https://www.youtube.com/watch?v=Ak516vtDTEA")
    result = description_transform.apply(source_video)

    assert type(result) == YoutubeVideo
    for untouched in ("channel_name", "url", "title"):
        assert getattr(result, untouched) == getattr(source_video, untouched)
    assert result.description == "Anakin, my allegiance is to the Republic, to democracy! from Star Wars Episode III: Revenge of the Sith."
    for untouched in ("transcription", "segments"):
        assert getattr(result, untouched) == getattr(source_video, untouched)
test/test_addtitletransform.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from youtube_transcriber.transforming.addtitletransform import AddTitleTransform
2
+ from youtube_transcriber.video import YoutubeVideo
3
+
4
def test_add_title_transform_init():
    """The constructor yields an object whose type is exactly AddTitleTransform."""
    assert type(AddTitleTransform()) == AddTitleTransform
7
+
8
def test_apply():
    """Applying the transform sets the title and keeps the other fields equal."""
    title_transform = AddTitleTransform()
    source_video = YoutubeVideo(channel_name="Tquotes",
                                url="https://www.youtube.com/watch?v=NSkoGZ8J1Ag")
    result = title_transform.apply(source_video)

    assert type(result) == YoutubeVideo
    for untouched in ("channel_name", "url"):
        assert getattr(result, untouched) == getattr(source_video, untouched)
    assert result.title == "Steve Jobs quotes Bob Dylan"
    for untouched in ("description", "transcription", "segments"):
        assert getattr(result, untouched) == getattr(source_video, untouched)
test/test_batchtransformer.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from youtube_transcriber.transforming.addtitletransform import AddTitleTransform
4
+ from youtube_transcriber.transforming.adddescriptiontransform import AddDescriptionTransform
5
+ from youtube_transcriber.transforming.batchtransformer import BatchTransformer
6
+ from youtube_transcriber.transforming.whispertransform import WhisperTransform
7
+ from youtube_transcriber.video import YoutubeVideo
8
+
9
@pytest.fixture
def batch_transformer():
    """BatchTransformer chaining the title, description and Whisper transforms, in that order."""
    transforms = [AddTitleTransform(),
                  AddDescriptionTransform(),
                  WhisperTransform()]
    return BatchTransformer(transforms)
17
+
18
def test_batch_transform_init(batch_transformer):
    """The fixture is a BatchTransformer holding three transforms, the last one being Whisper."""
    assert type(batch_transformer) == BatchTransformer
    registered = batch_transformer.transforms
    assert len(registered) == 3
    assert type(registered[2]) == WhisperTransform
22
+
23
def test_apply_transforms(batch_transformer):
    """Run the three transforms on two videos and compare every field.

    NOTE(review): presumably this needs network access and runs the Whisper
    model; the expected transcriptions look tied to the output of one
    specific model - confirm before relying on this test in CI.
    """
    videos = [YoutubeVideo(channel_name="Tquotes",
                           url="https://www.youtube.com/watch?v=NSkoGZ8J1Ag"),
              YoutubeVideo(channel_name="changminjen",
                           url="https://www.youtube.com/watch?v=Ak516vtDTEA")]
    transformed_videos = batch_transformer.apply(videos)
    assert len(transformed_videos) == 2
    # First video: the expected description is the empty string.
    assert transformed_videos[0].channel_name == "Tquotes"
    assert transformed_videos[0].url == "https://www.youtube.com/watch?v=NSkoGZ8J1Ag"
    assert transformed_videos[0].title == "Steve Jobs quotes Bob Dylan"
    assert transformed_videos[0].description == ""
    assert transformed_videos[0].transcription == " Good morning. Good morning and welcome to Apple's 1984 annual shareholders meeting. I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan. That's Bob Dylan. Come writers and critics who prophesize with your pens and keep your eyes wide, the chance won't come again. And don't speak too soon for the wheels still in spin. And there's no telling who that it's naming. For the loser now will be later to win for the times they are a change in. Now."
    # Segments are dicts with 'start', 'end' and 'text' keys.
    assert transformed_videos[0].segments == [{'start': 0.0, 'end': 2.0, 'text': ' Good morning.'},
                                              {'start': 2.0, 'end': 11.0, 'text': " Good morning and welcome to Apple's 1984 annual shareholders meeting."},
                                              {'start': 11.0, 'end': 16.0, 'text': " I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan."},
                                              {'start': 16.0, 'end': 18.0, 'text': " That's Bob Dylan."},
                                              {'start': 18.0, 'end': 23.0, 'text': ' Come writers and critics who prophesize with your pens and keep your eyes wide,'},
                                              {'start': 23.0, 'end': 25.0, 'text': " the chance won't come again."},
                                              {'start': 25.0, 'end': 28.0, 'text': " And don't speak too soon for the wheels still in spin."},
                                              {'start': 28.0, 'end': 30.0, 'text': " And there's no telling who that it's naming."},
                                              {'start': 30.0, 'end': 36.0, 'text': ' For the loser now will be later to win for the times they are a change in.'},
                                              {'start': 36.0, 'end': 51.0, 'text': ' Now.'}]
    # Second video: title and description are both expected to be filled in.
    assert transformed_videos[1].channel_name == "changminjen"
    assert transformed_videos[1].url == "https://www.youtube.com/watch?v=Ak516vtDTEA"
    assert transformed_videos[1].title == "My allegiance is to the Republic, to democracy!"
    assert transformed_videos[1].description == "Anakin, my allegiance is to the Republic, to democracy! from Star Wars Episode III: Revenge of the Sith."
    assert transformed_videos[1].transcription == " I have brought peace, freedom, justice and security to my new empire. Your new empire don't make me kill you. Anakin, my allegiance is to the Republic, to democracy! If you're not with me, then you're my enemy. Only a Sith deals an absolute."
    assert transformed_videos[1].segments == [{'start': 0.0, 'end': 8.0, 'text': ' I have brought peace, freedom, justice and security to my new empire.'},
                                              {'start': 8.0, 'end': 14.0, 'text': " Your new empire don't make me kill you."},
                                              {'start': 14.0, 'end': 20.0, 'text': ' Anakin, my allegiance is to the Republic, to democracy!'},
                                              {'start': 20.0, 'end': 26.0, 'text': " If you're not with me, then you're my enemy."},
                                              {'start': 26.0, 'end': 31.0, 'text': ' Only a Sith deals an absolute.'}]
test/test_datapipeline.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pytest
3
+ import sqlite3
4
+ from pathlib import Path
5
+
6
+ from youtube_transcriber.datapipeline import DataPipeline
7
+ from youtube_transcriber.datapipeline import create_hardcoded_data_pipeline
8
+ from youtube_transcriber.preprocessing.youtubevideopreprocessor import YoutubeVideoPreprocessor
9
+ from youtube_transcriber.loading.loaderiterator import LoaderIterator
10
+ from youtube_transcriber.loading.serialization import JsonSerializer
11
+ from youtube_transcriber.transforming.addtitletransform import AddTitleTransform
12
+ from youtube_transcriber.transforming.adddescriptiontransform import AddDescriptionTransform
13
+ from youtube_transcriber.transforming.whispertransform import WhisperTransform
14
+ from youtube_transcriber.transforming.batchtransformer import BatchTransformer
15
+ from youtube_transcriber.storing.sqlitebatchvideostorer import SQLiteBatchVideoStorer
16
+ from youtube_transcriber.storing.sqlitecontextmanager import SQLiteContextManager
17
+ from youtube_transcriber.storing.createdb import create_db
18
+
19
@pytest.fixture
def expected_db_output():
    """Rows expected in the VIDEO table after the pipeline has run.

    Each row is (channel name, URL, title, transcription), in insertion order.
    """
    jobs_row = (
        "Tquotes",
        "https://www.youtube.com/watch?v=NSkoGZ8J1Ag",
        "Steve Jobs quotes Bob Dylan",
        " Good morning. Good morning and welcome to Apple's 1984 annual shareholders meeting. I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan. That's Bob Dylan. Come writers and critics who prophesize with your pens and keep your eyes wide, the chance won't come again. And don't speak too soon for the wheels still in spin. And there's no telling who that it's naming. For the loser now will be later to win for the times they are a change in. Now.",
    )
    sith_row = (
        "changminjen",
        "https://www.youtube.com/watch?v=Ak516vtDTEA",
        "My allegiance is to the Republic, to democracy!",
        " I have brought peace, freedom, justice and security to my new empire. Your new empire don't make me kill you. Anakin, my allegiance is to the Republic, to democracy! If you're not with me, then you're my enemy. Only a Sith deals an absolute.",
    )
    return [jobs_row, sith_row]
31
+
32
@pytest.fixture
def data_pipeline():
    """Assemble a DataPipeline that loads two JSON files per batch, applies
    the title, description and Whisper transforms, and stores into dummy.db.
    """
    # Collaborators are built inline, in the same order as before.
    return DataPipeline(
        LoaderIterator(JsonSerializer(), 2),
        BatchTransformer([
            AddTitleTransform(),
            AddDescriptionTransform(),
            WhisperTransform(),
        ]),
        SQLiteBatchVideoStorer(),
        SQLiteContextManager("dummy.db"),
    )
44
+
45
def test_datapipeline_init():
    """DataPipeline stores its four collaborators on the expected attributes."""
    data_pipeline = DataPipeline("loader_iterator",
                                 "transformer",
                                 "storer",
                                 "context")
    # isinstance is the idiomatic type check (type(x) == T trips linters).
    assert isinstance(data_pipeline, DataPipeline)
    assert data_pipeline.loader_iterator == "loader_iterator"
    assert data_pipeline.batch_transformer == "transformer"
    assert data_pipeline.storer == "storer"
    assert data_pipeline.sqlite_context_manager == "context"
55
+
56
def test_process_files(data_pipeline, expected_db_output):
    """Processing two JSON files stores the expected rows in dummy.db."""
    # Locate the fixture files next to this test module instead of under a
    # hard-coded home directory, so the test runs from any checkout location.
    test_folder = Path(__file__).resolve().parent
    files = [test_folder/"files/6.json", test_folder/"files/7.json"]
    try:
        create_db("dummy.db")
        connection = sqlite3.connect("dummy.db")
        try:
            cursor = connection.cursor()

            data_pipeline.process(files)

            cursor.execute("SELECT CHANNEL_NAME, URL, TITLE, TRANSCRIPTION FROM VIDEO")
            videos = cursor.fetchall()
        finally:
            # Release the file handle before the database file is deleted.
            connection.close()

        # Whole-list comparison also checks the row count; the previous
        # per-index loop passed when the table was empty.
        assert videos == expected_db_output
    finally:
        # Guarded removal so a failure before the file exists is not masked
        # by a FileNotFoundError raised during cleanup.
        if os.path.exists("dummy.db"):
            os.remove("dummy.db")
76
+
77
def test_process_video_batch(data_pipeline, expected_db_output):
    """Processing one in-memory batch stores the expected rows via the cursor."""
    video_data = [
        {
            "channel_name": "Tquotes",
            "url": "https://www.youtube.com/watch?v=NSkoGZ8J1Ag",
        },
        {
            "channel_name": "changminjen",
            "url": "https://www.youtube.com/watch?v=Ak516vtDTEA",
        }
    ]
    try:
        create_db("dummy.db")
        connection = sqlite3.connect("dummy.db")
        try:
            cursor = connection.cursor()

            data_pipeline._process_video_batch(cursor, video_data)

            cursor.execute("SELECT CHANNEL_NAME, URL, TITLE, TRANSCRIPTION FROM VIDEO")
            videos = cursor.fetchall()
        finally:
            # Release the file handle before the database file is deleted.
            connection.close()

        # Whole-list comparison also checks the row count; the previous
        # per-index loop passed when no rows were stored.
        assert videos == expected_db_output
    finally:
        # Guarded removal so an earlier failure is not masked by cleanup.
        if os.path.exists("dummy.db"):
            os.remove("dummy.db")
105
+
106
def test_hardcoded_data_pipeline_is_instantiated():
    """The factory function returns a DataPipeline instance."""
    data_pipeline = create_hardcoded_data_pipeline()
    assert isinstance(data_pipeline, DataPipeline)
test/test_hfdataset.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from youtube_transcriber.dataset.hf_dataset import HFDataset
4
+
5
@pytest.fixture
def hf_test_dataset():
    """HFDataset wrapper around the Whispering-GPT/test_whisper dataset."""
    return HFDataset("Whispering-GPT/test_whisper")
9
+
10
def test_hf_dataset_init(hf_test_dataset):
    """The test dataset is reported as existing and non-empty."""
    # Truthiness checks replace `== True` / `== False` comparisons.
    assert hf_test_dataset.exist
    assert not hf_test_dataset.is_empty
13
+
14
def test_get_list_of_ids(hf_test_dataset):
    """The first four video ids of the dataset match the known ids, in order."""
    expected_list = ["oTUu82C9Fxo", "Rt1rj9uZPoc", "HFyV-bKlY64", "tXQoFOepbf0"]
    list_of_ids = hf_test_dataset.list_of_ids
    for position, expected_id in enumerate(expected_list):
        assert list_of_ids[position] == expected_id
21
+
test/test_loaderiterator.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+
5
+ from youtube_transcriber.loading.loaderiterator import LoaderIterator
6
+ from youtube_transcriber.loading.serialization import JsonSerializer
7
+
8
@pytest.fixture
def loader_iterator():
    """LoaderIterator over five fixture files plus one missing path, two per batch."""
    # Resolve fixture files relative to this test module rather than a
    # hard-coded home directory, so the tests run from any checkout location.
    files_folder = Path(__file__).resolve().parent/"files"
    paths = [files_folder/"1.json", files_folder/"2.json",
             Path("non-existing-path"), files_folder/"3.json",
             files_folder/"4.json", files_folder/"5.json"]
    return LoaderIterator(JsonSerializer(), 2, paths)
15
+
16
def test_loader_iterator_init():
    """LoaderIterator keeps the serializer, batch size and paths it is given."""
    loader_iterator = LoaderIterator(JsonSerializer(), 3, "dummy_paths")
    assert isinstance(loader_iterator, LoaderIterator)
    assert isinstance(loader_iterator.serializer, JsonSerializer)
    assert loader_iterator.load_paths == "dummy_paths"
    assert loader_iterator.num_files_per_iteration == 3
22
+
23
def test_loop_through_loaded_data(loader_iterator):
    """Iterating yields the loaded files in batches, in the expected order."""
    expected_data = [
        [
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=YMlTSmusEmA"
            },
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=Jzl0hHTc7Jw"
            }
        ],
        [
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=gV50hpSKHFQ"
            }
        ],
        [
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=N6ZyzoibXqg"
            },
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=q90v9FLXi1E"
            }
        ]
    ]

    # Materialise the batches first so the batch count is checked as well;
    # the previous enumerate loop passed when the iterator yielded nothing.
    batches = list(loader_iterator)
    assert len(batches) == len(expected_data)
    for batch, expected_batch in zip(batches, expected_data):
        assert batch == expected_batch
test/test_sqlitebatchvideostorer.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from collections import OrderedDict
3
+ import sqlite3
4
+ import pytest
5
+
6
+ from youtube_transcriber.storing.createdb import create_db
7
+ from youtube_transcriber.storing.sqlitebatchvideostorer import SQLiteBatchVideoStorer
8
+ from youtube_transcriber.video import YoutubeVideo
9
+
10
@pytest.fixture
def videos():
    """Two fully populated YoutubeVideo objects used as storer input."""
    jobs_video = YoutubeVideo(
        channel_name="Tquotes",
        url="https://www.youtube.com/watch?v=NSkoGZ8J1Ag",
        title="Steve Jobs quotes Bob Dylan",
        description="",
        transcription=" Good morning. Good morning and welcome to Apple's 1984 annual shareholders meeting. I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan. That's Bob Dylan. Come writers and critics who prophesize with your pens and keep your eyes wide, the chance won't come again. And don't speak too soon for the wheels still in spin. And there's no telling who that it's naming. For the loser now will be later to win for the times they are a change in. Now.",
        segments=[OrderedDict({'start': 0.0, 'end': 2.0, 'text': ' Good morning.'})],
    )
    sith_video = YoutubeVideo(
        channel_name="changminjen",
        url="https://www.youtube.com/watch?v=Ak516vtDTEA",
        title="My allegiance is to the Republic, to democracy!",
        description="Anakin, my allegiance is to the Republic, to democracy! from Star Wars Episode III: Revenge of the Sith.",
        transcription=" I have brought peace, freedom, justice and security to my new empire. Your new empire dont make me kill you. Anakin, my allegiance is to the Republic, to democracy! If you're not with me, then you're my enemy. Only a Sith deals an absolute.",
        segments=[
            OrderedDict({'start': 0.0, 'end': 8.0, 'text': ' I have brought peace, freedom, justice and security to my new empire.'}),
            OrderedDict({'start': 8.0, 'end': 14.0, 'text': " Your new empire dont make me kill you."}),
        ],
    )
    return [jobs_video, sith_video]
21
+
22
@pytest.fixture
def expected_video_list():
    """Tuples expected from the storer for the two fixture videos.

    Each tuple is (channel name, URL, title, description, transcription,
    segments serialised as a JSON string).
    """
    jobs_row = (
        "Tquotes",
        "https://www.youtube.com/watch?v=NSkoGZ8J1Ag",
        "Steve Jobs quotes Bob Dylan",
        "",
        " Good morning. Good morning and welcome to Apple's 1984 annual shareholders meeting. I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan. That's Bob Dylan. Come writers and critics who prophesize with your pens and keep your eyes wide, the chance won't come again. And don't speak too soon for the wheels still in spin. And there's no telling who that it's naming. For the loser now will be later to win for the times they are a change in. Now.",
        '[{"start": 0.0, "end": 2.0, "text": " Good morning."}]',
    )
    sith_row = (
        "changminjen",
        "https://www.youtube.com/watch?v=Ak516vtDTEA",
        "My allegiance is to the Republic, to democracy!",
        "Anakin, my allegiance is to the Republic, to democracy! from Star Wars Episode III: Revenge of the Sith.",
        " I have brought peace, freedom, justice and security to my new empire. Your new empire dont make me kill you. Anakin, my allegiance is to the Republic, to democracy! If you're not with me, then you're my enemy. Only a Sith deals an absolute.",
        '[{"start": 0.0, "end": 8.0, "text": " I have brought peace, freedom, justice and security to my new empire."}, {"start": 8.0, "end": 14.0, "text": " Your new empire dont make me kill you."}]',
    )
    return [jobs_row, sith_row]
30
+
31
def test_sqlite_batch_video_storer_init():
    """The storer remembers the table name it was constructed with."""
    video_storer = SQLiteBatchVideoStorer("table")
    assert isinstance(video_storer, SQLiteBatchVideoStorer)
    assert video_storer.table == "table"
35
+
36
def test_convert_videos_to_list(videos, expected_video_list):
    """Videos are converted to the expected list of tuples."""
    converted = SQLiteBatchVideoStorer._convert_videos_to_list(videos)
    assert converted == expected_video_list
39
+
40
def test_videos_are_insterted_in_db(videos, expected_video_list):
    """Stored videos can be read back from the VIDEO table unchanged."""
    try:
        create_db("dummy.db")
        video_storer = SQLiteBatchVideoStorer("video")
        connection = sqlite3.connect("dummy.db")
        try:
            cursor = connection.cursor()

            video_storer.store(cursor, videos)
            cursor.execute("SELECT CHANNEL_NAME, URL, TITLE, DESCRIPTION, TRANSCRIPTION, SEGMENTS FROM VIDEO")
            # Separate name so the `videos` fixture is not shadowed.
            stored_rows = cursor.fetchall()
        finally:
            # Release the file handle before the database file is deleted.
            connection.close()

        assert stored_rows == expected_video_list
    finally:
        # Guarded removal so an earlier failure is not masked by cleanup.
        if os.path.exists("dummy.db"):
            os.remove("dummy.db")
test/test_sqlitecontextmanager.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from sqlite3 import Cursor
3
+
4
+ from youtube_transcriber.storing.sqlitecontextmanager import SQLiteContextManager
5
+
6
def test_sqlite_context_manager_init():
    """Constructing the context manager yields a SQLiteContextManager."""
    sqlite_context_manager = SQLiteContextManager("dummyinit.db")
    assert isinstance(sqlite_context_manager, SQLiteContextManager)
9
+
10
def test_enter_context_manager():
    """Entering the context manager yields a sqlite3 Cursor."""
    try:
        with SQLiteContextManager("dummy.db") as cursor:
            assert isinstance(cursor, Cursor)
    finally:
        # Clean up even when the assertion fails, so a stale dummy.db does
        # not leak into the other tests that use the same file name.
        if os.path.exists("dummy.db"):
            os.remove("dummy.db")
test/test_utils.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from youtube_transcriber.video import YoutubeVideo
2
+ from youtube_transcriber.utils import create_videos
3
+ from youtube_transcriber.utils import nest_list
4
+
5
def test_create_videos():
    """create_videos builds one YoutubeVideo per parameter dictionary."""
    video_params = [
        {'channel_name': 'MrBeast Shorts', 'url': 'https://www.youtube.com/watch?v=mJ4t7iNF86g'},
        {'channel_name': 'MrBeast Shorts', 'url': 'https://www.youtube.com/watch?v=UPhxU9J46Qk'}
    ]
    videos = create_videos(video_params)
    assert len(videos) == 2
    assert isinstance(videos[0], YoutubeVideo)
    assert videos[1].url == "https://www.youtube.com/watch?v=UPhxU9J46Qk"
14
+
15
def test_nest_list():
    """nest_list splits six items into chunks of every length from 6 down to 1."""
    items = [0, 1, 2, 3, 4, 5]
    # Chunk length -> expected nesting, checked in the same order as before.
    expected_by_length = {
        6: [[0, 1, 2, 3, 4, 5]],
        5: [[0, 1, 2, 3, 4], [5]],
        4: [[0, 1, 2, 3], [4, 5]],
        3: [[0, 1, 2], [3, 4, 5]],
        2: [[0, 1], [2, 3], [4, 5]],
        1: [[0], [1], [2], [3], [4], [5]],
    }
    for chunk_length, expected in expected_by_length.items():
        assert nest_list(items, chunk_length) == expected
test/test_video.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from youtube_transcriber.video import YoutubeVideo
4
+
5
def test_youtube_video_init():
    """A fully specified YoutubeVideo exposes every field unchanged."""
    # Single source of truth for inputs and expectations, so the two cannot
    # drift apart.
    params = {
        "channel_name": "The verge",
        "url": "https://www.youtube.com/watch?v=Jzl0hHTc7Jw",
        "title": "Pixel 7 Pro and 7 hands-on: more of the same",
        "description": "Google’s Pixel 7 and 7 Pro...",
        "transcription": " Seven years ago, we set out...",
        "segments": [{"start": 0.0, "end": 1.3, "text": " Seven years ago"},
                     {"start": 1.3, "end": 2.3, "text": " we set out..."}],
    }
    video = YoutubeVideo(**params)

    assert isinstance(video, YoutubeVideo)
    for field_name, expected_value in params.items():
        assert getattr(video, field_name) == expected_value
22
+
23
def test_youtube_video_to_tuple():
    """to_tuple returns the six fields in declaration order."""
    segments = [{"start": 0.0, "end": 1.3, "text": " Seven years ago"},
                {"start": 1.3, "end": 2.3, "text": " we set out..."}]
    video = YoutubeVideo(channel_name="The verge",
                         url="https://www.youtube.com/watch?v=Jzl0hHTc7Jw",
                         title="Pixel 7 Pro and 7 hands-on: more of the same",
                         description="Google’s Pixel 7 and 7 Pro...",
                         transcription=" Seven years ago, we set out...",
                         segments=segments)
    video_tuple = video.to_tuple()
    assert len(video_tuple) == 6
    assert isinstance(video_tuple, tuple)
    assert video_tuple[0] == "The verge"
    assert video_tuple[1] == "https://www.youtube.com/watch?v=Jzl0hHTc7Jw"
    assert video_tuple[2] == "Pixel 7 Pro and 7 hands-on: more of the same"
    assert video_tuple[3] == "Google’s Pixel 7 and 7 Pro..."
    assert video_tuple[4] == " Seven years ago, we set out..."
    assert video_tuple[5] == [{"start": 0.0, "end": 1.3, "text": " Seven years ago"},
                              {"start": 1.3, "end": 2.3, "text": " we set out..."}]
test/test_whispertransform.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from youtube_transcriber.transforming.whispertransform import WhisperTransform
2
+ from youtube_transcriber.video import YoutubeVideo
3
+
4
def test_whisper_transform_init():
    """A default WhisperTransform keeps timestamps enabled."""
    transcriber = WhisperTransform()
    assert isinstance(transcriber, WhisperTransform)
    # TODO: Check if loaded model is 'base'
    assert not transcriber.without_timestamps
9
+
10
def test_apply():
    """Transcribing the Steve Jobs clip keeps its metadata and adds the
    expected transcription and timed segments.
    """
    transcriber = WhisperTransform()
    raw_video = YoutubeVideo(channel_name="Tquotes",
                             url="https://www.youtube.com/watch?v=NSkoGZ8J1Ag")
    # (start, end, text) triples, expanded into segment dictionaries below.
    timed_phrases = [
        (0.0, 2.0, ' Good morning.'),
        (2.0, 11.0, " Good morning and welcome to Apple's 1984 annual shareholders meeting."),
        (11.0, 16.0, " I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan."),
        (16.0, 18.0, " That's Bob Dylan."),
        (18.0, 23.0, ' Come writers and critics who prophesize with your pens and keep your eyes wide,'),
        (23.0, 25.0, " the chance won't come again."),
        (25.0, 28.0, " And don't speak too soon for the wheels still in spin."),
        (28.0, 30.0, " And there's no telling who that it's naming."),
        (30.0, 36.0, ' For the loser now will be later to win for the times they are a change in.'),
        (36.0, 51.0, ' Now.'),
    ]
    expected_segments = [{'start': start, 'end': end, 'text': text}
                         for start, end, text in timed_phrases]

    transcribed_video = transcriber.apply(raw_video)

    assert type(transcribed_video) is YoutubeVideo
    assert transcribed_video.channel_name == raw_video.channel_name
    assert transcribed_video.url == raw_video.url
    assert transcribed_video.title == raw_video.title
    assert transcribed_video.description == raw_video.description
    assert transcribed_video.transcription == " Good morning. Good morning and welcome to Apple's 1984 annual shareholders meeting. I'd like to open the meeting with a part of an old poem about a 20-year-old poem by Dylan. That's Bob Dylan. Come writers and critics who prophesize with your pens and keep your eyes wide, the chance won't come again. And don't speak too soon for the wheels still in spin. And there's no telling who that it's naming. For the loser now will be later to win for the times they are a change in. Now."
    assert transcribed_video.segments == expected_segments
31
+
test/test_youtubevideopreprocessor.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import pytest
3
+ import os
4
+
5
+ from youtube_transcriber.preprocessing.youtubevideopreprocessor import YoutubeVideoPreprocessor
6
+ from youtube_transcriber.loading.serialization import JsonSerializer
7
+
8
@pytest.fixture
def youtube_video_preprocessor():
    """Run the channel-name preprocessor for two videos of 'Best Shorts Quotes'.

    Returns the (load paths, dataset folder) pair produced by preprocess().
    """
    preprocessor = YoutubeVideoPreprocessor(mode="channel_name",
                                            serializer=JsonSerializer())
    created_paths, created_folder = preprocessor.preprocess(
        name="Best Shorts Quotes",
        num_videos=2,
        videos_in_ds=["GU2_xlNCJrA"],
    )
    return created_paths, created_folder
16
+
17
@pytest.fixture
def expected_file_paths():
    """Paths of the two JSON files the preprocessor is expected to create."""
    expected_dir = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"/"Best Shorts Quotes"
    return [expected_dir/f"{index}.json" for index in range(2)]
22
+
23
@pytest.fixture
def expected_folder_path():
    """Folder the preprocessor is expected to create for the channel."""
    return Path.home()/"whisper_gpt_pipeline/youtube_transcriber"/"Best Shorts Quotes"
28
+
29
def test_youtube_video_preprocessor_init():
    """The preprocessor keeps the mode and serializer it is given."""
    yt_video_preprocessor = YoutubeVideoPreprocessor(mode="channel_name",
                                                     serializer=JsonSerializer())
    assert isinstance(yt_video_preprocessor, YoutubeVideoPreprocessor)
    assert isinstance(yt_video_preprocessor.serializer, JsonSerializer)
    assert yt_video_preprocessor.mode == "channel_name"
35
+
36
def test_created_file(youtube_video_preprocessor, expected_file_paths):
    """Running the preprocessor creates both expected JSON files."""
    # The fixture is requested for its side effect (running the preprocessor).
    # The previous version looped over the returned paths without using the
    # loop variable, so it asserted nothing when that list was empty.
    for expected_path in expected_file_paths:
        assert os.path.exists(expected_path)
41
+
42
def test_created_folder(youtube_video_preprocessor, expected_folder_path):
    """The preprocessor reports the expected dataset folder."""
    created_folder = youtube_video_preprocessor[1]
    assert created_folder == expected_folder_path
45
+
46
def test_loop_through_created_files(youtube_video_preprocessor):
    """Each created file deserialises to the expected channel/url dictionary."""
    expected_data = [
        {
            "channel_name": "Best Shorts Quotes",
            "url": "https://www.youtube.com/watch?v=GU2_xlNCJrA"
        },
        {
            "channel_name": "Best Shorts Quotes",
            "url": "https://www.youtube.com/watch?v=ttRI4EmmxkY"
        }
    ]

    paths, _ = youtube_video_preprocessor
    # One serializer is enough; it was previously rebuilt on every iteration.
    serializer = JsonSerializer()
    # Comparing whole lists also checks the number of files, so an empty
    # `paths` no longer lets the test pass without asserting anything.
    loaded_data = [serializer.load(path) for path in paths]
    assert loaded_data == expected_data
threadeddatapipeline.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import threading
2
+ import logging
3
+ from pathlib import Path
4
+ from typing import List
5
+
6
+ from datapipeline import DataPipeline
7
+
8
+ logging.basicConfig(level=logging.INFO, format="(%(threadName)-5s) %(message)s")
9
+
10
class ThreadedDataPipeline(threading.Thread):
    """Thread that runs a data pipeline over a fixed list of load paths."""

    def __init__(self,
                 data_pipeline: DataPipeline,
                 load_paths: List[Path],
                 **kwargs) -> None:
        """Store the pipeline and paths; extra keyword arguments are passed
        on to threading.Thread.
        """
        super().__init__(**kwargs)
        self.load_paths = load_paths
        self.data_pipeline = data_pipeline

    def run(self) -> None:
        """Process the stored paths, logging when the work starts and finishes."""
        pipeline, paths = self.data_pipeline, self.load_paths
        logging.info("Started processing data.")
        pipeline.process(paths)
        logging.info("Finished processing data.")
transforming/__init__.py ADDED
File without changes
transforming/adddescriptiontransform.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ from pytube import YouTube
4
+
5
+ from video import YoutubeVideo
6
+ from utils import accepts_types
7
+ from transforming.transform import Transform
8
+
9
class AddDescriptionTransform(Transform):
    """
    Transform a Video object using PyTube. Adds description to YouTube video DTO.
    It's a concrete Transform.
    """

    @accepts_types(YoutubeVideo)
    def apply(self, video: YoutubeVideo) -> YoutubeVideo:
        """Return a new YoutubeVideo equal to `video` except for its
        description, which is read from YouTube via PyTube.
        """
        yt = YouTube(video.url)

        video_with_description_params = {
            "channel_name": video.channel_name,
            "url": video.url,
            "title": video.title,
            "description": self._get_video_description(yt),
            "transcription": video.transcription,
            "segments": video.segments
        }

        return YoutubeVideo(**video_with_description_params)

    def _get_video_description(self, yt: Any) -> str:
        """Return the description exposed by the PyTube object, as a string."""
        return str(yt.description)
transforming/addtitletransform.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ from pytube import YouTube
4
+
5
+ from video import YoutubeVideo
6
+ from utils import accepts_types
7
+ from transforming.transform import Transform
8
+
9
class AddTitleTransform(Transform):
    """
    Concrete Transform that fills in the title of a YouTube video DTO,
    reading it from YouTube through PyTube.
    """

    @accepts_types(YoutubeVideo)
    def apply(self, video: YoutubeVideo) -> YoutubeVideo:
        """Return a new YoutubeVideo equal to `video` except for its title."""
        yt = YouTube(video.url)
        return YoutubeVideo(channel_name=video.channel_name,
                            url=video.url,
                            title=self._get_video_title(yt),
                            description=video.description,
                            transcription=video.transcription,
                            segments=video.segments)

    def _get_video_title(self, yt: Any) -> str:
        """Return the title exposed by the PyTube object, as a string."""
        title = yt.title
        return str(title)
transforming/batchtransformer.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from video import YoutubeVideo
4
+ from transforming.transform import Transform
5
+ from utils import accepts_types
6
+
7
class BatchTransformer:
    """Applies a sequence of transforms, in order, to a list of YouTube videos."""

    def __init__(self, transforms: List[Transform]) -> None:
        self._transforms = transforms

    @property
    def transforms(self) -> List[Transform]:
        """Transforms applied by `apply`, in application order."""
        return self._transforms

    @transforms.setter
    def transforms(self, transforms: List[Transform]) -> None:
        self._transforms = transforms

    @accepts_types(list)
    def apply(self, videos: List[YoutubeVideo]) -> List[YoutubeVideo]:
        """Run every transform over the whole batch and return the result.

        Each transform is applied to all videos before the next one starts.
        """
        transformed = videos
        for step in self._transforms:
            apply_step = step.apply
            transformed = [apply_step(video) for video in transformed]
        return transformed
transforming/transform.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
+ from video import YoutubeVideo
4
+
5
class Transform(ABC):
    """Abstract base for transforms that turn one video object into another."""

    @abstractmethod
    def apply(self, video: YoutubeVideo) -> YoutubeVideo:
        """Apply this transform to `video`.

        Concrete transforms must implement this method.
        """
transforming/whispertransform.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Any
4
+ from collections import OrderedDict
5
+
6
+ from pytube import YouTube
7
+ import whisper
8
+
9
+ from transforming.transform import Transform
10
+ from video import YoutubeVideo
11
+ from utils import accepts_types
12
+
13
class WhisperTransform(Transform):
    """
    Transform a Video object using Whisper model. It's a
    concrete Transform.
    Args:
        model (`str`):
            Size of Whisper model. Can be tiny, base (default), small, medium, and large.
        without_timestamps (`bool`, defaults to `False`):
            To add phrase-level timestamps.
    """

    def __init__(self, model: str="base", without_timestamps: bool=False) -> None:
        self.model = whisper.load_model(model)
        self.without_timestamps = without_timestamps

    @accepts_types(YoutubeVideo)
    def apply(self, video: YoutubeVideo) -> YoutubeVideo:
        """Creates a new video with transcriptions created by Whisper.

        The audio track is downloaded to a temporary file in the current
        directory, transcribed, and the file is removed afterwards.

        :raise Exception: Whatever the download raised; it is printed and
        re-raised, because no transcription is possible without the audio.
        """
        # Create a YouTube object
        yt = YouTube(video.url)

        # Get audio from video
        try:
            audio_file = self._get_audio_from_video(yt)
        except Exception as e:
            print(f"Exception: {e}")
            # Previously execution continued and failed with an unbound
            # `audio_file`; re-raise so the real cause is reported.
            raise

        try:
            result = self.model.transcribe(audio_file,
                                           without_timestamps=self.without_timestamps)
        finally:
            # Remove the downloaded audio even when transcription fails.
            os.remove(audio_file)

        transcription = result["text"]

        data = [OrderedDict({'start': seg['start'], 'end': seg['end'], 'text': seg['text']})
                for seg in result['segments']]

        return YoutubeVideo(channel_name = video.channel_name,
                            url = video.url,
                            title = video.title,
                            description = video.description,
                            transcription = transcription,
                            segments = data)

    def _get_audio_from_video(self, yt: Any) -> str:
        """Download the first audio-only stream and return its path.

        The downloaded file is renamed to carry an `.mp3` extension; only the
        name changes, the content is not re-encoded.

        :raise ValueError: If the video exposes no audio-only stream
        """
        # TODO: Add credits
        video = yt.streams.filter(only_audio=True).first()
        if video is None:
            raise ValueError("No audio-only stream available for this video")
        out_file = video.download(output_path=".")
        base, _ = os.path.splitext(out_file)
        new_file = base + ".mp3"
        os.rename(out_file, new_file)
        return new_file
utils.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ from video import YoutubeVideo
4
+ from errors import DifferentNumberOfArgumentsError
5
+
6
def accepts_types(*expected_types):
    """Decorator that checks that the arguments of a method are valid.

    The first positional argument (``self``) is skipped; every remaining
    positional argument is checked against the type at the same position in
    ``expected_types``. Keyword arguments are not inspected.

    :raise TypeError: If type of argument isn't valid
    :raise DifferentNumberOfArgumentsError: If number of arguments passed to the
    decorator and to the method (minus self) aren't the same
    """
    # Imported locally so this function does not depend on a module-level import.
    import functools

    def check_types(func):
        # functools.wraps keeps the wrapped method's __name__ and __doc__,
        # which a bare wrapper would replace with its own.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            args_without_self = args[1:]
            _raise_error_if_number_of_passed_and_expected_arguments_dont_match(args_without_self, expected_types)
            _raise_type_error_if_passed_and_expected_types_dont_match(args_without_self, expected_types)
            return func(*args, **kwargs)
        return wrapper
    return check_types
20
+
21
+ def _raise_error_if_number_of_passed_and_expected_arguments_dont_match(passed_args, expected_types):
22
+ if len(passed_args) != len(expected_types):
23
+ msg = "Number of arguments passed in decorator " \
24
+ f"{len(expected_types)} doesn't match with number of " \
25
+ f"arguments in method, i.e., {len(passed_args)}"
26
+ raise DifferentNumberOfArgumentsError(msg)
27
+
28
+ def _raise_type_error_if_passed_and_expected_types_dont_match(passed_args, expected_types):
29
+ for (arg, expected_type) in zip(passed_args, expected_types):
30
+ if not isinstance(arg, expected_type):
31
+ raise TypeError(f"Argument '{arg}' is of type {type(arg)}. "
32
+ f"'{expected_type}' expected instead")
33
+
34
def create_videos(video_parameters: List[Dict]) -> List[YoutubeVideo]:
    """Build one YoutubeVideo per parameter dictionary.

    Only the "channel_name" and "url" entries of each dictionary are used.
    """
    return [
        YoutubeVideo(channel_name=params["channel_name"], url=params["url"])
        for params in video_parameters
    ]
44
+
45
def nest_list(list: list, nested_list_length: int) -> List[List]:
    """Split `list` into consecutive chunks of `nested_list_length` items.

    The last chunk holds the remaining items when the split is uneven. A
    length that is never reached (for example zero) yields a single chunk
    holding every item; an empty input yields an empty list.
    """
    chunks = []
    current_chunk = []
    for element in list:
        current_chunk.append(element)
        if len(current_chunk) != nested_list_length:
            continue
        chunks.append(current_chunk)
        current_chunk = []
    # Keep the trailing, partially filled chunk.
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
56
+
57
def is_google_colab():
    """Return True when the `google.colab` package can be imported, else False."""
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
        return True
    except ImportError:
        # A bare `except:` would also swallow KeyboardInterrupt and SystemExit.
        return False
video.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional, Tuple
2
+
3
+ from pydantic import BaseModel
4
+
5
class YoutubeVideo(BaseModel):
    """This class represents a YouTube video entry.

    Only `channel_name` and `url` are required; the remaining fields default
    to None.
    """
    channel_name: str
    url: str
    # Explicit None defaults: pydantic v1 infers them for Optional fields, but
    # pydantic v2 treats a bare Optional annotation as a required field.
    title: Optional[str] = None
    description: Optional[str] = None
    transcription: Optional[str] = None
    segments: Optional[List[Dict]] = None

    def to_tuple(self) -> Tuple:
        """Convert YoutubeVideo object to a tuple of the type:
        (channel_name, url, title, description, transcription, segments).
        """
        return (self.channel_name, self.url, self.title,
                self.description, self.transcription, self.segments)