whisper-youtube-2-hf_dataset / test /test_youtubevideopreprocessor.py
juancopi81's picture
Duplicate from Whispering-GPT/whisper-youtube-2-hf_dataset
7288748
from pathlib import Path
import pytest
import os
from youtube_transcriber.preprocessing.youtubevideopreprocessor import YoutubeVideoPreprocessor
from youtube_transcriber.loading.serialization import JsonSerializer
@pytest.fixture
def youtube_video_preprocessor():
    """Run the preprocessor for one channel and return what it produced.

    A ``YoutubeVideoPreprocessor`` is built in ``"channel_name"`` mode with a
    ``JsonSerializer`` and ``preprocess`` is called for the
    "Best Shorts Quotes" channel.

    Returns:
        The ``(load_paths, dataset_folder)`` pair unpacked from the value
        returned by ``preprocess``.
    """
    preprocessor = YoutubeVideoPreprocessor(
        mode="channel_name",
        serializer=JsonSerializer(),
    )
    preprocess_result = preprocessor.preprocess(
        name="Best Shorts Quotes",
        num_videos=2,
        videos_in_ds=["GU2_xlNCJrA"],
    )
    # Unpack explicitly so a result that is not a 2-item sequence fails here,
    # in the fixture, rather than inside a test.
    load_paths, dataset_folder = preprocess_result
    return load_paths, dataset_folder
@pytest.fixture
def expected_file_paths():
    """Return the two JSON file paths the tests expect to find.

    The paths are ``0.json`` and ``1.json`` inside the
    "Best Shorts Quotes" directory under
    ``~/whisper_gpt_pipeline/youtube_transcriber``.
    """
    channel_dir = Path.home().joinpath(
        "whisper_gpt_pipeline/youtube_transcriber",
        "Best Shorts Quotes",
    )
    return [channel_dir.joinpath(file_name) for file_name in ("0.json", "1.json")]
@pytest.fixture
def expected_folder_path():
    """Return the directory the tests expect the dataset folder to be.

    This is the "Best Shorts Quotes" directory under
    ``~/whisper_gpt_pipeline/youtube_transcriber``.
    """
    return (
        Path.home()
        .joinpath("whisper_gpt_pipeline/youtube_transcriber")
        .joinpath("Best Shorts Quotes")
    )
def test_youtube_video_preprocessor_init():
    """Check that the constructor keeps the mode and serializer it was given."""
    yt_video_preprocessor = YoutubeVideoPreprocessor(mode="channel_name",
                                                     serializer=JsonSerializer())
    # isinstance() is the idiomatic type check; comparing type() objects with
    # == is flagged by linters (pycodestyle E721).
    assert isinstance(yt_video_preprocessor, YoutubeVideoPreprocessor)
    assert isinstance(yt_video_preprocessor.serializer, JsonSerializer)
    assert yt_video_preprocessor.mode == "channel_name"
def test_created_file(youtube_video_preprocessor, expected_file_paths):
    """Check that every expected JSON file exists after preprocessing.

    ``youtube_video_preprocessor`` is requested so that the fixture calls
    ``preprocess`` before the filesystem is inspected; its return value is
    not needed here.
    """
    # Iterate over the expected paths rather than the returned ones, so each
    # file is checked exactly once and the check still runs when the
    # preprocessor returns no paths at all.
    for expected_path in expected_file_paths:
        assert os.path.exists(expected_path), f"missing file: {expected_path}"
def test_created_folder(youtube_video_preprocessor, expected_folder_path):
    """Check that the dataset folder returned by the fixture is the expected one."""
    # Only the second item of the fixture's pair is relevant to this test.
    _load_paths, dataset_folder = youtube_video_preprocessor
    assert dataset_folder == expected_folder_path
def test_loop_through_created_files(youtube_video_preprocessor):
    """Check that each returned file loads to the expected record.

    The i-th path returned by the fixture is loaded with ``JsonSerializer``
    and compared with the i-th entry of ``expected_data``.
    """
    expected_data = [
        {
            "channel_name": "Best Shorts Quotes",
            "url": "https://www.youtube.com/watch?v=GU2_xlNCJrA"
        },
        {
            "channel_name": "Best Shorts Quotes",
            "url": "https://www.youtube.com/watch?v=ttRI4EmmxkY"
        }
    ]
    # The dataset folder is not used by this test.
    paths, _ = youtube_video_preprocessor
    # Build the serializer once instead of once per file.
    # NOTE(review): assumes JsonSerializer.load keeps no per-file state - confirm.
    serializer = JsonSerializer()
    for i, path in enumerate(paths):
        assert serializer.load(path) == expected_data[i]