# whisper-youtube-2-hf_dataset/test/test_loaderiterator.py
# juancopi81
# Duplicate from Whispering-GPT/whisper-youtube-2-hf_dataset
# 7288748
from pathlib import Path
import pytest
from youtube_transcriber.loading.loaderiterator import LoaderIterator
from youtube_transcriber.loading.serialization import JsonSerializer
@pytest.fixture
def loader_iterator():
    """Provide a LoaderIterator over five JSON files plus one missing path.

    The iterator is built with a JsonSerializer and a per-iteration file
    count of 2. The third entry of the path list deliberately points at a
    path that does not exist.
    """
    base_dir = Path.home() / "whisper_gpt_pipeline/youtube_transcriber/test"
    json_paths = [
        base_dir / "files/1.json",
        base_dir / "files/2.json",
        Path("non-existing-path"),
        base_dir / "files/3.json",
        base_dir / "files/4.json",
        base_dir / "files/5.json",
    ]
    serializer = JsonSerializer()
    return LoaderIterator(serializer, 2, json_paths)
def test_loader_iterator_init():
    """Check that LoaderIterator exposes its constructor arguments.

    The paths argument is a placeholder string; this test only verifies
    that it is stored and returned unchanged via ``load_paths``.
    """
    loader_iterator = LoaderIterator(JsonSerializer(), 3, "dummy_paths")

    # isinstance is the idiomatic type check and keeps the test valid
    # should either class later be subclassed.
    assert isinstance(loader_iterator, LoaderIterator)
    assert isinstance(loader_iterator.serializer, JsonSerializer)
    assert loader_iterator.load_paths == "dummy_paths"
    assert loader_iterator.num_files_per_iteration == 3
def test_loop_through_loaded_data(loader_iterator):
    """Check that iterating the loader yields exactly the expected batches.

    Args:
        loader_iterator: the iterator under test, supplied by the
            ``loader_iterator`` fixture.

    The expected result is three batches of 2, 1 and 2 records.
    NOTE(review): the single-record middle batch presumably corresponds to
    the fixture's non-existent path being skipped -- confirm against
    LoaderIterator.
    """
    expected_data = [
        [
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=YMlTSmusEmA",
            },
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=Jzl0hHTc7Jw",
            },
        ],
        [
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=gV50hpSKHFQ",
            },
        ],
        [
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=N6ZyzoibXqg",
            },
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=q90v9FLXi1E",
            },
        ],
    ]

    # Compare the full sequence of batches at once: a per-batch loop would
    # pass vacuously if the iterator yielded nothing or stopped early.
    assert list(loader_iterator) == expected_data