"""Tests for ``LoaderIterator`` combined with ``JsonSerializer``."""

from pathlib import Path

import pytest

from youtube_transcriber.loading.loaderiterator import LoaderIterator
from youtube_transcriber.loading.serialization import JsonSerializer


@pytest.fixture
def loader_iterator():
    """Return a ``LoaderIterator`` over five JSON fixtures and one bad path.

    The iterator is built with ``JsonSerializer`` and two files per
    iteration. A non-existing path is deliberately placed in the middle of
    the list so the tests exercise how a missing file is handled.
    """
    # NOTE(review): the fixture files are looked up under the current user's
    # home directory, so this only works when the repository is checked out
    # at ~/whisper_gpt_pipeline -- confirm this is intended.
    test_folder = Path.home() / "whisper_gpt_pipeline/youtube_transcriber/test"
    files_dir = test_folder / "files"
    paths = [
        files_dir / "1.json",
        files_dir / "2.json",
        Path("non-existing-path"),
        files_dir / "3.json",
        files_dir / "4.json",
        files_dir / "5.json",
    ]
    return LoaderIterator(JsonSerializer(), 2, paths)


def test_loader_iterator_init():
    """The constructor stores serializer, batch size and paths unchanged."""
    loader_iterator = LoaderIterator(JsonSerializer(), 3, "dummy_paths")

    assert isinstance(loader_iterator, LoaderIterator)
    assert isinstance(loader_iterator.serializer, JsonSerializer)
    assert loader_iterator.load_paths == "dummy_paths"
    assert loader_iterator.num_files_per_iteration == 3


def test_loop_through_loaded_data(loader_iterator):
    """Iterating yields the expected batches, in order, and nothing else.

    With two paths per iteration, the second batch covers the non-existing
    path plus ``3.json``; the expected data has a single record there, i.e.
    the missing file contributes nothing to its batch.
    """
    expected_data = [
        [
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=YMlTSmusEmA",
            },
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=Jzl0hHTc7Jw",
            },
        ],
        [
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=gV50hpSKHFQ",
            },
        ],
        [
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=N6ZyzoibXqg",
            },
            {
                "channel_name": "The verge",
                "url": "https://www.youtube.com/watch?v=q90v9FLXi1E",
            },
        ],
    ]

    # Compare the complete sequence of batches: a per-item loop over the
    # iterator would pass vacuously if it yielded no (or too few) batches.
    assert list(loader_iterator) == expected_data