"""Build ``phonemes_*`` dataset lists and report maximum sequence lengths.

For every list file named in ``datasets_filenames`` (read from
``<base>/data``), each listed video is checked as follows:

* its frame count is the number of ``.jpg`` files in
  ``<images_dir>/<speaker>/<basename>``;
* its sentence is loaded from ``<anno_dir>/<speaker>/<basename>.align``
  via ``GridDataset.load_sentence``; entries whose alignment file is
  missing are skipped;
* the sentence is converted with ``GridDataset.text_to_phonemes``.

Entries that were not skipped are written, one per line, to
``<base>/data/phonemes_<list name>`` as
``<options.video_dir>/<speaker>/<basename>.mpg``. The largest frame, text
and phoneme lengths seen across all lists are printed at the end.
"""
import os
import options
import pronouncing  # not referenced below; kept since the import may be relied on
from tqdm.auto import tqdm
from typing import List
from dataset import GridDataset

base = os.path.abspath('..')
anno_dir = os.path.join(base, options.alignments_dir)
phonemes_dir = os.path.join(base, options.phonemes_dir)
images_dir = os.path.join(base, options.images_dir)

datasets_filenames = ['overlap_train.txt', 'overlap_val.txt']

# Running maxima across every processed entry of every dataset list.
max_vid_len = 0
max_text_len = 0
max_phonemes_len = 0

for datasets_filename in datasets_filenames:
    datasets_filepath = os.path.join(base, 'data', datasets_filename)
    new_datasets_filepath = os.path.join(
        base, 'data', 'phonemes_' + datasets_filename
    )

    # Read the whole list up front (and close the file promptly) so that
    # tqdm receives a sized list and can show total progress.
    with open(datasets_filepath, 'r') as datasets_file:
        video_filepaths = datasets_file.readlines()

    valid_filepaths = []

    for video_filepath in tqdm(video_filepaths):
        video_filepath = video_filepath.strip()
        basename = os.path.basename(video_filepath)
        # The first path component is used as the speaker directory name.
        parts = video_filepath.split('/')
        speaker_dirname = parts[0]

        align_file = os.path.join(
            anno_dir, speaker_dirname, f'{basename}.align'
        )
        vid_images_dir = os.path.join(
            images_dir, speaker_dirname, basename
        )
        new_video_filepath = os.path.join(
            options.video_dir, speaker_dirname, f'{basename}.mpg'
        )

        # Video length is measured as the number of extracted .jpg frames.
        # NOTE: a missing frames directory raises here and is NOT skipped;
        # only a missing alignment file (below) is tolerated.
        image_filenames = [
            filename for filename in os.listdir(vid_images_dir)
            if filename.endswith('.jpg')
        ]
        vid_len = len(image_filenames)

        try:
            sentence: List[str] = GridDataset.load_sentence(
                align_file, char_map=options.text_char_map
            )
        except FileNotFoundError:
            # No alignment for this video: leave it out of the new list.
            continue

        text_len = len(sentence)
        sentence_str = ''.join(sentence)
        phonemes_sentence = GridDataset.text_to_phonemes(
            sentence_str, as_str=False
        )
        phonemes_len = len(phonemes_sentence)

        max_vid_len = max(vid_len, max_vid_len)
        max_text_len = max(text_len, max_text_len)
        max_phonemes_len = max(phonemes_len, max_phonemes_len)

        # Sanity check on the running maxima (not on this single sample):
        # the longest video must exceed twice the longest text and twice
        # the longest phoneme sequence seen so far.
        assert (
            (max_vid_len > 2 * max_text_len) and
            (max_vid_len > 2 * max_phonemes_len)
        ), (
            f'length check failed at {video_filepath!r}: '
            f'max_vid_len={max_vid_len}, max_text_len={max_text_len}, '
            f'max_phonemes_len={max_phonemes_len}'
        )
        valid_filepaths.append(new_video_filepath)

    # Use a context manager so the output is flushed and closed before the
    # "written" message is printed.
    with open(new_datasets_filepath, 'w') as new_datasets_file:
        new_datasets_file.write('\n'.join(valid_filepaths))

    print('new valid filepaths written to:', new_datasets_filepath)

print('MAX_VID_LEN', max_vid_len)
print('MAX_TEXT_LEN', max_text_len)
print('MAX_PHONEMES_LEN', max_phonemes_len)