"""Preprocess the GRID corpus for CTC phoneme training.

Pass 1 collects every (speaker, alignment-file) pair under the alignments
directory. Pass 2 converts each sentence to ARPAbet phonemes with the
`pronouncing` library, writes one phoneme file per utterance, and keeps the
pairs whose extracted video is long enough for CTC decoding
(vid_len > CTC_SCALE * number of phoneme tokens). The list of valid pairs is
written to ../data/<dataset>-CTC<scale>-valid-pairs.txt.
"""
import sys

sys.path.append('..')  # make project-root modules importable before importing them

import os.path

import pronouncing
from tqdm.auto import tqdm

import options
import options as opt  # kept: original file imported both aliases
from Loader import GridLoader  # NOTE(review): appears unused here — confirm before removing
from dataset import GridDataset
from typing import List

# Alignment file extensions we accept, and pronunciations we cannot use.
VALID_FILE_EXT = ('.txt', '.align')
EXCLUDED_PHONEMES = ('foreign', 'french')
MAX_VID_LEN = 100  # frames; videos at or above this length are rejected
CTC_SCALE = 2      # CTC requires input length > CTC_SCALE * target length

base = os.path.abspath('..')
anno_dir = os.path.join(base, options.alignments_dir)
phonemes_dir = os.path.join(base, options.phonemes_dir)
images_dir = os.path.join(base, options.images_dir)

speaker_dirnames = sorted(os.listdir(anno_dir))
valid_sentence_pairs = []
sentence_pairs = []

# Pass 1: enumerate every (speaker, alignment filename) pair on disk.
for speaker_dirname in tqdm(speaker_dirnames):
    speaker_dir = os.path.join(anno_dir, speaker_dirname)
    for filename in os.listdir(speaker_dir):
        _, ext = os.path.splitext(filename)
        if ext not in VALID_FILE_EXT:
            continue
        sentence_pairs.append((speaker_dirname, filename))

sentence_pairs = sorted(sentence_pairs)
pbar = tqdm(sentence_pairs)

pairs_without_phonemes = 0  # pairs dropped: unknown word or excluded phoneme
max_valid_vid_len = 0
max_valid_phonemes_len = 0
unique_phonemes = set()
valid_unique_phonemes = set()
unique_text_chars = set()
unique_words = set()
valid_unique_words = set()

# Pass 2: phonemize each sentence and validate the matching video length.
for sentence_pair in pbar:
    speaker_dirname, filename = sentence_pair
    basename, _ = os.path.splitext(filename)
    align_file = os.path.join(anno_dir, speaker_dirname, filename)

    # Count the extracted .jpg frames for this utterance's video.
    vid_images_dir = os.path.join(images_dir, speaker_dirname, basename)
    image_filenames = [
        image_filename
        for image_filename in os.listdir(vid_images_dir)
        if image_filename.endswith('.jpg')
    ]
    vid_len = len(image_filenames)

    phonemes_speaker_dir = os.path.join(phonemes_dir, speaker_dirname)
    # (fix) race-safe replacement for exists() + mkdir()
    os.makedirs(phonemes_speaker_dir, exist_ok=True)
    phonemes_file = os.path.join(phonemes_dir, speaker_dirname, filename)

    sentence: List[str] = GridDataset.load_sentence(
        align_file, char_map=opt.text_char_map
    )
    sentence_str = ''.join(sentence)
    sentence_words = sentence_str.split(' ')
    unique_text_chars.update(sentence_str)

    sentence_phonemes = []       # list of per-word phoneme lists
    flat_sentence_phonemes = []  # phoneme tokens with ' ' separators between words
    has_valid_phonemes = True

    for word in sentence_words:
        # (fix) call phones_for_word once per word instead of twice
        pronunciations = pronouncing.phones_for_word(word)
        if not pronunciations:
            pbar.desc = f'NO-PHONEMES: {word} [{pairs_without_phonemes}]'
            has_valid_phonemes = False
            pairs_without_phonemes += 1
            break
        # Use the first (most common) pronunciation.
        phonemes = pronunciations[0].split(' ')
        assert len(phonemes) > 0
        for phoneme in phonemes:
            if phoneme in EXCLUDED_PHONEMES:
                has_valid_phonemes = False
                pairs_without_phonemes += 1
                break
            unique_phonemes.add(phoneme)
        if not has_valid_phonemes:
            break
        sentence_phonemes.append(phonemes)
        flat_sentence_phonemes.extend(phonemes)
        flat_sentence_phonemes.append(' ')  # word-separator token
        unique_words.add(word)

    if not has_valid_phonemes:
        continue

    # Drop the trailing word separator appended after the last word.
    if flat_sentence_phonemes[-1] == ' ':
        flat_sentence_phonemes = flat_sentence_phonemes[:-1]

    # A pair is usable when the video exists, is below the length cap, and
    # is long enough for CTC alignment against the phoneme target.
    is_valid_video = (
        (vid_len > 0) and
        (vid_len < MAX_VID_LEN) and
        (vid_len > CTC_SCALE * len(flat_sentence_phonemes)) and
        has_valid_phonemes
    )

    if is_valid_video:
        valid_sentence_pairs.append(sentence_pair)
        num_flat_phonemes = len(flat_sentence_phonemes)
        max_valid_vid_len = max(max_valid_vid_len, vid_len)
        max_valid_phonemes_len = max(max_valid_phonemes_len, num_flat_phonemes)
        valid_unique_words.update(sentence_words)
        for phonemes in sentence_phonemes:
            valid_unique_phonemes.update(phonemes)

    # Persist one line of space-joined phonemes per word.
    # NOTE(review): placed at loop level so every phonemizable pair gets a
    # file, not only valid ones — the original's collapsed indentation is
    # ambiguous here; confirm against version history.
    raw_phonemes = '\n'.join(
        ' '.join(phonemes) for phonemes in sentence_phonemes
    )
    if not os.path.exists(phonemes_file):
        # (fix) context manager so the handle is closed deterministically
        with open(phonemes_file, 'w') as f:
            f.write(raw_phonemes)

# Emit the "speaker/basename" list consumed by the training pipeline.
valid_pair_dirs = []
for speaker_dirname, filename in valid_sentence_pairs:
    basename, _ = os.path.splitext(filename)
    valid_pair_dirs.append(f'{speaker_dirname}/{basename}')

with open(f'../data/{opt.dataset}-CTC{CTC_SCALE}-valid-pairs.txt', 'w') as f:
    f.write('\n'.join(valid_pair_dirs))

print('VALID PAIRS', len(valid_pair_dirs))
print('VALID UNIQUE WORDS', valid_unique_words)
print('PAIRS W/O PHONEMES', pairs_without_phonemes)
print('UNIQUE PHONEMES', sorted(unique_phonemes))
print('VALID UNIQUE PHONEMES', sorted(valid_unique_phonemes))
print('UNIQUE CHARS', sorted(unique_text_chars))
print('MAX VALID PHONEMES LEN', max_valid_phonemes_len)
print('MAX VALID VID LEN', max_valid_vid_len)
print('>>>')