File size: 2,639 Bytes
df07554 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import os
import options
import pronouncing
from tqdm.auto import tqdm
from typing import List
from dataset import GridDataset
# Root directory for all data: the parent of the current working directory.
base = os.path.abspath('..')

# Directories named in `options`, each joined onto the root.
anno_dir, phonemes_dir, images_dir = (
    os.path.join(base, configured_dir)
    for configured_dir in (
        options.alignments_dir,
        options.phonemes_dir,
        options.images_dir,
    )
)

# Listing files (looked up under <base>/data) that will be processed.
datasets_filenames = ['overlap_train.txt', 'overlap_val.txt']

# Running maxima, updated while the listings are scanned.
max_vid_len = max_text_len = max_phonemes_len = 0
# For every listing file: read the video paths it contains, keep the samples
# whose sentence can be loaded, update the running maximum video / text /
# phoneme lengths, and write the kept samples (as .mpg paths) to a sibling
# listing whose name is prefixed with 'phonemes_'.
for datasets_filename in datasets_filenames:
    datasets_filepath = os.path.join(base, 'data', datasets_filename)
    new_datasets_filepath = os.path.join(
        base, 'data', 'phonemes_' + datasets_filename
    )

    # Use a context manager so the handle is closed deterministically rather
    # than whenever the garbage collector gets to it.
    with open(datasets_filepath, 'r') as listing_file:
        video_filepaths = listing_file.readlines()

    valid_filepaths = []
    for video_filepath in tqdm(video_filepaths):
        video_filepath = video_filepath.strip()
        basename = os.path.basename(video_filepath)
        # The first '/'-separated component is used as the speaker directory.
        parts = video_filepath.split('/')
        speaker_dirname = parts[0]

        align_file = os.path.join(
            anno_dir, speaker_dirname, f'{basename}.align'
        )
        vid_images_dir = os.path.join(
            images_dir, speaker_dirname, basename
        )
        new_video_filepath = os.path.join(
            options.video_dir, speaker_dirname, f'{basename}.mpg'
        )

        # Video length is the number of .jpg files in the sample's image
        # directory.
        image_filenames = [
            filename for filename in os.listdir(vid_images_dir)
            if filename.endswith('.jpg')
        ]
        vid_len = len(image_filenames)

        # Samples for which loading raises FileNotFoundError (e.g. a missing
        # .align file) are skipped and never reach the output listing.
        try:
            sentence: List[str] = GridDataset.load_sentence(
                align_file, char_map=options.text_char_map
            )
        except FileNotFoundError:
            continue

        text_len = len(sentence)
        sentence_str = ''.join(sentence)
        phonemes_sentence = GridDataset.text_to_phonemes(
            sentence_str, as_str=False
        )
        phonemes_len = len(phonemes_sentence)

        max_vid_len = max(vid_len, max_vid_len)
        max_text_len = max(text_len, max_text_len)
        max_phonemes_len = max(phonemes_len, max_phonemes_len)

        # NOTE(review): this compares the running maxima, not the current
        # sample's own lengths, so a single short video with a long
        # transcript would not trip it -- confirm that is the intent.
        assert (
            (max_vid_len > 2 * max_text_len) and
            (max_vid_len > 2 * max_phonemes_len)
        )
        valid_filepaths.append(new_video_filepath)

    # Closing the file before printing guarantees the listing is fully
    # flushed to disk by the time the message below is shown.
    with open(new_datasets_filepath, 'w') as output_file:
        output_file.write('\n'.join(valid_filepaths))
    print('new valid filepaths written to:', new_datasets_filepath)
# Report the largest video, text and phoneme lengths recorded during the scan.
print('MAX_VID_LEN', max_vid_len)
print('MAX_TEXT_LEN', max_text_len)
print('MAX_PHONEMES_LEN', max_phonemes_len)