File size: 2,639 Bytes
df07554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import options
import pronouncing

from tqdm.auto import tqdm
from typing import List
from dataset import GridDataset

# Script: for every dataset split list, write a "phonemes_"-prefixed list
# containing only the samples whose alignment file could be loaded, and
# report the longest video / text / phoneme sequence seen across all splits.
#
# All directories are resolved relative to the parent of the current working
# directory, using sub-paths configured in the project's `options` module.
base = os.path.abspath('..')
anno_dir = os.path.join(base, options.alignments_dir)
phonemes_dir = os.path.join(base, options.phonemes_dir)
images_dir = os.path.join(base, options.images_dir)
datasets_filenames = ['overlap_train.txt', 'overlap_val.txt']

# Running maxima over every sample processed so far (all splits combined).
max_vid_len = 0
max_text_len = 0
max_phonemes_len = 0

for datasets_filename in datasets_filenames:
    datasets_filepath = os.path.join(base, 'data', datasets_filename)
    new_datasets_filepath = os.path.join(
        base, 'data', 'phonemes_' + datasets_filename
    )

    # Read the whole list up front (so tqdm knows the total) and make sure
    # the handle is closed straight away.
    with open(datasets_filepath, 'r') as datasets_file:
        video_filepaths = datasets_file.readlines()

    valid_filepaths = []

    for video_filepath in tqdm(video_filepaths):
        video_filepath = video_filepath.strip()
        if not video_filepath:
            # Blank line in the list file: nothing to process.
            continue

        basename = os.path.basename(video_filepath)
        # The first path component is used as the speaker directory name.
        speaker_dirname = video_filepath.split('/')[0]

        align_file = os.path.join(
            anno_dir, speaker_dirname, f'{basename}.align'
        )
        vid_images_dir = os.path.join(
            images_dir, speaker_dirname, basename
        )
        new_video_filepath = os.path.join(
            options.video_dir, speaker_dirname, f'{basename}.mpg'
        )

        # Video length is the number of .jpg files in the sample's image
        # directory.
        vid_len = sum(
            1 for filename in os.listdir(vid_images_dir)
            if filename.endswith('.jpg')
        )

        try:
            sentence: List[str] = GridDataset.load_sentence(
                align_file, char_map=options.text_char_map
            )
        except FileNotFoundError:
            # No alignment available for this sample: leave it out of the
            # new list and out of the statistics.
            continue

        text_len = len(sentence)
        sentence_str = ''.join(sentence)
        phonemes_sentence = GridDataset.text_to_phonemes(
            sentence_str, as_str=False
        )
        phonemes_len = len(phonemes_sentence)

        max_vid_len = max(vid_len, max_vid_len)
        max_text_len = max(text_len, max_text_len)
        max_phonemes_len = max(phonemes_len, max_phonemes_len)

        # Sanity check on the running maxima: the longest video must be more
        # than twice as long as the longest text and phoneme sequences.
        # NOTE(review): presumably a sequence-length constraint required by
        # the training loss -- confirm; it compares global maxima, not the
        # current sample's own lengths.
        assert (
            (max_vid_len > 2 * max_text_len) and
            (max_vid_len > 2 * max_phonemes_len)
        ), (
            f'length check failed at {video_filepath!r}: '
            f'max_vid_len={max_vid_len}, max_text_len={max_text_len}, '
            f'max_phonemes_len={max_phonemes_len}'
        )

        valid_filepaths.append(new_video_filepath)

    # One path per line, no trailing newline; the context manager guarantees
    # the data is flushed and the handle closed.
    with open(new_datasets_filepath, 'w') as new_datasets_file:
        new_datasets_file.write('\n'.join(valid_filepaths))
    print('new valid filepaths written to:', new_datasets_filepath)


print('MAX_VID_LEN', max_vid_len)
print('MAX_TEXT_LEN', max_text_len)
print('MAX_PHONEMES_LEN', max_phonemes_len)