# Source: torchnet/scripts/overlap_filter.py
# Pushed to main by milselarch (commit df07554, 2.64 kB).
import os
import options
import pronouncing
from tqdm.auto import tqdm
from typing import List
from dataset import GridDataset
# Rewrites the "overlap" dataset listings into "phonemes_"-prefixed listings
# that contain only the samples whose alignment file could be loaded, and
# reports the largest video / text / phoneme lengths seen along the way.

# Project root: the parent of the current working directory, so the script
# has to be launched from a direct sub-directory of the project
# (presumably scripts/ -- confirm).
base = os.path.abspath('..')
anno_dir = os.path.join(base, options.alignments_dir)
phonemes_dir = os.path.join(base, options.phonemes_dir)
images_dir = os.path.join(base, options.images_dir)

# Listing files read from <base>/data, one video path per line.
datasets_filenames = ['overlap_train.txt', 'overlap_val.txt']

# Running maxima over every sample of every listing.
max_vid_len = 0
max_text_len = 0
max_phonemes_len = 0

for datasets_filename in datasets_filenames:
    datasets_filepath = os.path.join(base, 'data', datasets_filename)
    new_datasets_filepath = os.path.join(
        base, 'data', 'phonemes_' + datasets_filename
    )

    # Context manager so the handle is closed as soon as the lines are read.
    with open(datasets_filepath, 'r') as datasets_file:
        video_filepaths = datasets_file.readlines()

    valid_filepaths = []

    for video_filepath in tqdm(video_filepaths):
        video_filepath = video_filepath.strip()
        if not video_filepath:
            # Blank line in the listing: there is no sample to look up.
            continue

        basename = os.path.basename(video_filepath)
        # The first path component is used as the speaker directory name.
        parts = video_filepath.split('/')
        speaker_dirname = parts[0]

        align_file = os.path.join(
            anno_dir, speaker_dirname, f'{basename}.align'
        )
        vid_images_dir = os.path.join(
            images_dir, speaker_dirname, basename
        )
        new_video_filepath = os.path.join(
            options.video_dir, speaker_dirname, f'{basename}.mpg'
        )

        # Video length is measured as the number of .jpg files in the
        # sample's image directory.
        image_filenames = [
            filename for filename in os.listdir(vid_images_dir)
            if filename.endswith('.jpg')
        ]
        vid_len = len(image_filenames)

        try:
            sentence: List[str] = GridDataset.load_sentence(
                align_file, char_map=options.text_char_map
            )
        except FileNotFoundError:
            # No alignment file for this sample: leave it out of the
            # rewritten listing.
            continue

        text_len = len(sentence)
        sentence_str = ''.join(sentence)
        phonemes_sentence = GridDataset.text_to_phonemes(
            sentence_str, as_str=False
        )
        phonemes_len = len(phonemes_sentence)

        max_vid_len = max(vid_len, max_vid_len)
        max_text_len = max(text_len, max_text_len)
        max_phonemes_len = max(phonemes_len, max_phonemes_len)

        # Sanity check on the running maxima (not on the individual sample):
        # the longest video must be more than twice as long as the longest
        # text and the longest phoneme sequence.
        # NOTE(review): presumably a CTC-style input/target length
        # requirement -- confirm.
        assert (
            (max_vid_len > 2 * max_text_len) and
            (max_vid_len > 2 * max_phonemes_len)
        ), (
            f'length check failed at {video_filepath!r}: '
            f'max_vid_len={max_vid_len}, max_text_len={max_text_len}, '
            f'max_phonemes_len={max_phonemes_len}'
        )

        valid_filepaths.append(new_video_filepath)

    # Context manager so the listing is flushed and closed before moving on.
    with open(new_datasets_filepath, 'w') as new_datasets_file:
        new_datasets_file.write('\n'.join(valid_filepaths))

    print('new valid filepaths written to:', new_datasets_filepath)

print('MAX_VID_LEN', max_vid_len)
print('MAX_TEXT_LEN', max_text_len)
print('MAX_PHONEMES_LEN', max_phonemes_len)