In [None]:
import os

import numpy as np
import soundfile as sf
from pathlib import Path
from shutil import copyfile
from tqdm import tqdm

# Root folder of the original corpus (edit the placeholder to your local copy).
input_dataset_path = "[your_local_path]/synpaflex-corpus/v0.1/"
# Folder that receives the reorganized dataset.
reorganized_dataset_path = "../synpaflex/"

# Maximal audio file duration, in seconds.
maximal_duration = 12


In [None]:
# Copied audio files go to the "wavs/" subfolder of the reorganized dataset;
# create it up front (no error if it already exists).
wav_dir = os.path.join(reorganized_dataset_path, "wavs/")
os.makedirs(wav_dir, exist_ok=True)

# Accumulators filled while walking the dataset: the metadata lines and the
# summed audio duration.
data, total_duration = [], 0

# tqdm cannot know the length of the os.walk generator, so count the visited
# directories once beforehand to give the progress bar a total.
walk_count = sum(1 for _ in os.walk(input_dataset_path))

# Walk through the dataset. For every wav file that
#   * can be read by soundfile,
#   * lasts at most `maximal_duration` seconds,
#   * has a transcript in a "txt" folder next to it or one level up, and
#   * whose transcript contains no digit,
# append a "<name>|<text>|<normalized text>" line to `data`, copy the wav to
# `wav_dir`, and add its duration to `total_duration`.
for subdir, dirs, files in tqdm(os.walk(input_dataset_path), total=walk_count, bar_format='Data Reorganization : {l_bar}{bar}|'):
    for filename in files:
        # Only wav files are candidates.
        if not filename.endswith(".wav"):
            continue
        filepath = os.path.join(subdir, filename)

        # Keep the try body minimal: only the audio read is expected to raise
        # the RuntimeError handled here.
        try:
            wav, sr = sf.read(filepath)
        except RuntimeError:
            print(filepath + " not recognized and ignored.")
            continue

        # Only keep files with durations up to maximal_duration.
        duration = len(wav) / sr
        if duration > maximal_duration:
            continue

        # Strip only the trailing extension; str.replace would also rewrite
        # any other ".wav" occurrence inside the name.
        utterance_id = filename[:-len(".wav")]
        txt_name = utterance_id + ".txt"

        # Find the corresponding text file: first in "<wav folder>/txt", then
        # in "<parent of wav folder>/txt".
        wav_folder = Path(filepath).parent.absolute()
        txt_file_path = os.path.join(wav_folder, "txt", txt_name)
        if not os.path.exists(txt_file_path):
            txt_file_path = os.path.join(wav_folder.parent, "txt", txt_name)
            if not os.path.exists(txt_file_path):
                # Skip this file only. The previous `break` abandoned every
                # remaining file of the directory as soon as one transcript
                # was missing.
                continue

        # NOTE(review): the transcripts are read with the platform default
        # encoding, as before — confirm the corpus encoding and pass
        # `encoding=` explicitly if it is known.
        with open(txt_file_path, "r") as txt_file:
            text = txt_file.read()

        # Use the normalized transcript when one exists next to the raw one,
        # otherwise fall back to the raw text.
        norm_text_file_path = txt_file_path[:-len(".txt")] + "_norm.txt"
        if os.path.exists(norm_text_file_path):
            with open(norm_text_file_path, 'r') as norm_file:
                norm_text = norm_file.read()
        else:
            norm_text = text

        # Ignore the file if its text contains digits.
        if any(char.isdigit() for char in text):
            continue

        # Copy the wav file and keep its metadata in memory.
        data.append(utterance_id + '|' + text + '|' + norm_text)
        copyfile(filepath, os.path.join(wav_dir, filename))

        # Count the duration only for files that were actually kept, so the
        # reported total matches the reorganized dataset.
        total_duration += duration

# Write the collected metadata, one entry per line, to "synpaflex.txt" at the
# root of the reorganized dataset.
metadata_path = os.path.join(reorganized_dataset_path, "synpaflex.txt")
with open(metadata_path, 'w') as metadata_file:
    metadata_file.writelines("%s\n" % entry for entry in data)

# Report the accumulated duration, converted from seconds to hours and shown
# with two decimals.
duration_hours = total_duration / 3600
print("total duration = " + f"{duration_hours:.2f}" + " hours")