# Copyright (c) 2023 Amphion. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import os import json import os from collections import defaultdict from tqdm import tqdm def get_uids_and_wav_paths(cfg, dataset, dataset_type): assert dataset == "bigdata" dataset_dir = os.path.join( cfg.OUTPUT_PATH, "preprocess/{}_version".format(cfg.PREPROCESS_VERSION), "bigdata/{}".format(cfg.BIGDATA_VERSION), ) dataset_file = os.path.join( dataset_dir, "{}.json".format(dataset_type.split("_")[-1]) ) with open(dataset_file, "r") as f: utterances = json.load(f) # Uids uids = [u["Uid"] for u in utterances] # Wav paths wav_paths = [u["Path"] for u in utterances] return uids, wav_paths def take_duration(utt): return utt["Duration"] def main(output_path, cfg): datasets = cfg.dataset print("-" * 10) print("Preparing samples for bigdata...") print("Including: \n{}\n".format("\n".join(datasets))) datasets.sort() bigdata_version = "_".join(datasets) save_dir = os.path.join(output_path, bigdata_version) os.makedirs(save_dir, exist_ok=True) train_output_file = os.path.join(save_dir, "train.json") test_output_file = os.path.join(save_dir, "test.json") singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id) utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk) utt2singer = open(utt2singer_file, "a+") # We select songs of standard samples as test songs train = [] test = [] train_total_duration = 0 test_total_duration = 0 # Singer unique names singer_names = set() for dataset in datasets: dataset_path = os.path.join(output_path, dataset) train_json = os.path.join(dataset_path, "train.json") test_json = os.path.join(dataset_path, "test.json") with open(train_json, "r", encoding="utf-8") as f: train_utterances = json.load(f) with open(test_json, "r", encoding="utf-8") as f: test_utterances = json.load(f) for utt in tqdm(train_utterances): train.append(utt) train_total_duration += utt["Duration"] singer_names.add("{}_{}".format(utt["Dataset"], utt["Singer"])) utt2singer.write( "{}_{}\t{}_{}\n".format( utt["Dataset"], utt["Uid"], utt["Dataset"], utt["Singer"] ) ) for utt in test_utterances: test.append(utt) test_total_duration += utt["Duration"] singer_names.add("{}_{}".format(utt["Dataset"], utt["Singer"])) utt2singer.write( "{}_{}\t{}_{}\n".format( utt["Dataset"], utt["Uid"], utt["Dataset"], utt["Singer"] ) ) utt2singer.close() train.sort(key=take_duration) test.sort(key=take_duration) print("#Train = {}, #Test = {}".format(len(train), len(test))) print( "#Train hours= {}, #Test hours= {}".format( train_total_duration / 3600, test_total_duration / 3600 ) ) # Singer Look Up Table singer_names = list(singer_names) singer_names.sort() singer_lut = {name: i for i, name in enumerate(singer_names)} print("#Singers: {}\n".format(len(singer_lut))) # Save with open(train_output_file, "w") as f: json.dump(train, f, indent=4, ensure_ascii=False) with open(test_output_file, "w") as f: json.dump(test, f, indent=4, ensure_ascii=False) with open(singer_dict_file, "w") as f: json.dump(singer_lut, f, indent=4, ensure_ascii=False) # Save meta info meta_info = { "datasets": datasets, "train": {"size": len(train), "hours": round(train_total_duration / 3600, 4)}, "test": {"size": len(test), "hours": round(test_total_duration / 3600, 4)}, "singers": {"size": len(singer_lut)}, } singer2mins = defaultdict(float) for utt in train: dataset, singer, duration = utt["Dataset"], utt["Singer"], utt["Duration"] singer2mins["{}_{}".format(dataset, singer)] += duration / 60 singer2mins = sorted(singer2mins.items(), key=lambda x: x[1], reverse=True) singer2mins = dict( zip([i[0] for i in singer2mins], [round(i[1], 2) for i in singer2mins]) ) meta_info["singers"]["training_minutes"] = singer2mins with open(os.path.join(save_dir, "meta_info.json"), "w") as f: json.dump(meta_info, f, indent=4, ensure_ascii=False) for singer, min in singer2mins.items(): print("Singer {}: {} mins".format(singer, min)) print("-" * 10, "\n")