"""Build cleaned transcriptions, train/val file lists and speaker count.

The script runs up to three stages, selected by ``stage``:

1. Read ``transcription_path`` (``utt|spk|language|text`` per line), run
   ``clean_text`` on the text and write a ``.cleaned`` file with seven
   pipe-separated fields per line.
2. Group the cleaned lines by speaker, assign each speaker an integer id in
   order of first appearance, and split the lines into train/val lists.
3. Store the number of speakers found in stage 2 in the JSON config.
"""
import json
import shutil
from collections import defaultdict
from random import shuffle

import tqdm

from text.cleaner import clean_text

stage = [1, 2, 3]

transcription_path = 'filelists/short_character_anno.list'
train_path = 'filelists/train.list'
val_path = 'filelists/val.list'
config_path = "configs/config.json"
val_per_spk = 4    # utterances per speaker initially taken for validation
max_val_total = 8  # upper bound on the size of the validation list

if 1 in stage:
    # Read the input up front so the handle is closed and tqdm knows the total.
    with open(transcription_path, encoding='utf-8') as src:
        raw_lines = src.readlines()

    with open(transcription_path + '.cleaned', 'w', encoding='utf-8') as f:
        for line in tqdm.tqdm(raw_lines):
            # Derive the id before the try block so the error report always
            # refers to the line that actually failed.
            utt = line.split('|', 1)[0].strip()
            try:
                utt, spk, language, text = line.strip().split('|')
                # language = "ZH"
                norm_text, phones, tones, word2ph = clean_text(text, language)
                f.write('{}|{}|{}|{}|{}|{}|{}\n'.format(
                    utt,
                    spk,
                    language,
                    norm_text,
                    ' '.join(phones),
                    " ".join([str(i) for i in tones]),
                    " ".join([str(i) for i in word2ph])))
            except Exception as exc:
                # Best effort: skip lines that cannot be parsed or cleaned,
                # but let KeyboardInterrupt / SystemExit propagate.
                print("err!", utt, exc)

if 2 in stage:
    spk_utt_map = defaultdict(list)
    spk_id_map = {}
    current_sid = 0

    with open(transcription_path + '.cleaned', encoding='utf-8') as f:
        for line in f:
            # Unpacking into seven names also checks the field count.
            (_utt, spk, _language, _text,
             _phones, _tones, _word2ph) = line.strip().split('|')
            spk_utt_map[spk].append(line)
            if spk not in spk_id_map:
                spk_id_map[spk] = current_sid
                current_sid += 1

    train_list = []
    val_list = []
    for spk, utts in spk_utt_map.items():
        shuffle(utts)
        val_list += utts[:val_per_spk]
        train_list += utts[val_per_spk:]
    if len(val_list) > max_val_total:
        # Give the surplus validation lines back to the training list.
        train_list += val_list[max_val_total:]
        val_list = val_list[:max_val_total]

    with open(train_path, "w", encoding='utf-8') as f:
        for line in train_list:
            f.write(line)

    # NOTE(review): this copy targets the same file as train_path, so the
    # split written just above is replaced by the full cleaned list (which
    # includes the validation lines). Kept as-is because it looks deliberate;
    # confirm it is intended.
    file_path = transcription_path + '.cleaned'
    shutil.copy(file_path, './filelists/train.list')

    with open(val_path, "w", encoding='utf-8') as f:
        for line in val_list:
            f.write(line)

if 3 in stage:
    # Explicit check instead of `assert` so it still fires under `python -O`;
    # stage 3 needs `current_sid` from stage 2.
    if 2 not in stage:
        raise AssertionError("stage 3 requires stage 2")
    # Read with the same encoding the file is written with below.
    with open(config_path, encoding='utf-8') as f:
        config = json.load(f)
    config['data']["n_speakers"] = current_sid
    # config["data"]['spk2id'] = spk_id_map
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2, ensure_ascii=False)