|
import json |
|
from random import shuffle |
|
|
|
import tqdm |
|
from text.cleaner import clean_text |
|
from collections import defaultdict |
|
import shutil |
|
# Pipeline stages to run:
#   1 - clean the raw transcription and write the ".cleaned" list,
#   2 - build the train/val file lists and the speaker-id map,
#   3 - store the speaker count and speaker-id map in the config file.
stage = [1, 2, 3]

# Raw transcription list; each line is "utt|spk|language|text".
transcription_path = 'filelists/short_character_anno.list'
# File lists written by stage 2.
train_path = 'filelists/train.list'
val_path = 'filelists/val.list'
# JSON config updated in place by stage 3.
config_path = "configs/config.json"
# Utterances taken from each speaker for the validation list.
val_per_spk = 4
# Cap on the total size of the validation list; the overflow goes to training.
max_val_total = 8
|
|
|
if 1 in stage:
    # Stage 1: run clean_text() over every "utt|spk|language|text" line and
    # write "utt|spk|language|norm_text|phones|tones|word2ph" lines to
    # "<transcription_path>.cleaned". Lines that fail are reported and skipped.
    # Both files are managed by the with-statement so neither handle leaks.
    with open(transcription_path, encoding='utf-8') as src, \
            open(transcription_path + '.cleaned', 'w', encoding='utf-8') as f:
        for line in tqdm.tqdm(src.readlines()):
            try:
                utt, spk, language, text = line.strip().split('|')
                norm_text, phones, tones, word2ph = clean_text(text, language)
                f.write('{}|{}|{}|{}|{}|{}|{}\n'.format(
                    utt, spk, language, norm_text,
                    ' '.join(phones),
                    " ".join([str(i) for i in tones]),
                    " ".join([str(i) for i in word2ph])))
            except Exception as exc:
                # Report the offending line itself: `utt` is unbound (or left
                # over from the previous line) when the split above fails.
                # `Exception` rather than a bare except so Ctrl-C still stops
                # the run.
                print("err!", line.strip(), repr(exc))
|
|
|
if 2 in stage:
    # Stage 2: group the cleaned lines by speaker, assign each speaker an
    # integer id in order of first appearance, and write the train/val lists.
    spk_utt_map = defaultdict(list)
    spk_id_map = {}
    current_sid = 0

    file_path = transcription_path + '.cleaned'
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            # Only the speaker is used, but the full unpack also rejects any
            # line that does not have exactly seven '|'-separated fields.
            _utt, spk, _language, _text, _phones, _tones, _word2ph = \
                line.strip().split('|')
            spk_utt_map[spk].append(line)
            if spk not in spk_id_map:
                spk_id_map[spk] = current_sid
                current_sid += 1

    train_list = []
    val_list = []
    for spk, utts in spk_utt_map.items():
        # Random per-speaker split: the first val_per_spk go to validation.
        shuffle(utts)
        val_list += utts[:val_per_spk]
        train_list += utts[val_per_spk:]

    # Enforce the global validation cap; the overflow is trained on instead.
    if len(val_list) > max_val_total:
        train_list += val_list[max_val_total:]
        val_list = val_list[:max_val_total]

    with open(train_path, "w", encoding='utf-8') as f:
        f.writelines(train_list)

    # NOTE(review): this copy replaces the train split written just above with
    # the complete cleaned transcription, so the validation utterances are
    # also present in the training list. Kept because it looks deliberate -
    # confirm, and drop either this copy or the write above.
    shutil.copy(file_path, train_path)

    with open(val_path, "w", encoding='utf-8') as f:
        f.writelines(val_list)
|
|
|
if 3 in stage:
    # Stage 3: record the speaker count and the speaker-name -> id map in the
    # JSON config, rewriting the config file in place.
    # It reads current_sid and spk_id_map, which only stage 2 defines. An
    # explicit raise (unlike a bare assert) still fires under `python -O`.
    if 2 not in stage:
        raise AssertionError("stage 3 requires stage 2 to run in the same invocation")

    # Read with the same encoding used for writing below: the file is dumped
    # with ensure_ascii=False, so it may contain non-ASCII speaker names.
    with open(config_path, encoding='utf-8') as f:
        config = json.load(f)

    config['data']["n_speakers"] = current_sid
    config["data"]['spk2id'] = spk_id_map

    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2, ensure_ascii=False)
|
|