VITS2-Chinese / preprocess.py
kevinwang676's picture
Upload folder using huggingface_hub
de332ab
import os
import argparse
import json
import sys
# Raise the interpreter's recursion limit to work around
# "RecursionError: maximum recursion depth exceeded while calling a Python object".
# The value is arbitrary and can be changed as needed.
sys.setrecursionlimit(500000)
def _str2bool(value):
    """Parse a command-line boolean such as "True", "false", "1" or "no".

    ``argparse`` with ``type=bool`` simply calls ``bool()`` on the raw string,
    so every non-empty value -- including "False" -- becomes ``True``. This
    converter interprets the text instead.

    Args:
        value: The raw option string (a ``bool`` is returned unchanged).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised boolean word.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


def _parse_annotations(lines):
    """Split raw ``path|speaker|text`` lines into ``(path, speaker, text)`` tuples.

    Blank lines are skipped. Only the first two ``|`` characters are treated as
    separators, so a transcript that itself contains ``|`` stays intact. The
    text field keeps its trailing newline, if it has one.

    Args:
        lines: Iterable of raw annotation lines.

    Returns:
        A list of ``(path, speaker, text)`` tuples in input order.

    Raises:
        ValueError: If a non-blank line has fewer than three fields.
    """
    records = []
    for line_no, line in enumerate(lines, start=1):
        if not line.strip():
            continue
        fields = line.split("|", 2)
        if len(fields) != 3:
            raise ValueError(
                "Malformed annotation on line %d (expected 'path|speaker|text'): %r"
                % (line_no, line)
            )
        records.append(tuple(fields))
    return records


def main():
    """Build the fine-tuning config and the cleaned annotation lists.

    Reads ``./filelists/short_character_anno.list`` (if present) and
    ``./configs/finetune_speaker.json``; writes
    ``./configs/modified_finetune_speaker.json``,
    ``./filelists/final_annotation_train.txt`` and
    ``./filelists/final_annotation_val.txt``.

    Raises:
        SystemExit: If no speaker could be found in the annotations.
        ValueError: If an annotation line is malformed.
    """
    parser = argparse.ArgumentParser()
    # Both options are accepted for command-line compatibility; nothing below
    # reads their values (auxiliary data is never added, and only the "[ZH]"
    # tag is stripped from the cleaned text).
    parser.add_argument("--add_auxiliary_data", type=_str2bool, default=False, help="Whether to add extra data as fine-tuning helper")
    parser.add_argument("--languages", default="C")
    parser.parse_args()

    # Source 1: transcribed short audios
    anno_path = "./filelists/short_character_anno.list"
    new_annos = []
    if os.path.exists(anno_path):
        with open(anno_path, 'r', encoding='utf-8') as f:
            new_annos += f.readlines()
    records = _parse_annotations(new_annos)

    # Unique speaker names in first-seen order (dict keys preserve insertion order).
    speakers = list(dict.fromkeys(speaker for _, speaker, _ in records))
    if not speakers:
        # Explicit exit instead of `assert`, which is removed under `python -O`.
        raise SystemExit("No audio file found. Please check your uploaded file structure.")

    # STEP 1: modify config file
    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    # assign consecutive ids to the speakers
    speaker2id = {speaker: idx for idx, speaker in enumerate(speakers)}
    hps['data']["n_speakers"] = len(speakers)
    # overwrite speaker names
    hps['speakers'] = speaker2id
    hps['train']['log_interval'] = 10
    hps['train']['eval_interval'] = 100
    hps['train']['batch_size'] = 16
    # NOTE(review): these names carry no "./filelists/" prefix although the
    # files are written under ./filelists/ below -- confirm how the training
    # code resolves them.
    hps['data']['training_files'] = "final_annotation_train.txt"
    hps['data']['validation_files'] = "final_annotation_val.txt"
    # save modified config
    with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
        json.dump(hps, f, indent=2)

    # STEP 2: clean annotations.
    # NOTE(review): the speaker column is not written out (neither name nor
    # id); each output line is "path|cleaned_text". Confirm that the training
    # data loader expects this two-column format.
    import text  # project-local text-cleaning package, imported only when needed
    cleaners = hps['data']['text_cleaners']
    cleaned_new_annos = []
    for path, _speaker, txt in records:
        # Skip over-long transcripts (length is measured on the raw field).
        if len(txt) > 150:
            continue
        cleaned_text = text._clean_text(txt, cleaners).replace("[ZH]", "")
        if not cleaned_text.endswith("\n"):
            cleaned_text += "\n"
        cleaned_new_annos.append(path + "|" + cleaned_text)

    # The training and validation lists are deliberately the same data.
    final_annos = cleaned_new_annos
    # save annotation file
    with open("./filelists/final_annotation_train.txt", 'w', encoding='utf-8') as f:
        f.writelines(final_annos)
    # save annotation file for validation
    with open("./filelists/final_annotation_val.txt", 'w', encoding='utf-8') as f:
        f.writelines(cleaned_new_annos)
    print("finished")


if __name__ == "__main__":
    main()