|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from simple_tokenizer_infer import SpeechTokenizer |
|
|
import argparse |
|
|
import librosa |
|
|
import logging |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
def _load_wavs(base_path, rel_paths):
    """Load every path in *rel_paths* (relative to *base_path*) with sr=16000.

    Returns the waveforms in the same order as *rel_paths*.
    """
    # librosa.load returns (waveform, sample_rate); only the waveform is used.
    return [librosa.load(base_path / rel_path, sr=16000)[0] for rel_path in rel_paths]


def main(args):
    """Extract speech-token sequences for the audio named in an eval list.

    Each non-blank line of ``args.input_list`` is split on ``|``. Field 2 is
    the reference wav path; when ``args.input_type`` is not ``"tts"``, field 4
    is the reconstructed wav path. Wav paths are resolved relative to the
    directory containing the list. Fields 0 and 3 are copied through to the
    output unchanged.

    One line is written to ``args.output_file`` per input entry:

    * tts:         ``ref_path|ref_units|field0|field3``
    * reconstruct: ``ref_path|ref_units|field0|recon_units|field3``

    Args:
        args: Namespace providing ``input_list``, ``output_file``,
            ``input_type``, ``ckpt``, ``cfg_path`` and ``cfg_name``.

    Raises:
        IndexError: If a non-blank line has fewer fields than the mode needs.
        AssertionError: If the tokenizer returns a different number of
            results than there are input entries.
    """
    is_tts = args.input_type == "tts"

    ref_wav_file_list = []
    reconstruct_wav_file_list = []
    line_info_list = []

    logging.info("loading eval file list")
    base_path = Path(args.input_list).parent
    with open(args.input_list, "r") as input_file:
        for line in input_file:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at the end of the
                # list) would otherwise fail on fields[2] with IndexError.
                continue
            fields = stripped.split("|")
            if not is_tts:
                reconstruct_wav_file_list.append(fields[4])
            ref_wav_file_list.append(fields[2])
            line_info_list.append([fields[2], fields[0], fields[3]])

    logging.info("loading ref audio")
    raw_ref_wavs_list = _load_wavs(base_path, ref_wav_file_list)

    logging.info("extracting token for ref audio")
    if args.ckpt is not None:
        tokenizer = SpeechTokenizer(
            ckpt_path=args.ckpt, cfg_path=args.cfg_path, cfg_name=args.cfg_name
        )
    else:
        # No checkpoint given: rely on SpeechTokenizer's own defaults.
        tokenizer = SpeechTokenizer()
    # extract() returns a pair; only the second element (per-utterance info
    # dicts) is needed here.
    _, ref_token_info_list = tokenizer.extract(raw_ref_wavs_list)

    if args.input_type == "reconstruct":
        logging.info("loading reconstruct audio")
        raw_reconstruct_wav_list = _load_wavs(base_path, reconstruct_wav_file_list)

        logging.info("extracting token for reconstruct audio")
        _, recon_token_info_list = tokenizer.extract(raw_reconstruct_wav_list)
        # Internal invariant: zip() below would silently truncate otherwise.
        assert len(ref_token_info_list) == len(recon_token_info_list)

    assert len(ref_token_info_list) == len(line_info_list)

    # The with-statement closes the file; no explicit close() is needed.
    with open(args.output_file, "w") as output_file:
        logging.info("writing output file")
        if is_tts:
            for ref, line_info in zip(ref_token_info_list, line_info_list):
                ref_units = ref["reduced_unit_sequence"]
                ref_path = str(base_path / line_info[0])
                output_file.write(
                    f"{ref_path}|{ref_units}|{line_info[1]}|{line_info[2]}\n"
                )
        else:
            for ref, recon, line_info in zip(
                ref_token_info_list, recon_token_info_list, line_info_list
            ):
                ref_units = ref["reduced_unit_sequence"]
                recon_units = recon["reduced_unit_sequence"]
                ref_path = str(base_path / line_info[0])
                output_file.write(
                    f"{ref_path}|{ref_units}|{line_info[1]}|{recon_units}|{line_info[2]}\n"
                )

    logging.info("Finished")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().

    # The script reports progress via logging.info(); without configuration
    # the root logger's WARNING level drops those messages. basicConfig() is
    # a no-op if handlers were already installed (e.g. by an imported module).
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt",
        dest="ckpt",
        required=False,
        help="path to ckpt",
    )
    parser.add_argument(
        "--cfg-path",
        dest="cfg_path",
        required=False,
        default="config",
        help="path to config",
    )
    parser.add_argument(
        "--cfg-name",
        dest="cfg_name",
        required=False,
        default="hubert_config",
        help="name of config",
    )
    parser.add_argument(
        "--input-list",
        dest="input_list",
        required=True,
        help="list of input wavform",
    )
    parser.add_argument(
        "--output-file",
        dest="output_file",
        required=True,
        help="file to output speech tokens",
    )
    # The option is required, so the former default="tts" could never take
    # effect and is dropped. `choices` makes argparse reject any other value
    # with a usage message on stderr and exit status 2, instead of an INFO log
    # line (normally invisible) followed by a silent exit with status 0.
    parser.add_argument(
        "--input-type",
        type=str,
        required=True,
        choices=["tts", "reconstruct"],
        help="test fil list type: tts or reconstruct, seedtts format",
    )
    args = parser.parse_args()

    main(args)
|
|
|