File size: 2,061 Bytes
ee21b96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


import csv
from pathlib import Path


def main(args):
    """Write a tab-separated evaluation spec built from an audio manifest.

    Each row of the TSV manifest at ``args.audio_manifest`` produces one
    output row in ``args.output_path`` with the columns
    `id syn ref text speaker`, where ``syn`` is the path of the generated
    waveform under ``args.generation_root`` and ``ref`` is either the
    manifest's ``audio`` value or, when ``args.use_resynthesized_target`` is
    set, the matching file in the ``*_tgt`` directory.

    Args:
        args: namespace providing ``generation_root``, ``audio_manifest``,
            ``output_path``, ``use_resynthesized_target``, ``vocoder``,
            ``sample_rate`` and ``audio_format``.

    Raises:
        KeyError: if a manifest row lacks the ``id``, ``audio``,
            ``tgt_text`` or ``speaker`` column.
    """
    in_root = Path(args.generation_root).resolve()
    ext = args.audio_format
    # These paths depend only on the arguments, so build them once instead of
    # once per manifest row.
    dir_name = f"{ext}_{args.sample_rate}hz_{args.vocoder}"
    syn_dir = in_root / dir_name
    tgt_dir = in_root / f"{dir_name}_tgt"
    header = ["id", "syn", "ref", "text", "speaker"]
    # Pin UTF-8 so transcripts with non-ASCII characters are read and written
    # the same way regardless of the platform's default locale encoding.
    with open(args.audio_manifest, encoding="utf-8") as f, open(
        args.output_path, "w", encoding="utf-8"
    ) as f_out:
        # QUOTE_NONE: quote characters in the transcript are literal text.
        reader = csv.DictReader(
            f, delimiter="\t", quotechar=None, doublequote=False,
            lineterminator="\n", quoting=csv.QUOTE_NONE
        )
        f_out.write("\t".join(header) + "\n")
        for row in reader:
            id_ = row["id"]
            file_name = f"{id_}.{ext}"
            syn = (syn_dir / file_name).as_posix()
            ref = row["audio"]
            if args.use_resynthesized_target:
                ref = (tgt_dir / file_name).as_posix()
            sample = [id_, syn, ref, row["tgt_text"], row["speaker"]]
            f_out.write("\t".join(sample) + "\n")
    print(f"wrote evaluation file to {args.output_path}")


if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    import argparse
    parser = argparse.ArgumentParser()
    # The three path options have no usable default; marking them required
    # makes argparse report a usage error instead of main() failing later
    # with a TypeError on a None path.
    parser.add_argument(
        "--generation-root", required=True,
        help="output directory for generate_waveform.py"
    )
    parser.add_argument(
        "--audio-manifest", required=True,
        help="used to determine the original utterance ID and text"
    )
    parser.add_argument(
        "--output-path", required=True,
        help="path to output evaluation spec file"
    )
    parser.add_argument(
        "--use-resynthesized-target", action="store_true",
        help="use resynthesized reference instead of the original audio"
    )
    # The next three options are combined by main() into the name of the
    # directory that holds the generated audio files.
    parser.add_argument("--vocoder", type=str, default="griffin_lim")
    parser.add_argument("--sample-rate", type=int, default=22_050)
    parser.add_argument("--audio-format", type=str, default="wav")
    args = parser.parse_args()

    main(args)