File size: 1,991 Bytes
7900c16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python3

"""
Helper script to generate speech2text dataset
"""

import argparse
import os
import glob
import random


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("root")
    parser.add_argument("--output-file", required=True)
    parser.add_argument("--for_finetune", action='store_true')
    parser.add_argument("--sample-ratio", type=float, default=1.0)
    parser.add_argument("--ext", default="flac")
    parser.add_argument("--seed", type=int, default=7)
    args = parser.parse_args()

    rand = random.Random(args.seed)
    os.makedirs(os.path.dirname(os.path.realpath(args.output_file)), exist_ok=True)

    dir_path = os.path.realpath(args.root)
    search_path = os.path.join(dir_path, "**/*." + args.ext)

    transcriptions = {}

    with open(args.output_file, "w") as out_file:
        if args.for_finetune:
            print("text" + "\t" + "wav_path", file=out_file)
        for fname in glob.iglob(search_path, recursive=True):
            if rand.random() > args.sample_ratio:
                continue
            file_path = os.path.realpath(fname)

            dir = os.path.dirname(file_path)
            if dir not in transcriptions:
                parts = dir.split(os.path.sep)
                trans_path = f"{parts[-2]}-{parts[-1]}.trans.txt"
                path = os.path.join(args.root, dir, trans_path)
                assert os.path.exists(path)
                texts = {}
                with open(path, "r") as trans_f:
                    for tline in trans_f:
                        items = tline.strip().split()
                        texts[items[0]] = " ".join(items[1:]).lower()
                transcriptions[dir] = texts
            part = os.path.basename(file_path).split(".")[0]
            assert part in transcriptions[dir]
            print(
                    transcriptions[dir][part] + "\t" + file_path,
                    file=out_file
            )


if __name__ == "__main__":
    main()