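"""Prepare TTS training data from a txt.done.data-style transcript file.

Reads <file_id> "<transcript>" lines, writes annotations.txt
("<wav_path>|<transcript>"), chars.txt and punc.txt, updates a glow config
with the derived character/punctuation vocabulary, and splits the
annotations into train/valid/test sets.
"""
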
import os
import string
import argparse
import json
import random

# Fixed seed so the random train/valid/test split is reproducible.
random.seed(42)

def replace_extra_chars(line):
    """Remove parentheses from a transcript line and strip surrounding whitespace."""
    line = line.replace("(", "").replace(
        ")", ""
    )  # .replace('\u200d', ' ').replace('\ufeff', ' ').replace('\u200c', ' ').replace('\u200e', ' ')
    # line = line.replace('“', ' ').replace('”', ' ').replace(':', ' ')

    return line.strip()


def write_txt(content, filename):
    with open(filename, "w+", encoding="utf-8") as f:
        f.write(content)


def save_train_test_valid_split(annotations_txt, num_samples_valid, num_samples_test):
    """Randomly split annotations.txt into train/valid/test txt files saved alongside it."""
    with open(annotations_txt, encoding="utf-8") as f:
        all_lines = [line.strip() for line in f]

    # Sample valid + test indices without replacement; everything else goes to train.
    test_val_indices = random.sample(
        range(len(all_lines)), num_samples_valid + num_samples_test
    )
    valid_ix = set(test_val_indices[:num_samples_valid])
    test_ix = set(test_val_indices[num_samples_valid:])
    held_out_ix = valid_ix | test_ix
    train = [line for i, line in enumerate(all_lines) if i not in held_out_ix]
    valid = [line for i, line in enumerate(all_lines) if i in valid_ix]
    test = [line for i, line in enumerate(all_lines) if i in test_ix]

    print(f"Num samples in train: {len(train)}")
    print(f"Num samples in valid: {len(valid)}")
    print(f"Num samples in test: {len(test)}")

    out_dir_path = os.path.dirname(annotations_txt)
    with open(os.path.join(out_dir_path, "train.txt"), "w+", encoding="utf-8") as f:
        for line in train:
            print(line, file=f)
    with open(os.path.join(out_dir_path, "valid.txt"), "w+", encoding="utf-8") as f:
        for line in valid:
            print(line, file=f)
    with open(os.path.join(out_dir_path, "test.txt"), "w+", encoding="utf-8") as f:
        for line in test:
            print(line, file=f)
    print(f"train, valid and test txts saved in {out_dir_path}")


def save_txts_from_txt_done_data(
    text_path,
    wav_path_for_annotations_txt,
    out_path_for_txts,
    num_samples_valid,
    num_samples_test,
):
    """Parse a txt.done.data-style transcript file, write annotations/chars/punc txts,
    update the glow config with the derived vocabulary, and create the data split."""
    outfile = os.path.join(out_path_for_txts, "annotations.txt")
    with open(text_path, encoding="utf-8") as file:
        file_lines = file.readlines()

    file_lines = [replace_extra_chars(line) for line in file_lines]

    # Each line is expected to look like: <file_id> "<transcript>"
    fnames, ftexts = [], []
    for line in file_lines:
        elems = line.split('"')
        fnames.append(elems[0].strip())
        # Lowercase the transcript and normalise curly apostrophes to straight ones.
        ftexts.append(elems[1].strip().lower().replace('‘', "'").replace('’', "'"))

    # Build the vocabulary: split every character seen in the transcripts into
    # punctuation (plus space) and regular characters.
    all_chars = list(set("".join(ftexts)))
    punct_with_space = [i for i in all_chars if i in string.punctuation] + [" "]
    chars = [i for i in all_chars if i not in punct_with_space if i.strip()]
    chars = "".join(chars)
    punct_with_space = "".join(punct_with_space)

    # Patch the base glow config with the dataset-specific vocabulary and file lists.
    with open('../../config/glow/base_blank.json', 'r', encoding='utf-8') as jfile:
        json_config = json.load(jfile)

    json_config["data"]["chars"] = chars
    json_config["data"]["punc"] = punct_with_space
    json_config["data"]["training_files"] = out_path_for_txts + '/train.txt'
    json_config["data"]["validation_files"] = out_path_for_txts + '/valid.txt'
    new_config_name = out_path_for_txts.split('/')[-1]
    with open(f'../../config/glow/{new_config_name}.json', 'w+', encoding='utf-8') as jfile:
        json.dump(json_config, jfile)

    print(f"Characters: {chars}")
    print(f"Len of vocab: {len(chars)}")
    print(f"Punctuation: {punct_with_space}")
    print(f"Config file is stored at ../../config/glow/{new_config_name}.json")

    # Write annotations as "<wav_path>|<transcript>" lines, plus the punctuation
    # and character vocabulary files.
    with open(outfile, "w+", encoding="utf-8") as outfile_f:
        for f, t in zip(fnames, ftexts):
            print(
                os.path.join(wav_path_for_annotations_txt, f) + ".wav",
                t,
                sep="|",
                file=outfile_f,
            )
    write_txt(punct_with_space, os.path.join(out_path_for_txts, "punc.txt"))
    write_txt(chars, os.path.join(out_path_for_txts, "chars.txt"))

    save_train_test_valid_split(
        annotations_txt=outfile,
        num_samples_valid=num_samples_valid,
        num_samples_test=num_samples_test,
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--text-path", type=str, required=True,
                        help="path to the txt.done.data-style transcript file")
    parser.add_argument("-o", "--output-path", type=str, required=True,
                        help="directory where annotations/train/valid/test txts are written")
    parser.add_argument("-w", "--wav-path", type=str, required=True,
                        help="directory containing the wav files referenced in annotations.txt")
    parser.add_argument("-v", "--valid-samples", type=int, default=100)
    parser.add_argument("-t", "--test-samples", type=int, default=10)
    args = parser.parse_args()

    save_txts_from_txt_done_data(
        args.text_path,
        args.wav_path,
        args.output_path,
        args.valid_samples,
        args.test_samples,
    )
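
# Example invocation (hypothetical paths and script name, shown for illustration only):
#   python prepare_data.py -i /data/speaker1/txt.done.data \
#       -o /data/speaker1/processed -w /data/speaker1/wavs -v 100 -t 10
# This writes annotations.txt, chars.txt, punc.txt and the train/valid/test
# splits under /data/speaker1/processed, and stores the generated config at
# ../../config/glow/processed.json (named after the output directory).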