File size: 3,703 Bytes
72835fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
from glob import glob
import re
import string
import random


def replace_extra_chars(line):
    """Return ``line`` without parentheses and without surrounding whitespace.

    Only the characters "(" and ")" are removed; every other character is
    kept. Leading and trailing whitespace is trimmed after the removal.
    """
    kept = [ch for ch in line if ch != "(" and ch != ")"]
    return "".join(kept).strip()


def write_txt(content, filename):
    """Write the string ``content`` to ``filename`` as UTF-8 text.

    The file is created if needed and truncated if it already exists.
    """
    with open(filename, mode="w+", encoding="utf-8") as out_handle:
        out_handle.write(content)


def save_train_test_valid_split(
    annotations_txt, num_samples_valid, num_samples_test, seed=None
):
    """Randomly split an annotations file into train/valid/test text files.

    Every line of ``annotations_txt`` is stripped and assigned to exactly one
    split. ``train.txt``, ``valid.txt`` and ``test.txt`` are written (UTF-8)
    into the directory that contains ``annotations_txt``; within each split
    the lines keep the order they had in the input file.

    Args:
        annotations_txt: Path of the annotations file to split.
        num_samples_valid: Number of lines to place in ``valid.txt``.
        num_samples_test: Number of lines to place in ``test.txt``.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` (default) the module-level ``random``
            state is used, as before.

    Raises:
        ValueError: If a sample count is negative or the two counts together
            exceed the number of lines in ``annotations_txt``.
    """
    with open(annotations_txt, encoding="utf-8") as f:
        all_lines = [line.strip() for line in f]

    num_held_out = num_samples_valid + num_samples_test
    if num_samples_valid < 0 or num_samples_test < 0:
        raise ValueError("num_samples_valid and num_samples_test must be >= 0")
    if num_held_out > len(all_lines):
        raise ValueError(
            f"Requested {num_held_out} valid+test samples but "
            f"{annotations_txt} only has {len(all_lines)} lines"
        )

    rng = random if seed is None else random.Random(seed)
    held_out_indices = rng.sample(range(len(all_lines)), num_held_out)
    # Sets give O(1) membership tests; the lists made this step quadratic.
    valid_ix = set(held_out_indices[:num_samples_valid])
    test_ix = set(held_out_indices[num_samples_valid:])

    train, valid, test = [], [], []
    for i, line in enumerate(all_lines):
        if i in valid_ix:
            valid.append(line)
        elif i in test_ix:
            test.append(line)
        else:
            train.append(line)

    print(f"Num samples in train: {len(train)}")
    print(f"Num samples in valid: {len(valid)}")
    print(f"Num samples in test: {len(test)}")

    # os.path.dirname handles root-level files and the platform's separator,
    # which splitting on "/" by hand did not.
    out_dir_path = os.path.dirname(annotations_txt)
    for split_name, split_lines in (
        ("train.txt", train),
        ("valid.txt", valid),
        ("test.txt", test),
    ):
        split_path = os.path.join(out_dir_path, split_name)
        with open(split_path, "w", encoding="utf-8") as f:
            f.writelines(f"{line}\n" for line in split_lines)
    print(f"train, test and valid txts saved in {out_dir_path}")


def save_txts_from_txt_done_data(
    text_path,
    wav_path_for_annotations_txt,
    out_path_for_txts,
    num_samples_valid,
    num_samples_test,
):
    """Build annotation and character-inventory files from a transcript file.

    Each non-blank line of ``text_path`` is cleaned with
    ``replace_extra_chars`` and split on double quotes: the part before the
    first quote is taken as the file name and the part between the first and
    second quote as the text. The following files are written (UTF-8) into
    ``out_path_for_txts``:

    * ``annotations.txt`` - one ``<wav path>|<text>`` line per entry, where
      the wav path is ``wav_path_for_annotations_txt`` joined with the file
      name plus ``.wav``.
    * ``punc.txt`` - the ``string.punctuation`` characters that occur in the
      texts, followed by a space.
    * ``chars.txt`` - all other non-whitespace characters that occur.
    * ``train.txt`` / ``valid.txt`` / ``test.txt`` - produced by
      ``save_train_test_valid_split`` from ``annotations.txt``.

    Args:
        text_path: Path of the input transcript file (read as UTF-8).
        wav_path_for_annotations_txt: Directory prefix written in front of
            each file name in ``annotations.txt``.
        out_path_for_txts: Directory that receives all output files; it is
            created if it does not exist.
        num_samples_valid: Number of entries for the validation split.
        num_samples_test: Number of entries for the test split.

    Raises:
        IndexError: If a non-blank line contains no double quote.
    """
    if out_path_for_txts:
        os.makedirs(out_path_for_txts, exist_ok=True)
    outfile = os.path.join(out_path_for_txts, "annotations.txt")

    # Read explicitly as UTF-8 (all outputs are UTF-8) and close the handle.
    with open(text_path, encoding="utf-8") as text_f:
        file_lines = [replace_extra_chars(line) for line in text_f.read().splitlines()]

    fnames, ftexts = [], []
    for line in file_lines:
        if not line:
            # Blank lines carry no entry; indexing elems[1] would fail on them.
            continue
        elems = line.split('"')
        fnames.append(elems[0].strip())
        ftexts.append(elems[1].strip())

    punctuation = set(string.punctuation)
    # Sorted so the inventory files are identical from run to run; plain set
    # iteration order for strings varies between interpreter runs.
    all_chars = sorted(set("".join(ftexts)))
    punct_with_space = "".join(c for c in all_chars if c in punctuation) + " "
    chars = "".join(c for c in all_chars if c not in punctuation and c.strip())
    print(chars)
    print(punct_with_space)

    # The file must be closed (flushed) before the split step reads it back;
    # otherwise buffered lines may be missing from what is read.
    with open(outfile, "w", encoding="utf-8") as outfile_f:
        for fname, text in zip(fnames, ftexts):
            print(
                os.path.join(wav_path_for_annotations_txt, fname) + ".wav",
                text,
                sep="|",
                file=outfile_f,
            )

    write_txt(punct_with_space, os.path.join(out_path_for_txts, "punc.txt"))
    write_txt(chars, os.path.join(out_path_for_txts, "chars.txt"))

    save_train_test_valid_split(
        annotations_txt=outfile,
        num_samples_valid=num_samples_valid,
        num_samples_test=num_samples_test,
    )


if __name__ == "__main__":
    # Script entry point: configure the paths and split sizes, then generate
    # the annotation, character-inventory and split files.
    # NOTE(review): text_path looks like a placeholder - set it before running.
    text_path = "path/to/txt.done.data"
    out_path_for_txts = "vakyansh-tts/data/training/"
    wav_path_for_annotations_txt = "vakyansh-tts/data/training/wav_16K"
    num_samples_valid = 400
    num_samples_test = 50

    save_txts_from_txt_done_data(
        text_path=text_path,
        wav_path_for_annotations_txt=wav_path_for_annotations_txt,
        out_path_for_txts=out_path_for_txts,
        num_samples_valid=num_samples_valid,
        num_samples_test=num_samples_test,
    )