Yurii Paniv committed on
Commit 369ee40
1 Parent(s): 4744f27

Add script for converting a custom dataset to a Common Voice-like format

Files changed (1)
  1. scripts/converter.py +97 -0
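
The converter below assumes (this layout is inferred from the script itself, not documented elsewhere in the commit) that every source folder contains a txt.final.data file with one space-separated "<audio file name> <transcription>" pair per line, and that the referenced clips live in a wav/ directory next to that folder. For each matching folder it writes Common Voice-style train.tsv / dev.tsv / test.tsv files with the columns

    client_id  path  sentence  up_votes  down_votes  age  gender  accent  locale  segment

of which only client_id (a running counter), path (the renamed clip), sentence and locale (hard-coded to "uk") are filled; the remaining columns are left empty.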
scripts/converter.py ADDED
@@ -0,0 +1,97 @@
+ import os
+ from random import shuffle
+ from shutil import copyfile
+
+ # file template needed for import script
+ template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}"
+ # structure example below
+ # client_id path sentence up_votes down_votes age gender accent locale segment
+ structure = template.format("client_id", "path", "sentence", "up_votes",
+                             "down_votes", "age", "gender", "accent", "locale", "segment")
+
+ iterator = 1
+ speaker_iterator = 1
+
+
+ def write_dataset(path, name, data):
+     """
+     Write a converted data list as a Common Voice-style TSV file
+     """
+     global iterator
+     global speaker_iterator
+     file_path = os.path.join(path, name)
+     clip_path = os.path.join(os.path.dirname(path), "wav")
+     result = open(file_path, mode="w", encoding="utf-8")
+     result.write(structure)
+     result.write("\n")
+     for row in data:
+         file_name = row[0]
+         if file_name.endswith(".wav"):
+             pass
+         elif file_name.endswith(".mp3"):
+             pass
+         elif file_name.find(".") == -1:
+             file_name += ".wav"
+         parted_name = file_name.split(".")
+
+         new_file_name = f"{iterator}." + parted_name[1]
+
+         old_file_path = os.path.join(clip_path, file_name)
+         new_file_path = os.path.join("clips", new_file_name)
+         if os.path.exists(old_file_path):
+             copyfile(old_file_path,
+                      new_file_path)
+             result.write(template.format(
+                 speaker_iterator, new_file_name, row[1], "", "", "", "", "", "uk", "\n"))
+             speaker_iterator += 1
+             iterator += 1
+         else:
+             print("File not found", old_file_path)
+     result.close()
+
+
+ if not os.path.exists("clips"):
+     os.makedirs("clips")  # create folder to contain processed clips
+
+ # iterate over all data lists and write converted version near them
+ for subdir, dirs, files in os.walk(os.path.abspath(os.path.curdir)):
+     print(subdir)
+     for file in files:
+         if file == "txt.final.data":
+             file_path = os.path.join(subdir, file)
+             file = open(file_path, mode="r")
+             data = [row.replace(" \n", "").split(" ", 1)
+                     for row in file.readlines()]
+             file.close()
+
+             shuffle(data)
+
+             dataset_size = len(data)
+             train_point = int(dataset_size * 0.8)
+             dev_point = int(train_point + (dataset_size - train_point) / 2)
+             # split dataset
+             write_dataset(subdir, "train.tsv", data[:train_point])
+             write_dataset(subdir, "dev.tsv", data[train_point:dev_point])
+             write_dataset(subdir, "test.tsv", data[dev_point:])
+
+ # write dataset splits into single files
+ final_files = {
+     "train.tsv": open("train.tsv", mode="w", encoding="utf-8"),
+     "dev.tsv": open("dev.tsv", mode="w", encoding="utf-8"),
+     "test.tsv": open("test.tsv", mode="w", encoding="utf-8")
+ }
+ for file in final_files.values():
+     file.write(structure)
+     file.write("\n")
+
+ for subdir, dirs, files in os.walk(os.path.curdir):
+     for file in files:
+         if file in ["train.tsv", "dev.tsv", "test.tsv"]:
+             input_file = open(os.path.join(subdir, file))
+             data = [row for row in input_file.readlines()][1::]
+             input_file.close()
+             for row in data:
+                 final_files[file].write(row)
+
+ for file in final_files.values():
+     file.close()
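
A minimal way to exercise the script, assuming it is run from the root of the dataset tree (the exact invocation is not part of the commit):

    python scripts/converter.py

Running it walks the current directory, writes an 80/10/10 train/dev/test split next to every txt.final.data it finds, copies the referenced audio into ./clips/ under sequential names, and merges all per-folder splits into top-level train.tsv, dev.tsv and test.tsv. A quick sanity check of the merged output could use the standard csv module; the snippet below is only an illustrative sketch, not part of the commit:

    import csv

    # print the first few converted rows from the merged train split
    with open("train.tsv", encoding="utf-8") as tsv_file:
        reader = csv.DictReader(tsv_file, delimiter="\t")
        for index, row in enumerate(reader):
            print(row["client_id"], row["path"], row["sentence"])
            if index == 4:
                break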