Spaces:
Runtime error
Runtime error
Delete Create_dataset
Browse files
Create_dataset/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
|
|
|
|
Create_dataset/cr_dataset_script.py
DELETED
@@ -1,97 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
import numpy as np
|
3 |
-
from datasets import load_dataset
|
4 |
-
from datasets import Dataset, DatasetDict
|
5 |
-
from IPython.display import Audio
|
6 |
-
import scipy
|
7 |
-
import librosa
|
8 |
-
from tqdm import tqdm
|
9 |
-
import re
|
10 |
-
import os
|
11 |
-
|
12 |
-
|
13 |
-
def load_audio(audio_dict:dict)->None:
    """Resample one audio clip to 22.05 kHz and save it as a 16-bit PCM WAV file.

    Despite the name, this function *writes* audio; it does not load anything.

    Args:
        audio_dict: Mapping with the three keys read here:
            ``'array'`` (the samples), ``'sampling_rate'`` (the source rate
            in Hz) and ``'path'`` (where the WAV file is written; a relative
            path is resolved against the current working directory).
            NOTE(review): the int16 scaling below assumes float samples
            nominally in [-1.0, 1.0] -- confirm against the dataset.
    """
    # Imported explicitly: a bare ``import scipy`` does not guarantee that the
    # ``scipy.io.wavfile`` submodule is loaded on every SciPy version.
    from scipy.io import wavfile

    target_sr = 22050
    audio_resampled = librosa.resample(
        np.array(audio_dict['array']),
        orig_sr=audio_dict['sampling_rate'],
        target_sr=target_sr,
    )
    # Resampling can overshoot slightly past +/-1.0. Without clipping, the
    # cast to int16 wraps around and turns those peaks into loud clicks.
    pcm = (np.clip(audio_resampled, -1.0, 1.0) * 32767).astype(np.int16)
    wavfile.write(audio_dict['path'], rate=target_sr, data=pcm)
|
21 |
-
|
22 |
-
def remove_outer_quotes_regex(sen:str)->str:
    """Strip one pair of matching quotes that wraps the whole string.

    The string is returned unchanged unless it both starts and ends with the
    *same* quote character (``"`` or ``'``). Quotes inside the text are kept.

    Args:
        sen: Text that may be wrapped in quotes.

    Returns:
        ``sen`` without its outer quote pair, or ``sen`` itself when it is
        not wrapped in a matching pair.
    """
    # ``\1`` forces the closing quote to equal the opening one, so text such
    # as ``'tis said "yes"`` is no longer mangled. DOTALL lets ``.`` cross
    # newlines so quoted multi-line text is unwrapped as well.
    return re.sub(r'^(["\'])(.*)\1$', r'\2', sen, flags=re.DOTALL)
|
24 |
-
|
25 |
-
def _iter_shards(split, num_shards):
    """Yield every row of *split*, visiting it one shard at a time."""
    for ind in range(num_shards):
        yield from split.shard(num_shards=num_shards, index=ind)


def _export_split(rows, total, dataset_dir, filelist_name):
    """Save each row's audio under ``<dataset_dir>/wavs`` and write its filelist.

    Args:
        rows: Iterable of dataset rows; each row is read through
            ``row['audio']`` and ``row['raw_transcription']``.
        total: Number of rows, used only to size the progress bar.
        dataset_dir: Absolute path of the output dataset directory.
        filelist_name: File name of the ``path|text`` list written directly
            inside ``dataset_dir``.
    """
    path = []
    text = []
    with tqdm(total=total, leave=False) as pbar:
        for row in rows:
            audio = row['audio']
            # Hand load_audio an explicit destination instead of relying on
            # the process working directory being the wavs folder.
            load_audio({
                'array': audio['array'],
                'sampling_rate': audio['sampling_rate'],
                'path': os.path.join(dataset_dir, 'wavs', audio['path']),
            })
            path.append(audio['path'])
            text.append(row['raw_transcription'])
            pbar.update(1)

    wavs_prefix = f'{dataset_dir}/wavs/'
    df = pd.DataFrame({'path': path, 'text': text})
    df['text'] = df['text'].map(remove_outer_quotes_regex)
    df['path'] = wavs_prefix + df['path']
    df.to_csv(os.path.join(dataset_dir, filelist_name),
              sep='|', header=False, index=False)


def main()->None:
    """Download the Kany dataset and export it as WAV files plus filelists.

    Creates ``kany_dataset/wavs`` under the current working directory, writes
    one WAV file per row of the ``train`` and ``test`` splits, and writes
    ``kany_filelist_train.txt`` / ``kany_filelist_test.txt`` (``path|text``
    lines, no header) into ``kany_dataset``. The working directory is never
    changed, so a failure part-way through cannot leave the process in an
    unexpected directory.
    """
    dataset_dir = os.path.abspath('kany_dataset')
    # exist_ok lets the script be re-run without failing on the folders
    # created by a previous run.
    os.makedirs(os.path.join(dataset_dir, 'wavs'), exist_ok=True)

    # Raw string: in a normal string the backslash ending the first art line
    # is a line continuation that glues it to the next line.
    # NOTE(review): the art's leading spaces are reconstructed -- adjust to taste.
    art = r"""
 /\_/\
( o.o )
 > ^ <

V O I C E
"""
    print(art)

    print('--- LOADING DATASET ---')
    dataset_kany = load_dataset("Simonlob/Kany_dataset_mk4")

    # mk TRAIN
    print()
    print('--- CONVERTIND AND SAVING THE TRAIN DATASET ---')
    num_shards = 20
    train_split = dataset_kany['train']
    _export_split(_iter_shards(train_split, num_shards), len(train_split),
                  dataset_dir, 'kany_filelist_train.txt')

    # mk TEST
    print()
    print('--- CONVERTIND AND SAVING THE TEST DATASET ---')
    test_split = dataset_kany['test']
    _export_split(test_split, len(test_split),
                  dataset_dir, 'kany_filelist_test.txt')

    print()
    print('--- THE DATASET IS READY ---')
    print(f'Dir of data is "{dataset_dir}"')


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|