File size: 2,808 Bytes
6703e27 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
import pandas as pd
import numpy as np
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from IPython.display import Audio
import scipy
import librosa
from tqdm import tqdm
import re
import os
def load_audio(audio_dict:dict)->None:
    """Resample one audio record to 22050 Hz and save it as a 16-bit PCM WAV.

    Despite its name, this function writes audio to disk; it returns nothing.

    Args:
        audio_dict: Mapping with the keys ``'array'`` (the samples),
            ``'sampling_rate'`` (their rate in Hz) and ``'path'`` (the
            destination file; a relative path is resolved against the
            current working directory). The samples are presumably floats
            in [-1.0, 1.0] -- confirm against the dataset's audio feature.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # expose ``scipy.io.wavfile`` on every SciPy release.
    from scipy.io import wavfile

    target_sr = 22050
    samples = np.asarray(audio_dict['array'])
    audio_resampled = librosa.resample(samples,
                                       orig_sr=audio_dict['sampling_rate'],
                                       target_sr=target_sr)
    # Resampling can overshoot slightly beyond +/-1.0; without clipping the
    # scaled value would not fit in int16 and would wrap around, producing
    # loud clicks in the written file.
    pcm = (np.clip(audio_resampled, -1.0, 1.0) * 32767).astype(np.int16)
    wavfile.write(audio_dict['path'], rate=target_sr, data=pcm)
# Compiled once at import time: an opening quote, the captured content, and a
# closing quote, anchored to the whole string.
_OUTER_QUOTES = re.compile(r'^["\'](.*)["\']$')


def remove_outer_quotes_regex(sen:str)->str:
    """Return ``sen`` without its enclosing pair of quote characters.

    The first and last characters are dropped when both are quotes
    (``"`` or ``'``); the two do not have to be the same kind. Any string
    that does not match that shape is returned unchanged.
    """
    stripped = _OUTER_QUOTES.sub(r'\1', sen)
    return stripped
def _export_split(split, wavs_dir, filelist_path, num_shards=None):
    """Write every clip of ``split`` as a WAV and save the matching filelist.

    Args:
        split: One split of the loaded dataset; each row must provide
            ``row['audio']`` (passed to ``load_audio``) and
            ``row['raw_transcription']``.
        wavs_dir: Absolute directory the WAV files are written into.
        filelist_path: Absolute path of the ``path|text`` file to create.
        num_shards: When given, the split is walked shard by shard via
            ``split.shard``; when ``None`` it is iterated directly.
    """
    # load_audio() writes to row['audio']['path'] as given, so a relative
    # path only lands in wavs_dir if that is the working directory.
    os.chdir(wavs_dir)
    paths = []
    texts = []
    with tqdm(total=len(split), leave=False) as pbar:
        if num_shards is None:
            shards = (split,)
        else:
            shards = (split.shard(num_shards=num_shards, index=ind)
                      for ind in range(num_shards))
        for shard in shards:
            for row in shard:
                audio = row['audio']
                load_audio(audio)
                paths.append(os.path.join(wavs_dir, audio['path']))
                texts.append(remove_outer_quotes_regex(row['raw_transcription']))
                pbar.update(1)
    df = pd.DataFrame({'path': paths, 'text': texts})
    df.to_csv(filelist_path, sep='|', header=False, index=False)


def main()->None:
    """Convert a Hugging Face dataset into WAV files plus train/test filelists.

    Prompts for a dataset name, creates ``<DATASET_NAME>/wavs`` under the
    current working directory, writes every ``train`` and ``test`` clip there
    through ``load_audio``, and saves ``<DATASET_NAME>_filelist_train.txt``
    and ``<DATASET_NAME>_filelist_test.txt`` (``path|text`` lines, no header)
    in ``<DATASET_NAME>``. The working directory is restored on exit, even
    when the conversion fails.

    Raises:
        ValueError: If no usable dataset name can be derived from the input.
    """
    name_dataset = input('Write HF dataset name as <REPO_NAME/DATASET_NAME>: ').strip()
    # Accept a bare "DATASET_NAME" as well as "REPO_NAME/DATASET_NAME"
    # instead of failing with an IndexError when there is no slash.
    name_parts = name_dataset.split('/')
    sub_name_dataset = name_parts[1] if len(name_parts) > 1 else name_parts[0]
    if not sub_name_dataset:
        raise ValueError(f'Cannot derive a dataset directory from {name_dataset!r}')

    start_dir = os.getcwd()
    absolute_path = os.path.abspath(sub_name_dataset)
    wavs_dir = os.path.join(absolute_path, 'wavs')
    # exist_ok lets the script be re-run over an earlier (partial) export.
    os.makedirs(wavs_dir, exist_ok=True)

    # Raw string: in a normal literal the backslash ending the first art line
    # is a line continuation (it would glue two lines together) and "\_" is
    # an invalid escape sequence.
    art = r"""
/\_/\
( o.o )
> ^ <
V O I C E
"""
    print(art)
    print('--- LOADING DATASET ---')
    your_dataset = load_dataset(name_dataset)

    try:
        # mk TRAIN
        print()
        print('--- CONVERTIND AND SAVING THE TRAIN DATASET ---')
        # The train split is walked in 20 shards, as in the original flow
        # (presumably to bound memory use -- TODO confirm).
        _export_split(your_dataset['train'],
                      wavs_dir,
                      os.path.join(absolute_path, f'{sub_name_dataset}_filelist_train.txt'),
                      num_shards=20)

        # mk TEST
        print()
        print('--- CONVERTIND AND SAVING THE TEST DATASET ---')
        _export_split(your_dataset['test'],
                      wavs_dir,
                      os.path.join(absolute_path, f'{sub_name_dataset}_filelist_test.txt'))
    finally:
        # Always return to where the script was started.
        os.chdir(start_dir)

    print()
    print('--- THE DATASET IS READY ---')
    print(f'Dir of data is "{absolute_path}"')
# Run the conversion only when this file is executed as a script, so that
# importing it does not trigger the interactive prompt.
if __name__ == "__main__":
    main()
|