# Spaces: Running on A10G
import os | |
import json | |
import sys | |
import librosa | |
def traverse_dir(
        root_dir,
        extension,
        amount=None,
        str_include=None,
        str_exclude=None,
        is_pure=False,
        is_sort=False,
        is_ext=True):
    """Recursively collect files under ``root_dir`` whose name ends with ``extension``.

    Args:
        root_dir: Directory to walk (with ``os.walk``).
        extension: Suffix a file name must end with. This is a plain
            ``str.endswith`` test, so ``'wav'`` and ``'.wav'`` both work.
        amount: Maximum number of paths to return; ``None`` means no limit.
        str_include: If given, keep only paths that contain this substring.
        str_exclude: If given, drop paths that contain this substring.
        is_pure: If true, return paths relative to ``root_dir``; otherwise
            return ``root_dir`` joined with the relative location. The
            include/exclude filters are applied to whichever form is returned.
        is_sort: If true, sort the result before returning it.
        is_ext: If false, strip everything from the last ``.`` onwards.

    Returns:
        A list of path strings.
    """
    file_list = []
    cnt = 0
    for root, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(extension):
                continue

            # Stop as soon as the requested number of paths was collected.
            if (amount is not None) and (cnt == amount):
                if is_sort:
                    file_list.sort()
                return file_list

            mix_path = os.path.join(root, file)
            # relpath rather than slicing off len(root_dir)+1 characters:
            # slicing drops the first character of the result whenever
            # root_dir already ends with a path separator.
            pure_path = os.path.relpath(mix_path, root_dir) if is_pure else mix_path

            # Substring filters.
            if (str_include is not None) and (str_include not in pure_path):
                continue
            if (str_exclude is not None) and (str_exclude in pure_path):
                continue

            if not is_ext:
                # Drop the last '.' and everything after it.
                pure_path = pure_path.rpartition('.')[0]

            file_list.append(pure_path)
            cnt += 1

    if is_sort:
        file_list.sort()
    return file_list
if __name__ == '__main__':
    # Input clips and the output manifest (one JSON object per line).
    root_dir = '../audiocraft/dataset/example/clip'
    path_jsonl = '../audiocraft/egs/example/data.jsonl'

    # Every .wav under root_dir whose path contains 'no_vocal', sorted.
    filelist = traverse_dir(
        root_dir,
        extension='wav',
        str_include='no_vocal',
        is_sort=True)
    num_files = len(filelist)

    with open(path_jsonl, "w") as manifest:
        for fidx, path_wave in enumerate(filelist):
            print(f'==={fidx}/{num_files}================')

            # tags.json is read from the same directory as the clip.
            clip_dir = os.path.dirname(path_wave)
            path_json = os.path.join(clip_dir, 'tags.json')

            wave_sr = librosa.get_samplerate(path_wave)
            print('path_wave:', path_wave)
            print('path_json:', path_json)

            with open(path_json, 'r') as tag_file:
                tags = json.load(tag_file)

            # The audio file and its tag file must agree on the sample rate.
            assert wave_sr == tags['sample_rate']

            # Copy the tagged fields; the remaining fields are written as null.
            record = {
                'path': tags['path'],
                'duration': tags['duration'],
                "sample_rate": tags['sample_rate'],
                "bpm": tags['bpm'],
                "amplitude": None,
                "weight": None,
                "info_path": None
            }
            manifest.write(json.dumps(record) + '\n')

    print('\n\n\n==================')
    print('num files:', num_files)