|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
"""
This script converts a filelist file, where each line contains the
<absolute path of wav file>, to a manifest json file.
Optionally post-processes the manifest file to create dev and train splits for speaker embedding
training, and optionally cuts each audio file into segments whose lengths are taken from DURATIONS,
writing those wav files under <CWD>/segments.

Args:
--filelist: path to file containing list of audio files
--manifest(optional): if you already have a manifest file, but would like to process it for creating
    segments and splitting, then use manifest, ignoring filelist
--id: index of speaker label in filename present in filelist file that is separated by '/'
--out: output manifest file name
--split: if you would want to split the manifest file for training purposes
    you may not need this for test set. output files are train.json and dev.json, written to the
    directory of <out>, defaults to False
--create_segments: if you would want to segment each manifest line into segments of DURATIONS sec (currently 3 sec)
    you may not need this for test set, defaults to False
--min_spkrs_count: min number of samples per speaker to consider and ignore otherwise, defaults to 0 (all speakers)
"""
					
						
						|  |  | 
					
						
						|  | import argparse | 
					
						
						|  | import json | 
					
						
						|  | import os | 
					
						
						|  | import random | 
					
						
						|  |  | 
					
						
						|  | import librosa as l | 
					
						
						|  | import numpy as np | 
					
						
						|  | import soundfile as sf | 
					
						
						|  | import sox | 
					
						
						|  | from sklearn.model_selection import StratifiedShuffleSplit | 
					
						
						|  | from tqdm.contrib.concurrent import process_map | 
					
						
						|  | from nemo.collections.asr.parts.utils.manifest_utils import read_manifest | 
					
						
						|  |  | 
					
						
# Seed Python's global RNG so anything drawing from `random` is repeatable between runs.
random.seed(42)

# Segment lengths (in seconds) used when cutting audio, kept in descending order.
DURATIONS = sorted([3], reverse=True)
# A segment is written only when its mean RMS is strictly above this value.
MIN_ENERGY = 0.01
# Working directory captured at import time; segments are written under <CWD>/segments.
CWD = os.getcwd()
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def filter_manifest_line(manifest_line): | 
					
						
						|  | split_manifest = [] | 
					
						
						|  | audio_path = manifest_line['audio_filepath'] | 
					
						
						|  | start = manifest_line.get('offset', 0) | 
					
						
						|  | dur = manifest_line['duration'] | 
					
						
						|  | label = manifest_line['label'] | 
					
						
						|  | endname = os.path.splitext(audio_path.split(label, 1)[-1])[0] | 
					
						
						|  | to_path = os.path.join(CWD, 'segments', label) | 
					
						
						|  | to_path = os.path.join(to_path, endname[1:]) | 
					
						
						|  | os.makedirs(os.path.dirname(to_path), exist_ok=True) | 
					
						
						|  |  | 
					
						
						|  | if dur >= min(DURATIONS): | 
					
						
						|  | signal, sr = sf.read(audio_path) | 
					
						
						|  | remaining_dur = dur - start | 
					
						
						|  |  | 
					
						
						|  | segments = DURATIONS.copy() | 
					
						
						|  | mode = int(remaining_dur // sum(DURATIONS)) | 
					
						
						|  | rem = remaining_dur % sum(DURATIONS) | 
					
						
						|  | segments = mode * segments | 
					
						
						|  |  | 
					
						
						|  | for val in DURATIONS: | 
					
						
						|  | if rem >= val: | 
					
						
						|  | segments.append(val) | 
					
						
						|  | rem = rem - val | 
					
						
						|  |  | 
					
						
						|  | for temp_dur in segments: | 
					
						
						|  | segment_audio = signal[int(start * sr) : int(start * sr + temp_dur * sr)] | 
					
						
						|  | if l.feature.rms(y=segment_audio).mean() > MIN_ENERGY: | 
					
						
						|  | final_string = '_' + str(start) + '_' + str(temp_dur) | 
					
						
						|  | final_string = final_string.replace('.', '-') | 
					
						
						|  | to_file = to_path + final_string + '.wav' | 
					
						
						|  |  | 
					
						
						|  | c_start = int(float(start * sr)) | 
					
						
						|  | c_end = c_start + int(float(temp_dur * sr)) | 
					
						
						|  | segment = signal[c_start:c_end] | 
					
						
						|  | sf.write(to_file, segment, sr) | 
					
						
						|  |  | 
					
						
						|  | meta = manifest_line.copy() | 
					
						
						|  | meta['audio_filepath'] = to_file | 
					
						
						|  | meta['offset'] = 0 | 
					
						
						|  | meta['duration'] = temp_dur | 
					
						
						|  | split_manifest.append(meta) | 
					
						
						|  |  | 
					
						
						|  | start = start + temp_dur | 
					
						
						|  |  | 
					
						
						|  | return split_manifest | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def count_and_consider_only(speakers, lines, min_count=10): | 
					
						
						|  | """ | 
					
						
						|  | consider speakers only if samples per speaker is at least min_count | 
					
						
						|  | """ | 
					
						
						|  | uniq_speakers, indices, counts = np.unique(speakers, return_index=True, return_counts=True) | 
					
						
						|  | print("speaker count before filtering minimum number of speaker counts: ", len(uniq_speakers)) | 
					
						
						|  | required_speakers = {} | 
					
						
						|  | for idx, count in enumerate(counts): | 
					
						
						|  | if count >= min_count: | 
					
						
						|  | required_speakers[uniq_speakers[idx]] = count | 
					
						
						|  |  | 
					
						
						|  | print("speaker count after filtering minimum number of speaker counts: ", len(required_speakers)) | 
					
						
						|  | required_lines = [] | 
					
						
						|  | speakers_only = [] | 
					
						
						|  | for idx, speaker in enumerate(speakers): | 
					
						
						|  | if speaker in required_speakers: | 
					
						
						|  | required_lines.append(lines[idx]) | 
					
						
						|  | speakers_only.append(speaker) | 
					
						
						|  |  | 
					
						
						|  | return speakers_only, required_lines | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def write_file(name, lines, idx): | 
					
						
						|  | with open(name, 'w', encoding='utf-8') as fout: | 
					
						
						|  | for i in idx: | 
					
						
						|  | dic = lines[i] | 
					
						
						|  | json.dump(dic, fout) | 
					
						
						|  | fout.write('\n') | 
					
						
						|  | print("wrote", name) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def read_file(filelist, id=-1): | 
					
						
						|  | json_lines = [] | 
					
						
						|  | with open(filelist, 'r') as fo: | 
					
						
						|  | lines = fo.readlines() | 
					
						
						|  | lines = sorted(lines) | 
					
						
						|  | for line in lines: | 
					
						
						|  | line = line.strip() | 
					
						
						|  | speaker = line.split('/')[id] | 
					
						
						|  | speaker = list(speaker) | 
					
						
						|  | speaker = ''.join(speaker) | 
					
						
						|  | meta = {"audio_filepath": line, "offset": 0, "duration": None, "label": speaker} | 
					
						
						|  | json_lines.append(meta) | 
					
						
						|  | return json_lines | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_duration(json_line): | 
					
						
						|  | dur = json_line['duration'] | 
					
						
						|  | if dur is None: | 
					
						
						|  | wav_path = json_line['audio_filepath'] | 
					
						
						|  | json_line['duration'] = sox.file_info.duration(wav_path) | 
					
						
						|  | return json_line | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_labels(lines): | 
					
						
						|  | labels = [] | 
					
						
						|  | for line in lines: | 
					
						
						|  | label = line['label'] | 
					
						
						|  | labels.append(label) | 
					
						
						|  | return labels | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def main(filelist, manifest, id, out, split=False, create_segments=False, min_count=10): | 
					
						
						|  | if os.path.exists(out): | 
					
						
						|  | os.remove(out) | 
					
						
						|  | if filelist: | 
					
						
						|  | lines = read_file(filelist=filelist, id=id) | 
					
						
						|  | lines = process_map(get_duration, lines, chunksize=100) | 
					
						
						|  | out_file = os.path.splitext(filelist)[0] + '_manifest.json' | 
					
						
						|  | write_file(out_file, lines, range(len(lines))) | 
					
						
						|  | else: | 
					
						
						|  | lines = read_manifest(manifest) | 
					
						
						|  |  | 
					
						
						|  | lines = process_map(get_duration, lines, chunksize=100) | 
					
						
						|  |  | 
					
						
						|  | if create_segments: | 
					
						
						|  | print(f"creating and writing segments to {CWD}") | 
					
						
						|  | lines = process_map(filter_manifest_line, lines, chunksize=100) | 
					
						
						|  | temp = [] | 
					
						
						|  | for line in lines: | 
					
						
						|  | temp.extend(line) | 
					
						
						|  | del lines | 
					
						
						|  | lines = temp | 
					
						
						|  |  | 
					
						
						|  | speakers = [x['label'] for x in lines] | 
					
						
						|  |  | 
					
						
						|  | if min_count: | 
					
						
						|  | speakers, lines = count_and_consider_only(speakers, lines, abs(min_count)) | 
					
						
						|  |  | 
					
						
						|  | write_file(out, lines, range(len(lines))) | 
					
						
						|  | path = os.path.dirname(out) | 
					
						
						|  | if split: | 
					
						
						|  | speakers = [x['label'] for x in lines] | 
					
						
						|  | sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42) | 
					
						
						|  | for train_idx, test_idx in sss.split(speakers, speakers): | 
					
						
						|  | print("number of train samples after split: ", len(train_idx)) | 
					
						
						|  |  | 
					
						
						|  | out = os.path.join(path, 'train.json') | 
					
						
						|  | write_file(out, lines, train_idx) | 
					
						
						|  | out = os.path.join(path, 'dev.json') | 
					
						
						|  | write_file(out, lines, test_idx) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if __name__ == "__main__": | 
					
						
						|  | parser = argparse.ArgumentParser() | 
					
						
						|  | parser.add_argument("--filelist", help="path to filelist file", type=str, required=False, default=None) | 
					
						
						|  | parser.add_argument("--manifest", help="manifest file name", type=str, required=False, default=None) | 
					
						
						|  | parser.add_argument( | 
					
						
						|  | "--id", | 
					
						
						|  | help="field num seperated by '/' to be considered as speaker label from filelist file, can be ignored if manifest file is already provided with labels", | 
					
						
						|  | type=int, | 
					
						
						|  | required=False, | 
					
						
						|  | default=None, | 
					
						
						|  | ) | 
					
						
						|  | parser.add_argument("--out", help="manifest_file name", type=str, required=True) | 
					
						
						|  | parser.add_argument( | 
					
						
						|  | "--split", | 
					
						
						|  | help="bool if you would want to split the manifest file for training purposes", | 
					
						
						|  | required=False, | 
					
						
						|  | action='store_true', | 
					
						
						|  | ) | 
					
						
						|  | parser.add_argument( | 
					
						
						|  | "--create_segments", | 
					
						
						|  | help="bool if you would want to segment each manifest line to segments of 4 sec or less", | 
					
						
						|  | required=False, | 
					
						
						|  | action='store_true', | 
					
						
						|  | ) | 
					
						
						|  | parser.add_argument( | 
					
						
						|  | "--min_spkrs_count", | 
					
						
						|  | default=0, | 
					
						
						|  | type=int, | 
					
						
						|  | help="min number of samples per speaker to consider and ignore otherwise", | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | args = parser.parse_args() | 
					
						
						|  |  | 
					
						
						|  | main( | 
					
						
						|  | args.filelist, args.manifest, args.id, args.out, args.split, args.create_segments, args.min_spkrs_count, | 
					
						
						|  | ) | 
					
						
						|  |  |