import numpy as np
import random
import os
import glob
import json


def write_notes_file(file_name, text):
    """Append a single line of text to a notes file (creates it if missing)."""
    with open(file_name, 'a') as da:
        da.write(text + '\n')


def get_blank_dataset_dict(dataset_name, is_test, ann_path, wav_path):
    """Return a fresh dataset descriptor dict with `is_binary` defaulting to False."""
    ddict = {'dataset_name': dataset_name, 'is_test': is_test, 'is_binary': False,
             'ann_path': ann_path, 'wav_path': wav_path}
    return ddict


def get_short_class_names(class_names, str_len=3):
    """Abbreviate each word of every class name to its first `str_len` characters.

    e.g. 'Myotis daubentonii' -> 'Myo dau' (with the default str_len=3).
    """
    return [' '.join(sp[:str_len] for sp in cc.split(' ')) for cc in class_names]


def remove_dupes(data_train, data_test):
    """Drop train items whose 'id' also appears in the test set.

    Prints how many items were removed (if any) and returns the pruned train list.
    """
    # Use a set for O(1) membership tests instead of scanning a list per item.
    test_ids = {dd['id'] for dd in data_test}
    data_train_prune = [aa for aa in data_train if aa['id'] not in test_ids]
    diff = len(data_train) - len(data_train_prune)
    if diff != 0:
        print(diff, 'items removed from train set')
    return data_train_prune


def get_genus_mapping(class_names):
    """Map species names to genera (the first word of each name).

    Returns (unique genus names, index into that list for each input class).
    """
    genus_names, genus_mapping = np.unique(
        [cc.split(' ')[0] for cc in class_names], return_inverse=True)
    return genus_names.tolist(), genus_mapping.tolist()


def standardize_low_freq(data, class_of_interest):
    """Standardize annotation frequency bounds for one class, in place.

    Addresses the issue of highly variable low frequency annotations, which
    often happens for constant frequency calls. For the class of interest
    sets the low freq of every annotation to the dataset mean; the high freq
    is only bumped up to the mean high when it would fall below the new low.
    Mutates `data` and also returns it.
    """
    low_freqs = []
    high_freqs = []
    for dd in data:
        for aa in dd['annotation']:
            if aa['class'] == class_of_interest:
                low_freqs.append(aa['low_freq'])
                high_freqs.append(aa['high_freq'])

    low_mean = np.mean(low_freqs)
    high_mean = np.mean(high_freqs)
    # Sanity check: the logic below assumes low_mean < high_mean.
    assert(low_mean < high_mean)

    print('\nStandardizing low and high frequency for:')
    print(class_of_interest)
    print('low: ', round(low_mean, 2))
    print('high: ', round(high_mean, 2))

    # only set the low freq, high stays the same unless it would be invalid
    for dd in data:
        for aa in dd['annotation']:
            if aa['class'] == class_of_interest:
                aa['low_freq'] = low_mean
                if aa['high_freq'] < low_mean:
                    aa['high_freq'] = high_mean

    return data


def load_set_of_anns(data, classes_to_ignore=None, events_of_interest=None,
                     convert_to_genus=False, verbose=True, list_of_anns=False,
                     filter_issues=False, name_replace=False):
    """Load and clean a set of annotations.

    Args:
        data: either a list of dataset dicts (each with 'ann_path'/'wav_path')
            or, when `list_of_anns` is True, a single dict pointing at a
            directory of per-file json annotations.
        classes_to_ignore: class names excluded from the class statistics.
        events_of_interest: if given, keep only annotations whose 'event' is
            in this collection (i.e. types of calls).
        convert_to_genus: reduce every class name to its genus (first word).
        verbose: print per-class counts.
        list_of_anns: treat `data` as a path pair to individual json files.
        filter_issues: drop files flagged with annotation issues.
        name_replace: optional dict mapping input class name to output name.

    Returns:
        `anns` alone when `classes_to_ignore` is empty/None, otherwise
        `(anns, class_names, class_inv_freq)`.
    """
    # Avoid the mutable-default-argument pitfall; None behaves like [].
    if classes_to_ignore is None:
        classes_to_ignore = []

    # load the annotations
    anns = []
    if list_of_anns:
        # path to list of individual json files
        anns.extend(load_anns_from_path(data['ann_path'], data['wav_path']))
    else:
        # dictionary of datasets
        for dd in data:
            anns.extend(load_anns(dd['ann_path'], dd['wav_path']))

    # discarding unannotated files
    anns = [aa for aa in anns if aa['annotated'] is True]

    # filter files that have annotation issues - if the input is a dictionary of
    # datasets, this will likely have already been done
    if filter_issues:
        anns = [aa for aa in anns if aa['issues'] is False]

    # check for some basic formatting errors with class names
    for ann in anns:
        for aa in ann['annotation']:
            aa['class'] = aa['class'].strip()

    # only load specified events - i.e. types of calls
    if events_of_interest is not None:
        for ann in anns:
            ann['annotation'] = [aa for aa in ann['annotation']
                                 if aa['event'] in events_of_interest]

    # change class names
    # name_replace will be a dictionary mapping input name to output
    if isinstance(name_replace, dict):
        for ann in anns:
            for aa in ann['annotation']:
                if aa['class'] in name_replace:
                    aa['class'] = name_replace[aa['class']]

    # convert everything to genus name
    if convert_to_genus:
        for ann in anns:
            for aa in ann['annotation']:
                aa['class'] = aa['class'].split(' ')[0]

    # get unique class names
    class_names_all = []
    for ann in anns:
        for aa in ann['annotation']:
            if aa['class'] not in classes_to_ignore:
                class_names_all.append(aa['class'])

    class_names, class_cnts = np.unique(class_names_all, return_counts=True)
    # inverse-frequency weights, useful for balancing losses over rare classes
    class_inv_freq = (class_cnts.sum() /
                      (len(class_names) * class_cnts.astype(np.float32)))

    if verbose:
        print('Class count:')
        str_len = np.max([len(cc) for cc in class_names]) + 5
        for cc in range(len(class_names)):
            print(str(cc).ljust(5) + class_names[cc].ljust(str_len) + str(class_cnts[cc]))

    if len(classes_to_ignore) == 0:
        return anns
    else:
        return anns, class_names.tolist(), class_inv_freq.tolist()


def load_anns(ann_file_name, raw_audio_dir):
    """Load a list of annotations from one json file, attaching audio paths.

    NOTE(review): paths are built by simple concatenation, so `raw_audio_dir`
    is expected to end with a separator — confirm against callers.
    """
    with open(ann_file_name) as da:
        anns = json.load(da)

    for aa in anns:
        aa['file_path'] = raw_audio_dir + aa['id']

    return anns


def load_anns_from_path(ann_file_dir, raw_audio_dir):
    """Load one annotation per json file found in `ann_file_dir`.

    NOTE(review): like `load_anns`, assumes the directory strings already end
    with a path separator.
    """
    files = glob.glob(ann_file_dir + '*.json')
    anns = []
    for ff in files:
        with open(ff) as da:
            ann = json.load(da)
        ann['file_path'] = raw_audio_dir + ann['id']
        anns.append(ann)

    return anns


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        # val: most recent value; avg: running mean; sum/count: accumulators
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record `val` observed `n` times and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count