""" This is an example using CLAP to perform zeroshot classification on ESC50 (https://github.com/karolpiczak/ESC-50). """ from CLAPWrapper import CLAPWrapper from esc50_dataset import ESC50 import torch.nn.functional as F import numpy as np from tqdm import tqdm from sklearn.metrics import accuracy_score # Load dataset dataset = ESC50(root="data_path", download=False) prompt = 'this is a sound of ' y = [prompt + x for x in dataset.classes] # Load and initialize CLAP weights_path = "weights_path" clap_model = CLAPWrapper(weights_path, use_cuda=False) # Computing text embeddings text_embeddings = clap_model.get_text_embeddings(y) # Computing audio embeddings y_preds, y_labels = [], [] for i in tqdm(range(len(dataset))): x, _, one_hot_target = dataset.__getitem__(i) audio_embeddings = clap_model.get_audio_embeddings([x], resample=True) similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings) y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy() y_preds.append(y_pred) y_labels.append(one_hot_target.detach().cpu().numpy()) y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0) acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1)) print('ESC50 Accuracy {}'.format(acc)) """ The output: ESC50 Accuracy: 82.6% """