"""Zero-shot speech emotion classification demo with a CLAP checkpoint.

Loads a checkpoint, encodes one waveform together with a list of candidate
text labels, and prints the candidate whose similarity score is highest.
"""
import os

import librosa
import torch
from transformers import AutoTokenizer
from transformers import logging

from models_xin import CLAP
from utils import compute_similarity
from wrapper import EvalWrapper


def main():
    """Run the demo: load the model, score the candidates, print the winner."""
    logging.set_verbosity_error()

    # NOTE: `url` is a placeholder and must be filled in before running.
    # With check_hash=True, torch.hub expects the file name to follow the
    # `<name>-<sha256 prefix>.<ext>` convention.
    ckpt = torch.hub.load_state_dict_from_url(
        url="",
        map_location="cpu",
        check_hash=True,
    )

    text_model = 'bert-base-uncased'
    audio_model = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Feel free to adapt the candidate labels to your needs.
    candidates = ['happy', 'sad', 'surprise', 'angry']

    # Path to a single-channel waveform; librosa resamples it to 16 kHz.
    wavpath = '[Waveform path]'
    waveform, _sample_rate = librosa.load(wavpath, sr=16000)
    x = torch.as_tensor(waveform, dtype=torch.float32)

    tokenizer = AutoTokenizer.from_pretrained(text_model)
    candidate_tokens = tokenizer.batch_encode_plus(
        candidates,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    model = CLAP(
        speech_name=audio_model,
        text_name=text_model,
        embedding_dim=768,
    )
    model.load_state_dict(ckpt)
    print('Checkpoint is loaded')

    # The model and *all* of its inputs must live on the same device;
    # previously only the waveform was moved, which breaks on CUDA machines.
    model.to(device)
    model.eval()

    with torch.no_grad():
        z = model(
            x.unsqueeze(0).to(device),  # add a leading batch dimension
            candidate_tokens.to(device),
        )
        # NOTE(review): the meaning/order of the entries of `z` is defined by
        # CLAP.forward and compute_similarity, which are not visible here --
        # the argument order is kept exactly as in the original script.
        similarity = compute_similarity(z[2], z[0], z[1])
        prediction = similarity.T.argmax(dim=1)

    # A single waveform is scored, so `prediction` holds one index; convert it
    # to a plain int explicitly instead of relying on tensor.__index__.
    result = candidates[prediction.item()]
    print(result)


if __name__ == '__main__':
    main()