import re
import glob
import pickle
import os

import torch
import numpy as np
import gradio as gr

from utils.audio import load_spectrograms
from utils.compute_args import compute_args
from utils.tokenize import tokenize, create_dict, sent_to_ix, cmumosei_2, cmumosei_7, pad_feature
from model_LA import Model_LA

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load model
ckpts_path = 'ckpt'
model_name = "Model_LA_e"

# List checkpoints, sorted so the most recent "best*" file comes first
ckpts = sorted(glob.glob(os.path.join(ckpts_path, model_name, 'best*')), reverse=True)

# Load the original training args stored in the checkpoint
args = torch.load(ckpts[0], map_location=torch.device(device))['args']
args = compute_args(args)
pretrained_emb = np.load("train_glove.npy")
token_to_ix = pickle.load(open("token_to_ix.pkl", "rb"))
state_dict = torch.load(ckpts[0], map_location=torch.device(device))['state_dict']

net = Model_LA(args, len(token_to_ix), pretrained_emb).to(device)
net.load_state_dict(state_dict)


def inference(source_video, transcription):
    # Data preprocessing
    # Text: lowercase, strip punctuation, and split hyphens/slashes into spaces
    def clean(w):
        return re.sub(r"([.,'!?\"()*#:;])", '', w.lower()).replace('-', ' ').replace('/', ' ')

    s = [clean(w) for w in transcription.split() if clean(w) != '']

    # Sound: extract mel and magnitude spectrograms from the video's audio track
    _, mel, mag = load_spectrograms(source_video)

    # Pad/truncate each modality to the sequence lengths the model was trained with
    l_max_len = args.lang_seq_len
    a_max_len = args.audio_seq_len
    v_max_len = args.video_seq_len
    L = sent_to_ix(s, token_to_ix, max_token=l_max_len)
    A = pad_feature(mel, a_max_len)
    V = pad_feature(mel, v_max_len)  # the mel spectrogram is reused for the video branch

    # Print shapes
    print(f"Processed text shape from {len(s)} to {L.shape}")
    print(f"Processed audio shape from {mel.shape} to {A.shape}")
    print(f"Processed video shape from {mel.shape} to {V.shape}")

    # Add a batch dimension and run the model in evaluation mode
    net.train(False)
    x = np.expand_dims(L, axis=0)
    y = np.expand_dims(A, axis=0)
    z = np.expand_dims(V, axis=0)
    x, y, z = torch.from_numpy(x).to(device), torch.from_numpy(y).to(device), torch.from_numpy(z).float().to(device)
    pred = net(x, y, z).cpu().data.numpy()[0]
    pred = np.exp(pred) / np.sum(np.exp(pred))  # softmax

    # Map class scores to emotion labels
    label_to_ix = ['happy', 'sad', 'angry', 'fear', 'disgust', 'surprise']
    result_dict = {label_to_ix[i]: float(pred[i]) for i in range(len(label_to_ix))}
    return result_dict


title = "Emotion Recognition"
description = ""

examples = [
    ['examples/0h-zjBukYpk_2.mp4', "NOW IM NOT EVEN GONNA SUGAR COAT THIS THIS MOVIE FRUSTRATED ME TO SUCH AN EXTREME EXTENT THAT I WAS LOUDLY EXCLAIMING WHY AT THE END OF THE FILM"],
    ['examples/0h-zjBukYpk_19.mp4', "NOW OTHER PERFORMANCES ARE BORDERLINE"],
    ['examples/03bSnISJMiM_1.mp4', "IT WAS REALLY GOOD "],
    ['examples/03bSnISJMiM_5.mp4', "AND THEY SHOULDVE I GUESS "],
]

gr.Interface(inference,
             inputs=[gr.inputs.Video(type="avi", source="upload"), "text"],
             outputs=["label"],
             title=title,
             description=description,
             examples=examples
             ).launch(debug=True)