import gradio as gr
import numpy as np
import librosa
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import AutoFeatureExtractor, TFWav2Vec2Model
from sklearnex import patch_sklearn

patch_sklearn()  # Enable the Intel Extension for Scikit-learn optimizations.
import xgboost as xgb

MAX_DURATION = 2  # Maximum audio duration in seconds.
SAMPLING_RATE = 16000  # Number of audio samples recorded every second.
BATCH_SIZE = 2  # Batch size used when training and evaluating the model.
NUM_CLASSES = 8  # Number of classes in our dataset (8 for RAVDESS).
HIDDEN_DIM = 768  # Dimension of the model output (768 for Wav2Vec 2.0 - Base).
MAX_SEQ_LENGTH = MAX_DURATION * SAMPLING_RATE  # Maximum length of the input audio, in samples.
# Wav2Vec 2.0 emits one output frame roughly every 20 ms, so a 2 s clip
# produces 99 frames.
MAX_FRAMES = 99
MAX_EPOCHS = 5  # Maximum number of training epochs (unused at inference time).
RAVDESS_CLASS_LABELS = ("angry", "calm", "disgust", "fear", "happy", "neutral", "sad", "surprise")
MODEL_CHECKPOINT = "facebook/wav2vec2-base"

labels = RAVDESS_CLASS_LABELS
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label


def mean_pool(hidden_states, feature_lengths):
    # Mask marking the valid (non-padding) output frames of each clip.
    attention_mask = tf.sequence_mask(
        feature_lengths, maxlen=MAX_FRAMES, dtype=tf.dtypes.int64
    )
    padding_mask = tf.cast(
        tf.reverse(tf.cumsum(tf.reverse(attention_mask, [-1]), -1), [-1]),
        dtype=tf.dtypes.bool,
    )
    # Zero out the padded frames so they do not contribute to the mean.
    hidden_states = tf.where(
        tf.broadcast_to(tf.expand_dims(~padding_mask, -1), tf.shape(hidden_states)),
        0.0,
        hidden_states,
    )
    # Sum over the time axis and divide by the number of valid frames.
    pooled_state = tf.math.reduce_sum(hidden_states, axis=1) / tf.reshape(
        tf.math.reduce_sum(tf.cast(padding_mask, dtype=tf.dtypes.float32), axis=1),
        [-1, 1],
    )
    return pooled_state


class TFWav2Vec2ForAudioClassification(keras.Model):
    """Wav2Vec 2.0 encoder plus mean pooling, producing a fixed-size
    embedding; the XGBoost model below acts as the classification head."""

    def __init__(self, model_checkpoint):
        super().__init__()
        # Instantiate the Wav2Vec 2.0 model without the classification head.
        self.wav2vec2 = TFWav2Vec2Model.from_pretrained(
            model_checkpoint, apply_spec_augment=False, from_pt=True
        )
        self.pooling = layers.GlobalAveragePooling1D()
        self.flat = layers.Flatten()
        self.intermediate_layer_dropout = layers.Dropout(0.5)

    def call(self, inputs):
        # inputs[0]: padded raw audio, inputs[1]: sample-level attention mask.
        hidden_states = self.wav2vec2(inputs[0])[0]
        if tf.is_tensor(inputs[1]):
            # Number of valid audio samples per clip...
            audio_lengths = tf.cumsum(inputs[1], -1)[:, -1]
            # ...converted to the number of valid encoder output frames.
            feature_lengths = self.wav2vec2.wav2vec2._get_feat_extract_output_lengths(
                audio_lengths
            )
            pooled_state = mean_pool(hidden_states, feature_lengths)
        else:
            # No attention mask provided: fall back to plain average pooling.
            pooled_state = self.pooling(hidden_states)
        intermediate_state = self.flat(self.intermediate_layer_dropout(pooled_state))
        return intermediate_state


wav2vec2_model = TFWav2Vec2ForAudioClassification(MODEL_CHECKPOINT)

feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_CHECKPOINT, return_attention_mask=True
)

# Parameters for the XGBoost head. Note that load_model() below restores the
# trained booster's own configuration, so these mainly document intent.
xgb_params = {
    'objective': 'multi:softmax',  # 8-way classification over RAVDESS_CLASS_LABELS.
    'predictor': 'cpu_predictor',
    'disable_default_eval_metric': 'true',
}
model_xgb = xgb.XGBClassifier(**xgb_params)
model_xgb.load_model('xgb.json')


def classify_emotion(audio):
    # Gradio's audio input delivers a (sample_rate, np.ndarray) tuple.
    sr, data = audio
    data = data.astype(np.float32)
    if data.ndim > 1:
        data = data.mean(axis=1)  # Downmix stereo to mono.
    if sr != SAMPLING_RATE:
        # Browser recordings are usually 44.1/48 kHz; resample to 16 kHz.
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLING_RATE)
    inp = feature_extractor(
        data,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
        padding="max_length",  # Pad to MAX_SEQ_LENGTH so the encoder emits MAX_FRAMES frames.
    )
    # Extract the Wav2Vec 2.0 embedding, then classify it with XGBoost.
    embedding = wav2vec2_model.predict(
        [np.array(inp["input_values"]), np.array(inp["attention_mask"])]
    )
    pred = model_xgb.predict(embedding)
    return id2label[str(pred[0])]


iface = gr.Interface(fn=classify_emotion, inputs="audio", outputs="text")
iface.launch()
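
# For reference, a minimal sketch of how the 'xgb.json' head loaded above is
# assumed to have been produced offline; this is illustrative only and is not
# executed by the demo. 'train_features' (a (num_samples, HIDDEN_DIM) array of
# pooled Wav2Vec 2.0 embeddings from the model above) and 'train_label_ids'
# (integer ids following label2id) are hypothetical placeholders:
#
#   clf = xgb.XGBClassifier(objective="multi:softmax")
#   clf.fit(train_features, train_label_ids)
#   clf.save_model("xgb.json")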