# from transformers.models.wav2vec2 import Wav2Vec2Model, Wav2Vec2FeatureExtractor
# import torchaudio
# import torch
# import torch.nn as nn
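
# NOTE: The imports above and the model-based implementation inside get_valence_score()
# are currently commented out; the function returns a fixed placeholder value instead
# of running wav2vec2-based inference.
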
def get_valence_score(file_path):
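    """Return a valence score for the audio file at file_path.

    The wav2vec2-based valence/arousal/dominance (VAD) inference below is commented
    out; until it is re-enabled, this function returns a fixed placeholder value.
    """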
    # class VADPredictor(nn.Module):
    #     """Model to predict VAD Scores"""
    #     def __init__(self, pretrained_model_name="facebook/wav2vec2-base-960h", freeze_feature_extractor=True):
    #         super(VADPredictor, self).__init__()
    #         self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name)
    #         if freeze_feature_extractor:
    #             for param in self.wav2vec2.feature_extractor.parameters():
    #                 param.requires_grad = False
    #         hidden_size = self.wav2vec2.config.hidden_size
    #         self.valence_layers = nn.Sequential(
    #             nn.Linear(hidden_size, 256),
    #             nn.ReLU(),
    #             nn.Dropout(0.3),
    #             nn.Linear(256, 64),
    #             nn.Linear(64, 1),
    #         )
    #         self.arousal_layers = nn.Sequential(
    #             nn.Linear(hidden_size, 256),
    #             nn.ReLU(),
    #             nn.Dropout(0.3),
    #             nn.Linear(256, 64),
    #             nn.Linear(64, 1),
    #         )
    #         self.dominance_layers = nn.Sequential(
    #             nn.Linear(hidden_size, 256),
    #             nn.ReLU(),
    #             nn.Dropout(0.3),
    #             nn.Linear(256, 64),
    #             nn.Linear(64, 1),
    #         )
    #
    #     def forward(self, input_values, attention_mask=None):
    #         outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
    #         last_hidden_state = outputs.last_hidden_state
    #         pooled_output = torch.mean(last_hidden_state, dim=1)
    #         valence = self.valence_layers(pooled_output)
    #         arousal = self.arousal_layers(pooled_output)
    #         dominance = self.dominance_layers(pooled_output)
    #         return {
    #             'valence': valence.squeeze(-1),
    #             'arousal': arousal.squeeze(-1),
    #             'dominance': dominance.squeeze(-1),
    #         }
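
    # # Instantiate the model, load the trained checkpoint on CPU, and set up the
    # # matching wav2vec2 feature extractor.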
    # model = VADPredictor()
    # model.load_state_dict(torch.load(r"D:\Intern\shankh\DUMP\vad_predictor_model.pt", map_location=torch.device("cpu")))
    # model.eval()
    # feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    # # Load and process audio
    # waveform, sr = torchaudio.load(file_path)
    # # Convert to mono
    # if waveform.shape[0] > 1:
    #     waveform = waveform.mean(dim=0, keepdim=True)
    # # Resample to 16000 Hz
    # if sr != 16000:
    #     resampler = torchaudio.transforms.Resample(sr, 16000)
    #     waveform = resampler(waveform)
    #     sr = 16000
    # # Normalize to [-1, 1]
    # waveform = waveform / waveform.abs().max()
    # # Parameters
    # segment_sec = 1
    # segment_samples = int(segment_sec * sr)
    # valence_scores = []
    # # Inference per segment
    # with torch.no_grad():
    #     for start in range(0, waveform.shape[1] - segment_samples + 1, segment_samples):
    #         segment = waveform[:, start:start + segment_samples]
    #         input_values = feature_extractor(segment.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values
    #         output = model(input_values)
    #         val = output['valence'].item()
    #         valence_scores.append(val)
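    # If re-enabled, valence_scores holds one value per 1-second segment; see
    # average_valence() below for one way to collapse them into a single score.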
    # Fixed placeholder while the model-based implementation above is disabled.
    valence_score = 5.0
    return valence_score
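

# A minimal sketch of the aggregation step the commented-out loop above would need if it
# were re-enabled: the loop collects one valence value per 1-second segment, while
# get_valence_score() is expected to return a single number. Averaging the segments is
# an assumption (the original code never aggregates them); the fallback mirrors the
# placeholder value used above.
def average_valence(segment_scores, fallback=5.0):
    """Collapse per-segment valence scores into one clip-level score."""
    if not segment_scores:  # e.g. clip shorter than one segment, or inference disabled
        return fallback
    return sum(segment_scores) / len(segment_scores)

# Example: average_valence([4.8, 5.3, 5.1]) -> approximately 5.07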