| | import os |
| | import torch |
| | import numpy as np |
| | import librosa |
| | from torch import nn |
| | import torch.nn.functional as F |
| |
|
| | |
def extract_mfcc_and_pitch(audio_path, sr=16000, n_mfcc=40):
    """Extract combined MFCC and pitch features from an audio file.

    Parameters
    ----------
    audio_path : str
        Path to an audio file readable by librosa.
    sr : int
        Target sample rate the audio is resampled to (default 16000).
    n_mfcc : int
        Number of MFCC coefficients to extract (default 40).

    Returns
    -------
    np.ndarray
        Array of shape (n_mfcc + 1, n_frames): globally normalized MFCCs
        stacked on top of the normalized pitch track.
    """
    audio, sr = librosa.load(audio_path, sr=sr)

    # MFCC features: shape (n_mfcc, n_frames).
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Global mean/std normalization. The small epsilon guards against a
    # zero std (e.g. silent or constant audio), which would otherwise
    # produce NaN/inf features.
    mfcc = (mfcc - np.mean(mfcc)) / (np.std(mfcc) + 1e-8)

    # Fundamental frequency (pitch) track via the YIN algorithm,
    # restricted to the C2..C6 range.
    pitch = librosa.yin(audio, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C6'))
    # Replace any NaN values with the mean of the valid values.
    pitch = np.nan_to_num(pitch, nan=np.nanmean(pitch))

    # Same zero-std guard as for the MFCCs.
    pitch = (pitch - np.mean(pitch)) / (np.std(pitch) + 1e-8)

    # (n_frames,) -> (1, n_frames) so it can be stacked under the MFCCs.
    pitch = pitch.reshape(1, -1)

    # NOTE(review): assumes the YIN frame count matches the MFCC frame
    # count under the default hop settings — np.vstack raises otherwise.
    combined_features = np.vstack([mfcc, pitch])

    return combined_features
| |
|
| | |
class XVectorNet(nn.Module):
    """x-vector style network: TDNN frame layers, statistics pooling,
    then segment-level fully-connected layers with a 2-class output.

    Input is (batch, input_dim, time); output is (batch, 2) logits.
    Attribute names are part of the checkpoint format (state_dict keys)
    and must not be renamed.
    """

    def __init__(self, input_dim=41, dropout_rate=0.45):
        super().__init__()

        # Frame-level TDNN layers (1-D convolutions over time).
        self.layer1 = nn.Conv1d(input_dim, 512, 5, padding=2)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.layer2 = nn.Conv1d(512, 512, 3, padding=1)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.layer3 = nn.Conv1d(512, 512, 3, padding=1)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.layer4 = nn.Conv1d(512, 512, 1)
        self.dropout4 = nn.Dropout(dropout_rate)
        self.layer5 = nn.Conv1d(512, 1500, 1)

        # Collapses the time axis into per-channel mean and std.
        self.stats_pooling = StatsPooling()

        # Segment-level classifier head (1500 channels -> mean+std = 3000).
        self.layer6 = nn.Linear(3000, 512)
        self.dropout6 = nn.Dropout(dropout_rate)
        self.layer7 = nn.Linear(512, 512)
        self.dropout7 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(512, 2)

    def forward(self, x):
        # Frame-level feature extraction: conv -> ReLU -> dropout, four
        # times, then a final conv -> ReLU without dropout.
        frame_stack = (
            (self.layer1, self.dropout1),
            (self.layer2, self.dropout2),
            (self.layer3, self.dropout3),
            (self.layer4, self.dropout4),
        )
        for conv, drop in frame_stack:
            x = drop(F.relu(conv(x)))
        x = F.relu(self.layer5(x))

        # Pool over time to a fixed-size segment embedding.
        x = self.stats_pooling(x)

        # Segment-level classification head.
        x = self.dropout6(F.relu(self.layer6(x)))
        x = self.dropout7(F.relu(self.layer7(x)))
        return self.output(x)
| |
|
class StatsPooling(nn.Module):
    """Statistics pooling: summarize the time axis by mean and std.

    Maps (batch, channels, time) -> (batch, 2 * channels), with the
    per-channel means followed by the per-channel standard deviations.
    """

    def forward(self, x):
        stats = (x.mean(dim=2), x.std(dim=2))
        return torch.cat(stats, dim=1)
| |
|
| | |
def load_model(model_path, input_dim=41, dropout_rate=0.45, map_location=None):
    """Load a trained XVectorNet checkpoint and switch it to eval mode.

    Parameters
    ----------
    model_path : str
        Path to a state_dict saved with torch.save.
    input_dim : int
        Feature dimension the network was trained with (default 41).
    dropout_rate : int | float
        Dropout rate used when building the network (inactive in eval mode).
    map_location : str | torch.device | None
        Passed to torch.load; e.g. 'cpu' lets a GPU-trained checkpoint
        load on a CPU-only machine. None keeps torch's default behavior.

    Returns
    -------
    XVectorNet
        The model with weights loaded, in eval mode (dropout disabled).
    """
    model = XVectorNet(input_dim=input_dim, dropout_rate=dropout_rate)
    # Without map_location, a checkpoint saved on CUDA fails to load on a
    # CPU-only host; callers can now pass map_location='cpu'.
    state_dict = torch.load(model_path, map_location=map_location)
    model.load_state_dict(state_dict)
    model.eval()
    return model
| |
|
| | |
def inference(model, audio_path, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Classify a single audio file with the given model.

    Returns a tuple (predicted_class, probability), where probability is
    always the softmax score of class 1 (the positive class), regardless
    of which class was predicted.
    """
    # Feature matrix (feat_dim, time) -> batch tensor (1, feat_dim, time).
    feats = extract_mfcc_and_pitch(audio_path)
    batch = torch.FloatTensor(feats).unsqueeze(0).to(device)

    # No gradients needed for inference.
    with torch.no_grad():
        logits = model(batch)
        probs = F.softmax(logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()

    return pred, probs[:, 1].item()
| |
|
| | |
def main_inference(model_path, audio_folder):
    """Run inference over every .wav file in a folder and print results.

    Parameters
    ----------
    model_path : str
        Path to the trained model checkpoint (state_dict).
    audio_folder : str
        Directory containing .wav files to classify.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = load_model(model_path).to(device)

    # Case-insensitive extension match so '.WAV' files are not silently
    # skipped; sorted so the output order is deterministic (os.listdir
    # order is platform-dependent).
    wav_files = sorted(
        f for f in os.listdir(audio_folder) if f.lower().endswith('.wav')
    )

    for wav_file in wav_files:
        audio_path = os.path.join(audio_folder, wav_file)
        predicted_class, probability = inference(model, audio_path, device)
        print(f"File: {wav_file}, Predicted Class: {predicted_class}, Probability: {probability:.4f}")
| |
|
if __name__ == "__main__":
    # Checkpoint produced by training (best model overall).
    model_path = 'output/best_overall_model.pth'

    # Directory of .wav files to run inference on.
    audio_folder = '/path/to/folder/test'

    main_inference(model_path, audio_folder)