# Source: litagin — "Upload 11 files" (commit d485dcb, verified)
import torch
from torch import nn
# Model definition
class AudioClassifier(nn.Module):
    """MLP classifier over fixed-size audio embedding vectors.

    Architecture: Linear -> BatchNorm1d -> Mish -> Dropout at ``hidden_dim``
    width, repeated ``num_hidden_layers`` more times, followed by a final
    linear projection to one logit per class.
    """

    def __init__(
        self,
        label2id: dict,
        feature_dim=256,
        hidden_dim=256,
        device="cpu",
        dropout_rate=0.5,
        num_hidden_layers=2,
    ):
        """
        Args:
            label2id: mapping from class label to integer id; ids are assumed
                to be 0..len(label2id)-1 (used to build the inverse mapping).
            feature_dim: dimensionality of the input embedding.
            hidden_dim: width of every hidden block.
            device: device string used when converting features at inference.
            dropout_rate: dropout probability after each hidden block.
            num_hidden_layers: number of extra hidden blocks after the first.
        """
        super().__init__()
        self.num_classes = len(label2id)
        self.device = device
        self.label2id = label2id
        self.id2label = {v: k for k, v in self.label2id.items()}
        # First projection from the raw feature space into the hidden space.
        self.fc1 = nn.Sequential(
            nn.Linear(feature_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Mish(),
            nn.Dropout(dropout_rate),
        )
        # Additional hidden blocks (hidden_dim -> hidden_dim).
        self.hidden_layers = nn.ModuleList(
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.Mish(),
                nn.Dropout(dropout_rate),
            )
            for _ in range(num_hidden_layers)
        )
        # Final classification head producing raw logits.
        self.fc_last = nn.Linear(hidden_dim, self.num_classes)

    def forward(self, x):
        """Return raw class logits.

        Args:
            x: tensor of shape (batch, feature_dim).
        Returns:
            Tensor of shape (batch, num_classes).
        """
        x = self.fc1(x)
        for layer in self.hidden_layers:
            x = layer(x)
        return self.fc_last(x)

    def infer_from_features(self, features):
        """Classify a single feature vector.

        Args:
            features: 1-D array-like or tensor of length ``feature_dim``.
        Returns:
            List of (label, probability) pairs, sorted by descending
            probability.
        """
        # as_tensor avoids the copy (and UserWarning) that torch.tensor emits
        # when the input is already a tensor, e.g. straight from pyannote.
        features = (
            torch.as_tensor(features, dtype=torch.float32)
            .unsqueeze(0)
            .to(self.device)
        )
        self.eval()  # disable dropout, use BatchNorm running statistics
        with torch.no_grad():
            output = self.forward(features)
        # Softmax over the class dimension, then sort classes by probability.
        probs = torch.softmax(output, dim=1)
        probs, indices = torch.sort(probs, descending=True)
        # squeeze(0), not squeeze(): a bare squeeze() would also collapse the
        # class dimension when num_classes == 1, yielding 0-d arrays that
        # break the zip below.
        probs = probs.cpu().numpy().squeeze(0)
        indices = indices.cpu().numpy().squeeze(0)
        return [(self.id2label[i], p) for i, p in zip(indices, probs)]

    def infer_from_file(self, file_path):
        """Extract an embedding from an audio file and classify it."""
        feature = extract_features(file_path, device=self.device)
        return self.infer_from_features(feature)
# Pretrained speaker-embedding backbone from pyannote.audio.
from pyannote.audio import Inference, Model
# NOTE(review): runs at import time — first use downloads the checkpoint from
# the Hugging Face Hub; confirm that is acceptable for this deployment.
emb_model = Model.from_pretrained("pyannote/wespeaker-voxceleb-resnet34-LM")
# window="whole" makes Inference return a single embedding for the entire file.
inference = Inference(emb_model, window="whole")
def extract_features(file_path, device="cpu"):
    """Return the speaker embedding for an audio file.

    Moves the shared module-level ``inference`` pipeline to *device* before
    running it on ``file_path``.
    """
    target_device = torch.device(device)
    inference.to(target_device)
    return inference(file_path)