Audio Pitch-Level Classifier (Single-Label)
This model is a feature-based MLP classifier for single-label classification. It predicts the perceived pitch level of an audio clip using handcrafted F0-based features extracted from the waveform.
🏷️ Pitch Labels
- high-pitched: Audio is classified as having a relatively high perceived pitch.
- medium-pitched: Audio is classified as having a moderate or neutral perceived pitch.
- low-pitched: Audio is classified as having a relatively low perceived pitch.
Usage: Input & Output
1. Input Specifications
- Feature Extraction: Uses `librosa.pyin` to extract pitch-related F0 statistics.
- Sampling Rate: 16,000 Hz.
- Audio Format: Mono raw waveform.
- Metadata Usage: Disabled. This checkpoint uses only acoustic pitch features.
2. Output (Single-Label Logic)
Because this is a Single-Label task, the categories are mutually exclusive.
- Activation: Softmax. The output probabilities sum to 1.0.
- Decision: The model selects the label with the highest probability score.
Feature Set
This checkpoint uses the following 12 normalized features:
[
"f0_mean",
"f0_std",
"f0_median",
"f0_min",
"f0_max",
"f0_p10",
"f0_p90",
"f0_iqr",
"f0_range",
"f0_slope",
"voiced_ratio",
"voiced_count",
]
Label Mapping
{
0: "high-pitched",
1: "medium-pitched",
2: "low-pitched",
}
Inference Code
import json
import librosa
import numpy as np
import torch
import torch.nn as nn
# Directory that holds the checkpoint files read further down
# (training_metadata.json and best_model.pt).
model_dir = "ITTS-Evaluation/models/feature-mlp_pitch_level-no_metadata-100epochs"

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
class MLPClassifier(nn.Module):
    """Feed-forward classifier built from Linear -> ReLU -> Dropout stages.

    Each entry of ``hidden_dims`` adds one Linear/ReLU/Dropout stage; a final
    Linear layer maps to ``num_labels`` raw logits (no softmax is applied
    inside the module).

    Args:
        input_dim: Size of the input feature vector.
        hidden_dims: Iterable of hidden-layer widths, in order. May be empty,
            in which case the network is a single Linear layer.
        num_labels: Number of output logits.
        dropout: Dropout probability used after every hidden activation.
    """

    def __init__(self, input_dim, hidden_dims, num_labels, dropout):
        super().__init__()
        # widths[i] -> widths[i + 1] describes the i-th hidden Linear layer.
        widths = [input_dim, *hidden_dims]
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.extend(
                (nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout))
            )
        # Output head: last hidden width (or input_dim if no hidden layers).
        stages.append(nn.Linear(widths[-1], num_labels))
        # The attribute name and layer order determine the state_dict keys
        # ("network.<index>.weight"), so they must stay as they are.
        self.network = nn.Sequential(*stages)

    def forward(self, features):
        """Return the logits produced by the network for ``features``."""
        return self.network(features)
def compute_features(audio, sr=16000, f0_fmin=50.0, f0_fmax=500.0):
    """Extract the 12 F0-based features from a waveform.

    The waveform is peak-normalised, pitch-tracked with ``librosa.pyin`` and
    summarised over the voiced, non-NaN frames.

    Args:
        audio: Waveform samples; must support ``len()`` and ``.astype``
            (i.e. a NumPy array).
        sr: Sampling rate passed to ``librosa.pyin``.
        f0_fmin: Lower F0 search bound passed to ``librosa.pyin``.
        f0_fmax: Upper F0 search bound passed to ``librosa.pyin``.

    Returns:
        A float32 array of length 12, ordered as
        ``[mean, std, median, min, max, p10, p90, iqr, range, slope,
        voiced_ratio, voiced_count]``. All zeros are returned for empty input
        or when pitch tracking fails; when no voiced frame survives, the first
        ten entries are zero and only the two voicing statistics are filled.
    """
    n_features = 12

    if len(audio) == 0:
        return np.zeros(n_features, dtype=np.float32)

    # Peak-normalise so the loudest sample has magnitude 1 (skipped for
    # all-zero input to avoid dividing by zero).
    audio = audio.astype(np.float32)
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    # Best effort: any failure inside the pitch tracker yields a zero vector
    # instead of propagating.
    try:
        f0, voiced_flag, _ = librosa.pyin(audio, fmin=f0_fmin, fmax=f0_fmax, sr=sr)
    except Exception:
        return np.zeros(n_features, dtype=np.float32)

    if f0 is None or voiced_flag is None:
        return np.zeros(n_features, dtype=np.float32)

    is_voiced = np.asarray(voiced_flag, dtype=bool)
    pitch = np.asarray(f0[is_voiced], dtype=np.float32)
    pitch = pitch[np.logical_not(np.isnan(pitch))]

    # voiced_ratio counts flagged frames; voiced_count counts the frames that
    # are flagged AND have a non-NaN F0 estimate.
    frame_total = max(len(f0), 1)
    voiced_ratio = float(is_voiced.sum() / frame_total)
    voiced_count = float(pitch.size)

    if pitch.size == 0:
        features = np.zeros(n_features, dtype=np.float32)
        features[10] = voiced_ratio
        features[11] = voiced_count
        return features

    p10 = float(np.percentile(pitch, 10))
    p90 = float(np.percentile(pitch, 90))

    # Linear trend of F0 against the voiced-frame index (unvoiced gaps are
    # collapsed); a single frame has no trend.
    if pitch.size > 1:
        slope = float(np.polyfit(np.arange(pitch.size), pitch, 1)[0])
    else:
        slope = 0.0

    lowest = float(np.min(pitch))
    highest = float(np.max(pitch))

    stats = {
        "f0_mean": float(np.mean(pitch)),
        "f0_std": float(np.std(pitch)),
        "f0_median": float(np.median(pitch)),
        "f0_min": lowest,
        "f0_max": highest,
        "f0_p10": p10,
        "f0_p90": p90,
        # NOTE: despite the name, this is the p90 - p10 spread, not the
        # 25th-75th percentile interquartile range.
        "f0_iqr": float(p90 - p10),
        "f0_range": float(highest - lowest),
        "f0_slope": slope,
        "voiced_ratio": voiced_ratio,
        "voiced_count": voiced_count,
    }
    return np.array(list(stats.values()), dtype=np.float32)
# Load the training-time configuration stored next to the checkpoint.
# The encoding is given explicitly so the JSON is decoded the same way on
# every platform instead of using the locale's default encoding.
with open(f"{model_dir}/training_metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Normalisation statistics are stored as nested lists; the first row is used.
# NOTE(review): presumably saved with shape (1, n_features) - confirm.
mean = np.array(metadata["normalization"]["mean"][0], dtype=np.float32)
std = np.array(metadata["normalization"]["std"][0], dtype=np.float32)

# JSON object keys are always strings, so convert the class ids back to int.
id2label = {int(k): v for k, v in metadata["id2label"].items()}

# Rebuild the architecture from the saved hyper-parameters so the layer
# shapes match the checkpoint being loaded.
model = MLPClassifier(
    input_dim=len(metadata["feature_names"]),
    hidden_dims=metadata["hidden_dims"],
    num_labels=len(metadata["label2id"]),
    dropout=metadata["dropout"],
).to(device)

# NOTE(security): torch.load may unpickle arbitrary objects. Only load
# checkpoints from a trusted source; on PyTorch versions that support it,
# consider passing weights_only=True.
model.load_state_dict(torch.load(f"{model_dir}/best_model.pt", map_location=device))

# Inference mode: disables the Dropout layers.
model.eval()
def predict_pitch(audio_path):
    """Classify the pitch level of one audio file.

    The file is loaded as mono at the sample rate recorded in ``metadata``,
    converted to features by ``compute_features``, normalised with the stored
    ``mean``/``std`` and passed through ``model``.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.

    Returns:
        A dict with:
            ``label``: name of the highest-probability class,
            ``confidence``: softmax probability of that class,
            ``all_scores``: mapping of every class name to its probability.
    """
    sample_rate = metadata["sample_rate"]
    audio, _ = librosa.load(audio_path, sr=sample_rate, mono=True)

    features = compute_features(
        audio,
        sr=sample_rate,
        f0_fmin=metadata["f0_fmin"],
        f0_fmax=metadata["f0_fmax"],
    )
    features = (features - mean) / std

    # Add a batch axis of size 1 for the model.
    inputs = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = model(inputs)

    # Drop only the batch axis. A bare squeeze() would also collapse the class
    # axis if the head had a single output, leaving a 0-d array that cannot be
    # indexed or iterated below.
    probs = torch.softmax(logits, dim=-1).squeeze(0).cpu().numpy()
    predicted_id = int(np.argmax(probs))

    return {
        "label": id2label[predicted_id],
        "confidence": float(probs[predicted_id]),
        "all_scores": {id2label[i]: float(p) for i, p in enumerate(probs)},
    }
# Example: classify a single clip and print the winning label together with
# its softmax probability formatted as a percentage.
result = predict_pitch("audio_clip.wav")
print(f"Detected Pitch: {result['label']} ({result['confidence']:.2%})")
Reported Performance
From the saved evaluation results:
- Accuracy: 0.68
- Macro F1: 0.68
Class-wise summary:
- high-pitched: precision 0.74, recall 0.72, f1-score 0.73
- medium-pitched: precision 0.61, recall 0.64, f1-score 0.63
- low-pitched: precision 0.70, recall 0.68, f1-score 0.69
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support