SidonSamples / tools /make_mel_images.py
Wataru's picture
add files
baa9131
#!/usr/bin/env python3
import argparse
from pathlib import Path
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
def save_mel_image(
    audio_path: Path,
    out_path: Path,
    sr: int = 22050,
    n_fft: int = 1024,
    hop_length: int = 256,
    n_mels: int = 80,
    fmin: int = 0,
    fmax: int | None = 8000,
) -> None:
    """Render the mel-spectrogram of an audio file to an axis-free image.

    The audio is loaded through ``librosa.load`` with ``sr=sr`` and
    ``mono=True``, converted to a mel power spectrogram, expressed in dB
    relative to its own maximum, and drawn with the "magma" colormap on an
    8x3 inch, 150 dpi figure with axes and padding removed.

    Args:
        audio_path: Input audio file to analyse.
        out_path: Destination image file; missing parent directories are
            created.
        sr: Sample rate passed to ``librosa.load`` and to the mel analysis.
        n_fft: FFT window size for the mel spectrogram.
        hop_length: Hop between successive analysis frames, in samples.
        n_mels: Number of mel bands.
        fmin: Lower frequency bound forwarded to the mel filter bank.
        fmax: Upper frequency bound forwarded to the mel filter bank
            (``None`` is passed through to librosa unchanged).
    """
    y, _ = librosa.load(str(audio_path), sr=sr, mono=True)
    S = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, fmin=fmin, fmax=fmax
    )
    # Reference the dB scale to the loudest bin so the peak maps to 0 dB.
    S_db = librosa.power_to_db(S, ref=np.max)

    # Keep a handle on the figure so exactly this figure is released below,
    # even when drawing or saving fails (e.g. unsupported output format).
    fig = plt.figure(figsize=(8, 3), dpi=150)
    try:
        librosa.display.specshow(
            S_db, sr=sr, hop_length=hop_length, x_axis=None, y_axis=None, cmap="magma"
        )
        plt.axis("off")
        plt.tight_layout(pad=0)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(out_path, bbox_inches="tight", pad_inches=0)
    finally:
        # pyplot keeps every open figure in a global registry; without this
        # an exception above would leak the figure across repeated calls.
        plt.close(fig)
def main():
    """Parse the command line and write the mel-spectrogram image.

    Positional arguments are the input audio path and the output image
    path; the optional integer flags override the analysis parameters
    forwarded to ``save_mel_image``.
    """
    parser = argparse.ArgumentParser(
        description="Generate mel-spectrogram image from an audio file."
    )
    parser.add_argument("audio", type=Path, help="Path to input audio (wav/flac/mp3)")
    parser.add_argument("output", type=Path, help="Path to output image (png/jpg)")

    # (flag, destination attribute, default) for every integer option.
    int_options = (
        ("--sr", "sr", 22050),
        ("--n_fft", "n_fft", 1024),
        ("--hop", "hop_length", 256),
        ("--mels", "n_mels", 80),
        ("--fmin", "fmin", 0),
        ("--fmax", "fmax", 8000),
    )
    for flag, dest, default in int_options:
        parser.add_argument(flag, dest=dest, type=int, default=default)

    cli = parser.parse_args()
    save_mel_image(
        audio_path=cli.audio,
        out_path=cli.output,
        sr=cli.sr,
        n_fft=cli.n_fft,
        hop_length=cli.hop_length,
        n_mels=cli.n_mels,
        fmin=cli.fmin,
        fmax=cli.fmax,
    )
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()