import io
import tempfile
from typing import Sequence

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pydub
import soundfile as sf
import torch
import torchaudio
from PIL import Image
from scipy.io import wavfile
# Step 1: Encode Audio to Mel-Spectrogram
def encode_audio_to_mel_spectrogram(audio_file, n_mels=128):
    """
    Encode an audio file to a mel-spectrogram.

    Args:
    - audio_file: Path to the audio file.
    - n_mels: Number of mel bands (default: 128).

    Returns:
    - mel_spectrogram_db: Mel-spectrogram in dB scale.
    - sample_rate: Sample rate of the audio file.
    """
    # sr=None asks librosa to keep the file's own sample rate.
    waveform, native_rate = librosa.load(audio_file, sr=None)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=native_rate, n_mels=n_mels)
    # Referencing the maximum puts the loudest bin at 0 dB.
    return librosa.power_to_db(mel_power, ref=np.max), native_rate
# Improved Step 2: Save Mel-Spectrogram as Image
def save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, output_image='mel_spectrogram.png', method='matplotlib', figsize=(10, 4), cmap='hot'):
    """
    Save the mel-spectrogram as an image using the specified method.

    Args:
    - mel_spectrogram_db: Mel-spectrogram in dB scale.
    - sample_rate: Sample rate of the audio file.
    - output_image: Path to save the image.
    - method: Method for saving ('matplotlib' or 'custom').
    - figsize: Size of the figure for matplotlib (default: (10, 4)).
    - cmap: Colormap for the spectrogram (default: 'hot').

    Raises:
    - ValueError: If `method` is neither 'matplotlib' nor 'custom'.

    Note: the 'matplotlib' rendering is a plot (axes and a colorbar are drawn
    around the data), so its pixels are not a one-to-one copy of the
    spectrogram bins. The 'custom' method writes one pixel per bin.
    """
    if method == 'matplotlib':
        # Own the figure explicitly so it can be sized, saved and released.
        fig = plt.figure(figsize=figsize)
        try:
            librosa.display.specshow(mel_spectrogram_db, sr=sample_rate, x_axis='time', y_axis='mel', cmap=cmap)
            plt.colorbar(format='%+2.0f dB')
            fig.savefig(output_image)
        finally:
            # Always release the figure, otherwise repeated calls accumulate
            # open figures in pyplot's global state.
            plt.close(fig)
        print(f"Mel-spectrogram image saved using matplotlib as '{output_image}'")
    elif method == 'custom':
        # Convert dB scale to linear scale for image generation
        mel_spectrogram_linear = librosa.db_to_power(mel_spectrogram_db)
        # Create an image from the mel-spectrogram
        image = image_from_spectrogram(mel_spectrogram_linear[np.newaxis, ...])  # Add channel dimension
        # Save the image
        image.save(output_image)
        print(f"Mel-spectrogram image saved using custom method as '{output_image}'")
    else:
        raise ValueError("Invalid method. Choose 'matplotlib' or 'custom'.")
# Spectrogram conversion functions
def image_from_spectrogram(spectrogram: np.ndarray, power: float = 0.25) -> Image.Image:
    """
    Compute a spectrogram image from a spectrogram magnitude array.

    Args:
        spectrogram: (channels, frequency, time)
        power: A power curve to apply to the spectrogram to preserve contrast

    Returns:
        image: (frequency, time, channels)

    Raises:
        NotImplementedError: If the channel count is not 1 or 2.
    """
    num_channels = spectrogram.shape[0]
    if num_channels not in (1, 2):
        raise NotImplementedError(f"Unsupported number of channels: {num_channels}")

    # Rescale to 0-1. An all-zero input has no peak to divide by; treat it as
    # uniformly zero instead of producing NaNs from 0/0.
    max_value = np.max(spectrogram)
    if max_value > 0:
        data = spectrogram / max_value
    else:
        data = np.zeros_like(spectrogram, dtype=float)

    # Apply the power curve
    data = np.power(data, power)

    # Rescale to 0-255 and invert (loud bins become dark pixels)
    data = 255 - (data * 255).astype(np.uint8)

    # Convert to a PIL image
    if num_channels == 1:
        image = Image.fromarray(data[0], mode="L").convert("RGB")
    else:
        # Two channels go to green and blue; red is left empty.
        data = np.array([np.zeros_like(data[0]), data[0], data[1]]).transpose(1, 2, 0)
        image = Image.fromarray(data, mode="RGB")

    # Flip Y so that row 0 of the array ends up at the bottom of the image
    image = image.transpose(Image.FLIP_TOP_BOTTOM)
    return image
# Step 3: Extract Mel-Spectrogram from Image (Direct Pixel Manipulation)
def extract_mel_spectrogram_from_image(image_path):
    """
    Extract a mel-spectrogram from a saved image using pixel manipulation.

    Args:
    - image_path: Path to the spectrogram image file.

    Returns:
    - mel_spectrogram_db: The extracted mel-spectrogram in dB scale.

    NOTE(review): this is a plain linear map of grayscale intensity onto
    [0, -80] dB. It does not undo the vertical flip or the power curve applied
    by image_from_spectrogram, and it does not crop axes or colorbars from a
    matplotlib rendering -- confirm whether an exact round trip is expected.
    """
    grayscale = Image.open(image_path).convert('L')  # single 8-bit channel
    pixels = np.array(grayscale)
    # Pixel value 0 maps to 0 dB and 255 maps to -80 dB.
    return pixels / 255.0 * -80
# Alternative Spectrogram Extraction (IFFT Method)
def extract_spectrogram_with_ifft(mel_spectrogram_db, sample_rate=22050):
    """
    Extracts the audio signal from a mel-spectrogram using the inverse FFT method.

    Args:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: Sample rate the mel-spectrogram was computed at. Defaults
      to 22050, which is the value librosa assumes when none is given, so
      existing callers see no change.

    Returns:
    - audio: The reconstructed audio signal.

    NOTE(review): despite the name, the return value is a waveform, not a
    spectrogram -- callers must not pass it on to a function expecting dB
    mel data.
    """
    # Convert dB mel-spectrogram back to linear scale
    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)
    # Inverse mel transformation to get the audio signal. The mel filter bank
    # depends on the sample rate, so the inversion must use the same rate the
    # spectrogram was built with. Phase is not stored in a mel-spectrogram and
    # is estimated by librosa.
    audio = librosa.feature.inverse.mel_to_audio(mel_spectrogram, sr=sample_rate)
    return audio
# Step 4: Decode Mel-Spectrogram with Griffin-Lim
def decode_mel_spectrogram_to_audio(mel_spectrogram_db, sample_rate, output_audio='griffin_reconstructed_audio.wav'):
    """
    Decode a mel-spectrogram into audio using Griffin-Lim algorithm.

    Args:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: The sample rate for the audio file.
    - output_audio: Path to save the reconstructed audio file.

    Returns:
    - audio: The reconstructed audio signal.
    """
    # Convert dB mel-spectrogram back to linear (power) scale
    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)
    # librosa.griffinlim expects linear-frequency STFT magnitudes, not mel
    # bands. mel_to_audio first maps the mel bands back to an approximate STFT
    # and then runs Griffin-Lim on that, using the given sample rate to
    # rebuild the mel filter bank.
    audio = librosa.feature.inverse.mel_to_audio(mel_spectrogram, sr=sample_rate)
    # Save the generated audio
    sf.write(output_audio, audio, sample_rate)
    print(f"Griffin-Lim reconstructed audio saved as '{output_audio}'")
    return audio
# Step 5: Load MelGAN Vocoder
def load_melgan_vocoder():
    """
    Load a lightweight MelGAN vocoder for decoding mel-spectrograms.

    Returns:
    - A torch MelGAN vocoder model in evaluation mode.

    Raises:
    - AttributeError: If the installed torchaudio does not expose
      `torchaudio.models.MelGAN`.

    NOTE(review): `MelGAN` does not appear among the documented
    `torchaudio.models` classes, and no pre-trained weights are loaded here.
    Confirm which vocoder implementation and checkpoint this is meant to use.
    """
    melgan_cls = getattr(torchaudio.models, "MelGAN", None)
    if melgan_cls is None:
        # Same exception type as the bare attribute access would raise, but
        # with a message that tells the user what is actually missing.
        raise AttributeError(
            "torchaudio.models has no 'MelGAN' in the installed torchaudio; "
            "a MelGAN vocoder implementation must be provided separately."
        )
    model = melgan_cls()  # Build the MelGAN model
    model.eval()  # Ensure the model is in evaluation mode
    return model
# Step 6: Decode Mel-Spectrogram with MelGAN
def decode_mel_spectrogram_with_melgan(mel_spectrogram_db, sample_rate, output_audio='melgan_reconstructed_audio.wav'):
    """
    Decode a mel-spectrogram into audio using MelGAN vocoder.

    Args:
    - mel_spectrogram_db: The mel-spectrogram in dB scale.
    - sample_rate: The sample rate for the audio file.
    - output_audio: Path to save the reconstructed audio file.

    Returns:
    - audio: The reconstructed audio signal.

    NOTE(review): the input scaling the vocoder expects (power vs. log-mel,
    number of mel bands) is not visible here -- confirm against the model.
    """
    # Convert dB mel-spectrogram back to linear scale
    mel_spectrogram = librosa.db_to_power(mel_spectrogram_db)
    # Convert to a float32 tensor: NumPy arithmetic commonly yields float64,
    # while torch modules hold float32 weights by default, and mixing the two
    # raises a dtype error inside the model.
    mel_spectrogram_tensor = torch.as_tensor(mel_spectrogram, dtype=torch.float32).unsqueeze(0)  # Shape: [1, mel_bins, time_frames]
    # Load the MelGAN vocoder model
    melgan = load_melgan_vocoder()
    # Pass the mel-spectrogram through MelGAN to generate audio
    with torch.no_grad():
        audio = melgan(mel_spectrogram_tensor).squeeze().cpu().numpy()  # Squeeze to remove batch dimension
    # Save the generated audio
    sf.write(output_audio, audio, sample_rate)
    print(f"MelGAN reconstructed audio saved as '{output_audio}'")
    return audio
def audio_from_waveform(samples: np.ndarray, sample_rate: int, normalize: bool = False) -> pydub.AudioSegment:
    """
    Convert a numpy array of samples of a waveform to an audio segment.

    Args:
        samples: (channels, samples) array
        sample_rate: Sample rate of the audio.
        normalize: Flag to normalize volume.

    Returns:
        A pydub audio segment holding the samples as 16-bit PCM.

    Note: samples are cast to int16 as-is, so without `normalize` they are
    expected to already be in the int16 value range.
    """
    # Normalize volume to fit in int16
    if normalize:
        peak = np.max(np.abs(samples))
        # An all-zero signal has nothing to scale (and would divide by zero).
        if peak > 0:
            # Build a new array rather than `*=`: the in-place form would
            # modify the caller's buffer and fails on integer dtypes.
            samples = samples * (np.iinfo(np.int16).max / peak)

    # Transpose to (samples, channels) and convert to int16
    samples = samples.transpose(1, 0).astype(np.int16)

    # Write to the bytes of a WAV file
    wav_bytes = io.BytesIO()
    wavfile.write(wav_bytes, sample_rate, samples)
    # Rewind so the reader starts at the WAV header
    wav_bytes.seek(0)

    # Read into pydub
    return pydub.AudioSegment.from_wav(wav_bytes)
def apply_filters(segment: pydub.AudioSegment, compression: bool = False) -> pydub.AudioSegment:
    """
    Apply post-processing filters to the audio segment to compress it and keep at a -10 dBFS level.

    Args:
        segment: The audio segment to filter.
        compression: Flag to apply dynamic range compression.

    Returns:
        The filtered segment. With compression the segment is normalized,
        brought to -10 dBFS and compressed; in every case it is then brought
        to -12 dBFS and normalized with 0.1 dB of headroom.
    """
    # A silent segment reports a dBFS of -inf, so no finite gain can bring it
    # to a target level; return it untouched.
    if segment.dBFS == float("-inf"):
        return segment

    if compression:
        segment = pydub.effects.normalize(segment, headroom=0.1)
        segment = segment.apply_gain(-10 - segment.dBFS)
        # Explicit values equal to pydub's documented defaults.
        segment = pydub.effects.compress_dynamic_range(
            segment,
            threshold=-20.0,
            ratio=4.0,
            attack=5.0,
            release=50.0,
        )

    # Apply gain to desired dB level and normalize again
    desired_db = -12
    segment = segment.apply_gain(desired_db - segment.dBFS)
    return pydub.effects.normalize(segment, headroom=0.1)
def stitch_segments(segments: Sequence[pydub.AudioSegment], crossfade_s: float) -> pydub.AudioSegment:
    """
    Stitch together a sequence of audio segments with a crossfade between each segment.

    Args:
        segments: Sequence of audio segments to stitch.
        crossfade_s: Duration of crossfade in seconds.

    Returns:
        One segment made by appending each input, in order, to the first.
    """
    # The crossfade is handed to append() in whole milliseconds.
    fade_ms = int(crossfade_s * 1000)
    stitched, remainder = segments[0], segments[1:]
    for piece in remainder:
        stitched = stitched.append(piece, crossfade=fade_ms)
    return stitched
def overlay_segments(segments: Sequence[pydub.AudioSegment]) -> pydub.AudioSegment:
    """
    Overlay a sequence of audio segments on top of each other.

    Args:
        segments: Sequence of audio segments to overlay; must not be empty.

    Returns:
        The first segment with every later segment overlaid onto it in order.
    """
    assert len(segments) > 0
    mixed: pydub.AudioSegment = segments[0]
    layers = segments[1:]
    for layer in layers:
        mixed = mixed.overlay(layer)
    return mixed
# Step 7: Full Pipeline for Audio Processing with Customization
def mel_spectrogram_pipeline(audio_file, output_image='mel_spectrogram.png',
                             extraction_method='pixel',  # 'pixel' or 'ifft'
                             decoding_method='griffin',  # 'griffin' or 'melgan'
                             output_audio_griffin='griffin_reconstructed_audio.wav',
                             output_audio_melgan='melgan_reconstructed_audio.wav'):
    """
    Full pipeline to encode audio to mel-spectrogram, save it as an image, extract the spectrogram from the image,
    and decode it back to audio using the selected methods.

    Args:
    - audio_file: Path to the audio file to be processed.
    - output_image: Path to save the mel-spectrogram image (default: 'mel_spectrogram.png').
    - extraction_method: Method for extraction ('pixel' or 'ifft').
    - decoding_method: Method for decoding ('griffin' or 'melgan').
    - output_audio_griffin: Path to save the Griffin-Lim reconstructed audio.
    - output_audio_melgan: Path to save the MelGAN reconstructed audio.

    Raises:
    - ValueError: If `extraction_method` or `decoding_method` is not one of
      the supported choices. Both are checked before any work is done.

    NOTE(review): extract_spectrogram_with_ifft returns a waveform, yet its
    result is passed to the decoders as if it were a dB mel-spectrogram.
    The intended behaviour of the 'ifft' path needs to be confirmed.
    """
    # Fail fast on bad options instead of after the expensive encode step.
    if extraction_method not in ('pixel', 'ifft'):
        raise ValueError("Invalid extraction method. Choose 'pixel' or 'ifft'.")
    if decoding_method not in ('griffin', 'melgan'):
        raise ValueError("Invalid decoding method. Choose 'griffin' or 'melgan'.")

    # Step 1: Encode (Audio -> Mel-Spectrogram)
    mel_spectrogram_db, sample_rate = encode_audio_to_mel_spectrogram(audio_file)

    # Step 2: Convert Mel-Spectrogram to Image and save it
    save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, output_image)

    # Step 3: Extract Mel-Spectrogram from the image based on chosen method
    if extraction_method == 'pixel':
        extracted_mel_spectrogram_db = extract_mel_spectrogram_from_image(output_image)
    else:
        extracted_mel_spectrogram_db = extract_spectrogram_with_ifft(mel_spectrogram_db)

    # Step 4: Decode based on the chosen decoding method
    if decoding_method == 'griffin':
        decode_mel_spectrogram_to_audio(extracted_mel_spectrogram_db, sample_rate, output_audio_griffin)
    else:
        decode_mel_spectrogram_with_melgan(extracted_mel_spectrogram_db, sample_rate, output_audio_melgan)
def process_audio(audio_file, extraction_method, decoding_method):
    """
    Run the encode -> image -> extract -> decode round trip for the web UI.

    Args:
    - audio_file: Path to the audio file to be processed.
    - extraction_method: Method for extraction ('pixel' or 'ifft').
    - decoding_method: Method for decoding ('griffin' or 'melgan').

    Returns:
    - A tuple (image_path, audio_path) of the spectrogram image and the
      reconstructed audio. Both are temporary files that are NOT deleted
      automatically; the caller owns their cleanup.

    Raises:
    - ValueError: If either method name is not a supported choice.

    NOTE(review): extract_spectrogram_with_ifft returns a waveform, yet its
    result is passed to the decoder as if it were a dB mel-spectrogram.
    The intended behaviour of the 'ifft' path needs to be confirmed.
    """
    if extraction_method not in ('pixel', 'ifft'):
        raise ValueError("Invalid extraction method. Choose 'pixel' or 'ifft'.")
    if decoding_method not in ('griffin', 'melgan'):
        raise ValueError("Invalid decoding method. Choose 'griffin' or 'melgan'.")

    # Reserve output paths. delete=False keeps the files after the handles
    # close, so the returned paths are still valid for the caller; closing
    # the handles first also lets other libraries reopen them by name.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_image, \
            tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        image_path = temp_image.name
        audio_path = temp_audio.name

    # Step 1: Encode (Audio -> Mel-Spectrogram)
    mel_spectrogram_db, sample_rate = encode_audio_to_mel_spectrogram(audio_file)

    # Step 2: Convert Mel-Spectrogram to Image and save it
    save_mel_spectrogram_image(mel_spectrogram_db, sample_rate, image_path)

    # Step 3: Extract Mel-Spectrogram from the image based on chosen method
    if extraction_method == 'pixel':
        extracted_mel_spectrogram_db = extract_mel_spectrogram_from_image(image_path)
    else:
        extracted_mel_spectrogram_db = extract_spectrogram_with_ifft(mel_spectrogram_db)

    # Step 4: Decode with the requested method only; running the other
    # decoder would be wasted work and could fail for unrelated reasons.
    if decoding_method == 'griffin':
        decode_mel_spectrogram_to_audio(extracted_mel_spectrogram_db, sample_rate, audio_path)
    else:
        decode_mel_spectrogram_with_melgan(extracted_mel_spectrogram_db, sample_rate, audio_path)

    # Return results
    return image_path, audio_path
# Create Gradio interface
# Inputs map positionally onto process_audio(audio_file, extraction_method,
# decoding_method); outputs map onto its (image_path, audio_path) return.
iface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Radio(["pixel", "ifft"], label="Extraction Method", value="pixel"),
        gr.Radio(["griffin", "melgan"], label="Decoding Method", value="griffin"),
    ],
    outputs=[
        gr.Image(type="filepath", label="Mel-Spectrogram"),
        gr.Audio(type="filepath", label="Reconstructed Audio"),
    ],
    title="Audio Encoder-Decoder",
    description="Upload an audio file to encode it to a mel-spectrogram and then decode it back to audio.",
)

# Example usage(TEST)
if __name__ == "__main__":
    audio_file_path = 'your_audio_file.wav'  # Specify the path to your audio file here
    extraction_method = 'pixel'  # Choose 'pixel' or 'ifft'
    decoding_method = 'griffin'  # Choose 'griffin' or 'melgan'
    # NOTE(review): the example settings above are not consumed by any call
    # visible in this file; wire them into mel_spectrogram_pipeline(...) if a
    # command-line run is wanted.

    # Launch the app (kept under the main guard so importing this module does
    # not start a web server).
    iface.launch()