Spaces:

ameythakur
/

Deepfake-Audio

Running

App Files Files Community

Deepfake-Audio / Source Code /encoder /audio.py

ameythakur

Deepfake-Audio

1d8403e verified about 2 months ago

raw

history blame contribute delete

5.62 kB

	# ==================================================================================================
	# DEEPFAKE AUDIO - encoder/audio.py (Acoustic Signal Processing)
	# ==================================================================================================
	#
	# 📝 DESCRIPTION
	# This module implements the acoustic primitives required for the Speaker Encoder.
	# It handles waveform normalization, resampling, and most importantly, the
	# transformation of raw time-domain signals into frequency-domain Mel-Spectrograms.
	# It also integrates Voice Activity Detection (VAD) via 'webrtcvad' to ensure that
	# only active speech segments are passed to the neural distillation layers.
	#
	# 👤 AUTHORS
	# - Amey Thakur (https://github.com/Amey-Thakur)
	# - Mega Satish (https://github.com/msatmod)
	#
	# 🤝🏻 CREDITS
	# Original Real-Time Voice Cloning methodology by CorentinJ
	# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
	#
	# 🔗 PROJECT LINKS
	# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
	# Video Demo: https://youtu.be/i3wnBcbHDbs
	# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
	#
	# 📜 LICENSE
	# Released under the MIT License
	# Release Date: 2021-02-06
	# ==================================================================================================

	from scipy.ndimage import binary_dilation
	from encoder.params_data import *
	from pathlib import Path
	from typing import Optional, Union
	from warnings import warn
	import numpy as np
	import librosa
	import struct

	# --- VAD INITIALIZATION ---
	try:
	import webrtcvad
	except:
	warn("⚠️ Scholarly Warning: 'webrtcvad' not detected. Noise removal and silence trimming will be bypassed.")
	webrtcvad = None

	int16_max = (2 ** 15) - 1

	def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
	source_sr: Optional[int] = None,
	normalize: Optional[bool] = True,
	trim_silence: Optional[bool] = True):
	"""
	Orchestrates the acoustic normalization pipeline.
	1. Loads signal from disk or buffer.
	2. Resamples to training-specific frequencies.
	3. Normalizes volume (dBFS).
	4. Trims non-speech intervals (if VAD is active).
	"""
	# Defensive Input Handling
	if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
	wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
	else:
	wav = fpath_or_wav

	# Frequency Alignment
	if source_sr is not None and source_sr != sampling_rate:
	wav = librosa.resample(y=wav, orig_sr=source_sr, target_sr=sampling_rate)

	# Amplitude Normalization
	if normalize:
	wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)

	# Temporal Compression (Silence Removal)
	if webrtcvad and trim_silence:
	wav = trim_long_silences(wav)

	return wav

	def wav_to_mel_spectrogram(wav):
	"""
	Distills a time-domain waveform into a frequency-domain Mel-Spectrogram matrix.
	This serves as the primary input for the Speaker Encoder neural network.
	"""
	frames = librosa.feature.melspectrogram(
	y=wav,
	sr=sampling_rate,
	n_fft=int(sampling_rate * mel_window_length / 1000),
	hop_length=int(sampling_rate * mel_window_step / 1000),
	n_mels=mel_n_channels
	)
	return frames.astype(np.float32).T

	def trim_long_silences(wav):
	"""
	Utilizes WebRTC Voice Activity Detection (VAD) to excise non-semantic silences.
	Ensures the speaker identity is extracted from high-entropy speech segments only.
	"""
	# Spatial Decomposition into temporal windows
	samples_per_window = (vad_window_length * sampling_rate) // 1000
	wav = wav[:len(wav) - (len(wav) % samples_per_window)]

	# Binary Serialization for VAD compatibility (16-bit PCM)
	pcm_wave = struct.pack("%dh" % len(wav), (np.round(wav int16_max)).astype(np.int16))

	# Statistical Speech Filtering
	voice_flags = []
	vad = webrtcvad.Vad(mode=3) # Aggressive Filtering
	for window_start in range(0, len(wav), samples_per_window):
	window_end = window_start + samples_per_window
	voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
	sample_rate=sampling_rate))
	voice_flags = np.array(voice_flags)

	# Temporal Smoothing (Moving Average)
	def moving_average(array, width):
	array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
	ret = np.cumsum(array_padded, dtype=float)
	ret[width:] = ret[width:] - ret[:-width]
	return ret[width - 1:] / width

	audio_mask = moving_average(voice_flags, vad_moving_average_width)
	audio_mask = np.round(audio_mask).astype(bool)

	# Morphological Dilation to preserve speech boundaries
	audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
	audio_mask = np.repeat(audio_mask, samples_per_window)

	return wav[audio_mask == True]

	def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
	"""Calibrates the signal's energy level to a target Decibel Full Scale (dBFS)."""
	if increase_only and decrease_only:
	raise ValueError("Conflict: Both increase and decrease flags are active.")

	dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))

	if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
	return wav

	return wav * (10 ** (dBFS_change / 20))