Spaces:

amphion
/

singing_voice_conversion

Running on A10G

App Files Files Community

singing_voice_conversion / utils /audio.py

RMSnow

init and interface

df2accb 10 months ago

raw

history blame contribute delete

No virus

2.08 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import torch
	import numpy as np
	from numpy import linalg as LA
	import librosa
	import soundfile as sf
	import librosa.filters


	def load_audio_torch(wave_file, fs):
	    """Load audio data into torch tensor

	    The file is read with ``librosa.load`` (mono) and divided by a scale
	    factor chosen from its peak magnitude, so that data which looks like
	    raw 16-bit or 32-bit integer samples is brought into roughly [-1, 1].

	    Args:
	        wave_file (str): path to wave file
	        fs (int or None): sample rate, forwarded to ``librosa.load`` as
	            ``sr``

	    Returns:
	        audio (tensor): audio data in tensor, or an empty list ``[]`` when
	            the scaled data contains NaN or Inf
	        sample_rate (int): sample rate of the returned audio (equal to
	            ``fs`` whenever ``fs`` is not None)

	    Raises:
	        AssertionError: if the loaded signal has 2 samples or fewer
	    """
	    audio, sample_rate = librosa.load(wave_file, sr=fs, mono=True)
	    # audio: (T,)
	    assert len(audio) > 2, "audio must contain more than 2 samples"

	    # Peak magnitude: the full range of the dtype for integer data,
	    # otherwise the largest absolute sample value actually observed.
	    if np.issubdtype(audio.dtype, np.integer):
	        peak = -np.iinfo(audio.dtype).min
	    else:
	        peak = max(np.amax(audio), -np.amin(audio))

	    # Choose the divisor from the peak: values beyond the 16-bit range are
	    # scaled as 32-bit data, values clearly above 1 as 16-bit data, and
	    # anything else is left unscaled.
	    if peak > (2**15):
	        max_mag = (2**31) + 1
	    elif peak > 1.01:
	        max_mag = (2**15) + 1
	    else:
	        max_mag = 1.0

	    # Normalize the audio
	    audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag

	    # Unusable data: signal it with an empty list and a best-effort rate.
	    if (torch.isnan(audio) | torch.isinf(audio)).any():
	        return [], sample_rate or fs or 48000

	    # Resample the audio to our target samplerate
	    if fs is not None and fs != sample_rate:
	        audio = torch.from_numpy(
	            librosa.core.resample(audio.numpy(), orig_sr=sample_rate, target_sr=fs)
	        )
	        sample_rate = fs

	    # Return the effective rate rather than the raw ``fs`` argument, so a
	    # caller passing ``fs=None`` still learns the rate of the audio.
	    return audio, sample_rate


	def _stft(y, cfg):
	    """Run ``librosa.stft`` on ``y`` using the framing settings in ``cfg``.

	    Args:
	        y: audio signal handed to ``librosa.stft``
	        cfg: config object providing ``n_fft``, ``hop_size`` and ``win_size``

	    Returns:
	        Whatever ``librosa.stft`` returns for these settings.
	    """
	    # Translate the project's config names into librosa's keyword names.
	    stft_kwargs = dict(
	        n_fft=cfg.n_fft,
	        hop_length=cfg.hop_size,
	        win_length=cfg.win_size,
	    )
	    return librosa.stft(y=y, **stft_kwargs)


	def energy(wav, cfg):
	    """Compute an energy contour as the L2 norm of STFT magnitudes.

	    Args:
	        wav: audio signal passed on to ``_stft``
	        cfg: config object passed on to ``_stft``

	    Returns:
	        Array of L2 norms, one per row of the transposed magnitude
	        spectrogram (one per frame, assuming ``_stft`` returns an array
	        laid out as [frequency bins, frames]).
	    """
	    spectrogram = _stft(wav, cfg)
	    # Transposed magnitudes: [T, F] if the STFT is [F, T] as in librosa.
	    frame_major = np.abs(spectrogram).T
	    # Reduce over axis 1 so each row collapses to a single value.
	    return LA.norm(frame_major, axis=1)


	def get_energy_from_tacotron(audio, _stft):
	    """Compute a mel spectrogram and energy via an STFT object.

	    Args:
	        audio: audio samples accepted by ``torch.FloatTensor``; values are
	            clipped to [-1, 1] and a leading batch dimension is added
	        _stft: object exposing ``mel_spectrogram(tensor)`` which returns a
	            ``(mel, energy)`` pair of tensors (presumably a Tacotron-style
	            STFT module - confirm against the caller)

	    Returns:
	        mel: first value returned by ``_stft.mel_spectrogram``, unchanged
	        energy (np.ndarray): second value with dimension 0 squeezed,
	            converted to a float32 numpy array
	    """
	    # Add the batch dimension, then clamp samples into [-1, 1].
	    batch = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
	    # No torch.autograd.Variable wrapper: it is deprecated, and a freshly
	    # constructed tensor already has requires_grad=False.
	    mel, frame_energy = _stft.mel_spectrogram(batch)
	    frame_energy = torch.squeeze(frame_energy, 0).numpy().astype(np.float32)
	    return mel, frame_energy