Spaces:

tobiasc
/

conex

Build error

App Files Files Community

conex / espnet2 /layers /stft.py

tobiasc

Initial commit

ad16788 about 2 years ago

raw

history blame contribute delete

No virus

5.62 kB

	from distutils.version import LooseVersion
	from typing import Optional
	from typing import Tuple
	from typing import Union

	import torch
	from torch_complex.tensor import ComplexTensor
	from typeguard import check_argument_types

	from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
	from espnet2.layers.inversible_interface import InversibleInterface


	class Stft(torch.nn.Module, InversibleInterface):
	    """Short-time Fourier transform layer with a matching inverse.

	    ``forward`` wraps ``torch.stft`` and returns the spectrum with the real
	    and imaginary parts stacked in a trailing dimension of size 2;
	    ``inverse`` wraps ``istft`` and reconstructs waveforms from that layout
	    (or from a ``ComplexTensor``).

	    Args:
	        n_fft: FFT size passed to ``torch.stft`` / ``istft``.
	        win_length: Window length; falls back to ``n_fft`` when ``None``.
	        hop_length: Hop between successive frames, in samples.
	        window: Prefix of a ``torch.<window>_window`` factory (e.g.
	            ``"hann"``), or ``None`` to pass no window to torch.
	        center: Forwarded to ``torch.stft`` / ``istft``.
	        normalized: Forwarded to ``torch.stft`` / ``istft``.
	        onesided: Forwarded to ``torch.stft`` / ``istft``.

	    Raises:
	        ValueError: If ``window`` is given but ``torch`` has no matching
	            ``<window>_window`` function.
	    """

	    def __init__(
	        self,
	        n_fft: int = 512,
	        win_length: Optional[int] = None,
	        hop_length: int = 128,
	        window: Optional[str] = "hann",
	        center: bool = True,
	        normalized: bool = False,
	        onesided: bool = True,
	    ):
	        assert check_argument_types()
	        super().__init__()
	        self.n_fft = n_fft
	        self.win_length = n_fft if win_length is None else win_length
	        self.hop_length = hop_length
	        self.center = center
	        self.normalized = normalized
	        self.onesided = onesided
	        # Only the window *name* is stored; the tensor is built on demand so
	        # that it always matches the dtype/device of the data being processed.
	        if window is not None and not hasattr(torch, f"{window}_window"):
	            raise ValueError(f"{window} window is not implemented")
	        self.window = window

	    def extra_repr(self):
	        """Return the configuration string shown in the module's repr."""
	        return (
	            f"n_fft={self.n_fft}, "
	            f"win_length={self.win_length}, "
	            f"hop_length={self.hop_length}, "
	            f"center={self.center}, "
	            f"normalized={self.normalized}, "
	            f"onesided={self.onesided}"
	        )

	    @staticmethod
	    def _torch_at_least(version: str) -> bool:
	        """Return True when the installed torch is at least ``version``."""
	        return LooseVersion(torch.__version__) >= LooseVersion(version)

	    def _make_window(self, dtype, device) -> Optional[torch.Tensor]:
	        """Build the configured window tensor, or None if no window is set."""
	        if self.window is None:
	            return None
	        window_func = getattr(torch, f"{self.window}_window")
	        return window_func(self.win_length, dtype=dtype, device=device)

	    def forward(
	        self, input: torch.Tensor, ilens: Optional[torch.Tensor] = None
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """STFT forward function.

	        Args:
	            input: (Batch, Nsamples) or (Batch, Nsample, Channels)
	            ilens: (Batch)
	        Returns:
	            output: (Batch, Frames, Freq, 2) or (Batch, Frames, Channels, Freq, 2)
	            olens: (Batch) number of frames per item, or None when ``ilens``
	                is None

	        """
	        bs = input.size(0)
	        multi_channel = input.dim() == 3
	        if multi_channel:
	            # input: (Batch, Nsample, Channels) -> (Batch * Channels, Nsample)
	            input = input.transpose(1, 2).reshape(-1, input.size(1))

	        # NOTE(kamo):
	        #   The default behaviour of torch.stft is compatible with librosa.stft
	        #   about padding and scaling.
	        #   Note that it's different from scipy.signal.stft
	        stft_kwargs = dict(
	            n_fft=self.n_fft,
	            win_length=self.win_length,
	            hop_length=self.hop_length,
	            center=self.center,
	            window=self._make_window(input.dtype, input.device),
	            normalized=self.normalized,
	            onesided=self.onesided,
	        )
	        if self._torch_at_least("1.7.0"):
	            # Recent torch refuses real inputs unless return_complex is given,
	            # so request a complex result and expose it in the stacked
	            # (..., 2=real_imag) layout this layer has always returned.
	            output = torch.view_as_real(
	                torch.stft(input, return_complex=True, **stft_kwargs)
	            )
	        else:
	            # Older torch has no return_complex and already returns (..., 2).
	            output = torch.stft(input, **stft_kwargs)

	        # output: (Batch, Freq, Frames, 2=real_imag)
	        # -> (Batch, Frames, Freq, 2=real_imag)
	        output = output.transpose(1, 2)
	        if multi_channel:
	            # output: (Batch * Channel, Frames, Freq, 2=real_imag)
	            # -> (Batch, Frame, Channel, Freq, 2=real_imag)
	            output = output.view(bs, -1, output.size(1), output.size(2), 2).transpose(
	                1, 2
	            )

	        if ilens is None:
	            return output, None

	        # torch.stft pads by n_fft // 2 on each side when center=True and
	        # slides a frame of n_fft samples (the window is padded up to n_fft),
	        # so the frame count depends on n_fft rather than win_length.
	        if self.center:
	            ilens = ilens + 2 * (self.n_fft // 2)
	        olens = (ilens - self.n_fft) // self.hop_length + 1
	        # Fill the positions selected by the padding mask (built along the
	        # frame axis, dim 1) with zeros, in place.
	        output.masked_fill_(make_pad_mask(olens, output, 1), 0.0)

	        return output, olens

	    def inverse(
	        self,
	        input: Union[torch.Tensor, ComplexTensor],
	        ilens: Optional[torch.Tensor] = None,
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """Inverse STFT.

	        Args:
	            input: Tensor(batch, T, F, 2) or ComplexTensor(batch, T, F)
	            ilens: (batch,)
	        Returns:
	            wavs: (batch, samples)
	            ilens: (batch,)
	        Raises:
	            ImportError: If torch<1.6.0 is installed and no torchaudio
	                providing ``functional.istft`` is available.
	        """
	        if self._torch_at_least("1.6.0"):
	            istft = torch.functional.istft
	        else:
	            try:
	                import torchaudio
	            except ImportError as err:
	                raise ImportError(
	                    "Please install torchaudio>=0.3.0 or use torch>=1.6.0"
	                ) from err

	            if not hasattr(torchaudio.functional, "istft"):
	                raise ImportError(
	                    "Please install torchaudio>=0.3.0 or use torch>=1.6.0"
	                )
	            istft = torchaudio.functional.istft

	        if isinstance(input, ComplexTensor):
	            input = torch.stack([input.real, input.imag], dim=-1)
	        assert input.shape[-1] == 2

	        # Built from the stacked real tensor so the window dtype is real.
	        window = self._make_window(input.dtype, input.device)

	        # (batch, T, F, 2) -> (batch, F, T, 2), the layout istft expects.
	        input = input.transpose(1, 2)
	        if self._torch_at_least("1.8.0"):
	            # Real (..., 2) inputs to istft are deprecated since torch 1.8 and
	            # rejected by later releases; hand over a native complex tensor.
	            input = torch.complex(input[..., 0], input[..., 1])

	        wavs = istft(
	            input,
	            n_fft=self.n_fft,
	            hop_length=self.hop_length,
	            win_length=self.win_length,
	            window=window,
	            center=self.center,
	            normalized=self.normalized,
	            onesided=self.onesided,
	            # istft takes a plain int length; trim/pad to the longest item.
	            length=int(ilens.max()) if ilens is not None else None,
	        )

	        return wavs, ilens