import torch

from Architectures.EmbeddingModel.GST import GSTStyleEncoder
from Architectures.EmbeddingModel.StyleTTSEncoder import StyleEncoder as StyleTTSEncoder

class StyleEmbedding(torch.nn.Module):
    """
    The style embedding should provide information about the speaker and their speaking style.

    The main feedback signal for this module comes from the TTS objective, so it does not have a
    dedicated train loop. The train loop does, however, supply additional supervision in the form
    of a Barlow Twins objective (a minimal illustration of such a loss is sketched below the class).

    See the git history for other approaches to style embedding, such as a Swin Transformer
    and a simple LSTM baseline. GST turned out to work best.
    """
    def __init__(self, embedding_dim=16, style_tts_encoder=False):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.use_gst = not style_tts_encoder
        if style_tts_encoder:
            self.style_encoder = StyleTTSEncoder(style_dim=embedding_dim)
        else:
            self.style_encoder = GSTStyleEncoder(gst_token_dim=embedding_dim)

    def forward(self,
                batch_of_feature_sequences,
                batch_of_feature_sequence_lengths):
        """
        Args:
            batch_of_feature_sequences: b is the batch axis, 128 features per timestep
                                        and l time-steps, which may include padding
                                        for most elements in the batch (b, l, 128)
            batch_of_feature_sequence_lengths: indicates for every element in the batch
                                               what the true length is, since they are
                                               all padded to the length of the longest
                                               element in the batch (b, 1)

        Returns:
            batch of n dimensional embeddings (b, n)
        """
        minimum_sequence_length = 512
        specs = list()
        for index, spec_length in enumerate(batch_of_feature_sequence_lengths):
            spec = batch_of_feature_sequences[index][:spec_length]
            # double the length at least once, then check
            spec = spec.repeat((2, 1))
            current_spec_length = len(spec)
            while current_spec_length < minimum_sequence_length:
                # make it longer
                spec = spec.repeat((2, 1))
                current_spec_length = len(spec)
            specs.append(spec[:minimum_sequence_length])

        spec_batch = torch.stack(specs, dim=0)
        return self.style_encoder(speech=spec_batch)
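

# --- Illustrative sketch, not part of the original module -----------------------------------
# The class docstring above mentions that the train loop supervises this module with a
# Barlow Twins objective. The helper below is a minimal, hypothetical sketch of such a
# redundancy-reduction loss between two embedding views of the same utterances; the actual
# objective used in training lives in the train loop of the repository and may differ.
def _barlow_twins_loss_sketch(view_a, view_b, off_diagonal_weight=0.005):
    # view_a, view_b: (batch, embedding_dim) style embeddings of two views of the same utterances
    a = (view_a - view_a.mean(dim=0)) / (view_a.std(dim=0) + 1e-6)
    b = (view_b - view_b.mean(dim=0)) / (view_b.std(dim=0) + 1e-6)
    # cross-correlation matrix between the two normalized views
    cross_correlation = (a.transpose(0, 1) @ b) / a.size(0)
    # diagonal terms should be 1 (invariance), off-diagonal terms 0 (redundancy reduction)
    on_diagonal = (torch.diagonal(cross_correlation) - 1.0).pow(2).sum()
    off_diagonal = (cross_correlation - torch.diag(torch.diagonal(cross_correlation))).pow(2).sum()
    return on_diagonal + off_diagonal_weight * off_diagonal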


if __name__ == '__main__':
    style_emb = StyleEmbedding(style_tts_encoder=False)
    print(f"GST parameter count: {sum(p.numel() for p in style_emb.style_encoder.parameters() if p.requires_grad)}")
    seq_length = 398
    # 128 features per timestep, matching the input shape documented in forward()
    print(style_emb(torch.randn(5, seq_length, 128),
                    torch.tensor([seq_length, seq_length, seq_length, seq_length, seq_length])).shape)

    style_emb = StyleEmbedding(style_tts_encoder=True)
    print(f"StyleTTS encoder parameter count: {sum(p.numel() for p in style_emb.style_encoder.parameters() if p.requires_grad)}")
    seq_length = 398
    print(style_emb(torch.randn(5, seq_length, 128),
                    torch.tensor([seq_length, seq_length, seq_length, seq_length, seq_length])).shape)
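
    # Illustrative check, not part of the original file: inputs shorter than the minimum
    # sequence length are tiled internally by forward(), so a very short utterance should
    # still yield an embedding batch of the same shape as the longer ones above.
    short_length = 50
    print(style_emb(torch.randn(5, short_length, 128),
                    torch.tensor([short_length, short_length, short_length, short_length, short_length])).shape)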