"""Policy wrapper for making voice models RL-compatible."""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple, Optional
import logging
logger = logging.getLogger(__name__)
class PolicyValueHead(nn.Module):
"""
Policy and value head for RL training on voice models.
Adds a policy head (for action log probabilities) and value head
(for state value estimation) on top of a voice model's hidden states.
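
    Example (shape sketch, assuming 768-dim hidden states):

        >>> head = PolicyValueHead(hidden_size=768, action_dim=256)
        >>> hidden = torch.randn(2, 50, 768)
        >>> logits, values = head(hidden)
        >>> logits.shape, values.shape
        (torch.Size([2, 256]), torch.Size([2, 1]))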
"""
def __init__(
self,
hidden_size: int,
action_dim: int = 256,
value_hidden_size: int = 128
):
"""
Initialize policy and value heads.
Args:
hidden_size: Size of the base model's hidden states
action_dim: Dimensionality of the action space
value_hidden_size: Hidden size for value network
"""
super().__init__()
# Policy head - outputs action logits
self.policy_head = nn.Sequential(
nn.Linear(hidden_size, hidden_size // 2),
nn.ReLU(),
nn.Dropout(0.1),
nn.Linear(hidden_size // 2, action_dim)
)
# Value head - outputs state value estimate
self.value_head = nn.Sequential(
nn.Linear(hidden_size, value_hidden_size),
nn.ReLU(),
nn.Dropout(0.1),
nn.Linear(value_hidden_size, 1)
)
logger.info(f"Initialized PolicyValueHead with hidden_size={hidden_size}, action_dim={action_dim}")
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Forward pass through policy and value heads.
Args:
hidden_states: Hidden states from base model [batch, seq_len, hidden_size]
Returns:
Tuple of (action_logits, state_values)
"""
# Pool hidden states (mean pooling over sequence)
pooled = hidden_states.mean(dim=1) # [batch, hidden_size]
# Get action logits and values
action_logits = self.policy_head(pooled) # [batch, action_dim]
state_values = self.value_head(pooled) # [batch, 1]
return action_logits, state_values
class RLVoiceModel(nn.Module):
"""
RL-compatible wrapper for voice models.
Wraps a HuggingFace voice model and adds policy/value heads
for reinforcement learning training.
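
    Example (usage sketch; the wav2vec2 checkpoint and the ``waveform`` input
    below are illustrative, not prescriptive):

        >>> from transformers import Wav2Vec2Model                           # doctest: +SKIP
        >>> base = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")   # doctest: +SKIP
        >>> model = RLVoiceModel(base, hidden_size=768, action_dim=256)      # doctest: +SKIP
        >>> actions, log_probs, values = model.sample_action(waveform)       # doctest: +SKIP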
"""
def __init__(
self,
base_model: nn.Module,
hidden_size: int,
action_dim: int = 256,
action_representation: str = "discrete"
):
"""
Initialize RL voice model wrapper.
Args:
base_model: Base voice model (e.g., wav2vec2)
hidden_size: Hidden size of base model
action_dim: Dimensionality of action space
action_representation: "discrete" or "continuous"
"""
super().__init__()
self.base_model = base_model
self.hidden_size = hidden_size
self.action_dim = action_dim
self.action_representation = action_representation
# Add policy and value heads
self.policy_value_head = PolicyValueHead(
hidden_size=hidden_size,
action_dim=action_dim
)
logger.info(f"Initialized RLVoiceModel with action_representation={action_representation}")
def forward(
self,
input_features: torch.Tensor,
return_hidden_states: bool = False,
**kwargs
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
"""
Forward pass for RL training.
Args:
input_features: Input audio features [batch, seq_len, features]
return_hidden_states: Whether to return base model hidden states
**kwargs: Additional arguments for base model
Returns:
Tuple of (log_probs, values, hidden_states)
"""
# Get base model outputs
base_outputs = self.base_model(input_features, **kwargs)
# Extract hidden states
if hasattr(base_outputs, 'last_hidden_state'):
hidden_states = base_outputs.last_hidden_state
elif isinstance(base_outputs, torch.Tensor):
hidden_states = base_outputs
else:
hidden_states = base_outputs[0]
# Get policy and value outputs
action_logits, state_values = self.policy_value_head(hidden_states)
# Compute log probabilities
if self.action_representation == "discrete":
log_probs = F.log_softmax(action_logits, dim=-1)
        else:
            # For continuous actions, the head output is used as the action
            # mean; it is returned in the "log_probs" slot for API symmetry.
            log_probs = action_logits
if return_hidden_states:
return log_probs, state_values, hidden_states
else:
return log_probs, state_values, None
def sample_action(
self,
input_features: torch.Tensor,
deterministic: bool = False
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Sample actions from the policy.
Args:
input_features: Input audio features
deterministic: If True, take most likely action
Returns:
Tuple of (actions, log_probs, values)
"""
log_probs, values, _ = self.forward(input_features)
if self.action_representation == "discrete":
if deterministic:
actions = log_probs.argmax(dim=-1)
else:
# Sample from categorical distribution
probs = torch.exp(log_probs)
actions = torch.multinomial(probs, num_samples=1).squeeze(-1)
# Get log prob of selected actions
action_log_probs = log_probs.gather(-1, actions.unsqueeze(-1)).squeeze(-1)
        else:
            # For continuous actions, treat the head output as the mean of a
            # Gaussian policy with a fixed exploration std of 0.1.
            action_std = 0.1
            if deterministic:
                actions = log_probs
            else:
                actions = log_probs + torch.randn_like(log_probs) * action_std
            # Log probability under that Gaussian (normalizing constants are
            # omitted; they cancel in PPO-style probability ratios).
            action_log_probs = -0.5 * (((actions - log_probs) / action_std) ** 2).sum(dim=-1)
return actions, action_log_probs, values
def evaluate_actions(
self,
input_features: torch.Tensor,
actions: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Evaluate actions (for PPO training).
Args:
input_features: Input audio features
actions: Actions to evaluate
Returns:
Tuple of (log_probs, values, entropy)
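
        Example (PPO-style usage sketch; ``old_log_probs`` and ``advantages``
        are assumed to come from a rollout buffer, and the 0.8/1.2 clip range
        is illustrative):

            >>> new_log_probs, values, entropy = model.evaluate_actions(features, actions)  # doctest: +SKIP
            >>> ratio = torch.exp(new_log_probs - old_log_probs)                            # doctest: +SKIP
            >>> policy_loss = -torch.min(ratio * advantages,
            ...                          ratio.clamp(0.8, 1.2) * advantages).mean()         # doctest: +SKIP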
"""
log_probs, values, _ = self.forward(input_features)
if self.action_representation == "discrete":
# Get log probs of given actions
action_log_probs = log_probs.gather(-1, actions.unsqueeze(-1)).squeeze(-1)
# Compute entropy
probs = torch.exp(log_probs)
entropy = -(probs * log_probs).sum(dim=-1).mean()
        else:
            # For continuous actions: Gaussian policy with the same fixed std
            # used in sample_action, constants omitted to match its log-prob.
            action_std = 0.1
            action_log_probs = -0.5 * (((actions - log_probs) / action_std) ** 2).sum(dim=-1)
            # Entropy of a diagonal Gaussian: 0.5 * D * (1 + log(2 * pi * std^2))
            entropy = torch.tensor(
                0.5 * log_probs.shape[-1] * (1.0 + math.log(2.0 * math.pi * action_std ** 2)),
                device=log_probs.device
            )
return action_log_probs, values.squeeze(-1), entropy
def get_base_model(self) -> nn.Module:
"""Get the underlying base model."""
return self.base_model
def freeze_base_model(self) -> None:
"""Freeze base model parameters (only train policy/value heads)."""
for param in self.base_model.parameters():
param.requires_grad = False
logger.info("Froze base model parameters")
def unfreeze_base_model(self) -> None:
"""Unfreeze base model parameters."""
for param in self.base_model.parameters():
param.requires_grad = True
logger.info("Unfroze base model parameters")
class SequentialVoicePolicy(nn.Module):
"""
Sequential policy for frame-by-frame voice generation.
For autoregressive voice generation where each frame is an action.
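
    Example (sketch; any encoder that maps inputs to [batch, seq, hidden]
    features is assumed to work as ``base_model``):

        >>> policy = SequentialVoicePolicy(encoder, hidden_size=256, frame_size=80)  # doctest: +SKIP
        >>> frames, log_probs, values = policy(features, num_frames=20)              # doctest: +SKIP
        >>> # frames: [batch, 20, 80], log_probs: [batch, 20], values: [batch, 1]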
"""
def __init__(
self,
base_model: nn.Module,
hidden_size: int,
frame_size: int = 80, # e.g., 80-dim mel spectrogram
max_seq_len: int = 1000
):
"""
Initialize sequential voice policy.
Args:
base_model: Base model for processing context
hidden_size: Hidden size
frame_size: Size of each output frame
max_seq_len: Maximum sequence length
"""
super().__init__()
self.base_model = base_model
self.hidden_size = hidden_size
self.frame_size = frame_size
self.max_seq_len = max_seq_len
# Frame generation network
self.frame_generator = nn.LSTM(
input_size=hidden_size + frame_size,
hidden_size=hidden_size,
num_layers=2,
batch_first=True
)
# Output projection
self.output_projection = nn.Linear(hidden_size, frame_size)
# Value network
self.value_net = nn.Sequential(
nn.Linear(hidden_size, hidden_size // 2),
nn.ReLU(),
nn.Linear(hidden_size // 2, 1)
)
logger.info(f"Initialized SequentialVoicePolicy with frame_size={frame_size}")
def forward(
self,
input_features: torch.Tensor,
previous_frames: Optional[torch.Tensor] = None,
num_frames: int = 10
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Generate sequence of frames.
Args:
input_features: Input conditioning features
previous_frames: Previous generated frames (for autoregression)
num_frames: Number of frames to generate
Returns:
Tuple of (generated_frames, log_probs, values)
"""
batch_size = input_features.shape[0]
# Get context from base model
base_outputs = self.base_model(input_features)
        if hasattr(base_outputs, 'last_hidden_state'):
            context = base_outputs.last_hidden_state.mean(dim=1)  # [batch, hidden]
        else:
            # Handle plain tensors or tuple outputs from the base model
            feats = base_outputs if isinstance(base_outputs, torch.Tensor) else base_outputs[0]
            context = feats.mean(dim=1) if feats.dim() > 2 else feats
# Initialize
if previous_frames is None:
current_frame = torch.zeros(batch_size, self.frame_size, device=input_features.device)
else:
current_frame = previous_frames[:, -1]
hidden = None
generated_frames = []
log_probs = []
# Generate frames autoregressively
for t in range(num_frames):
# Combine context and previous frame
lstm_input = torch.cat([context, current_frame], dim=-1).unsqueeze(1)
# LSTM step
lstm_out, hidden = self.frame_generator(lstm_input, hidden)
# Project to frame
frame_logits = self.output_projection(lstm_out.squeeze(1))
# Sample frame (treat as continuous output)
current_frame = torch.tanh(frame_logits) # Bound to [-1, 1]
            # Log prob placeholder (simplified): squared-norm penalty on the
            # pre-tanh output, not a true density over the generated frame
            frame_log_prob = -0.5 * (frame_logits ** 2).sum(dim=-1)
generated_frames.append(current_frame)
log_probs.append(frame_log_prob)
# Stack results
generated_frames = torch.stack(generated_frames, dim=1) # [batch, num_frames, frame_size]
log_probs = torch.stack(log_probs, dim=1) # [batch, num_frames]
# Compute values
values = self.value_net(context) # [batch, 1]
return generated_frames, log_probs, values
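

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the library API): a tiny dummy
    # encoder stands in for a real voice model so the RL heads can be
    # exercised end to end; all sizes below are illustrative.
    class _DummyEncoder(nn.Module):
        def __init__(self, input_dim: int = 40, hidden_size: int = 64):
            super().__init__()
            self.proj = nn.Linear(input_dim, hidden_size)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # Plain hidden states of shape [batch, seq_len, hidden_size]
            return torch.relu(self.proj(x))

    model = RLVoiceModel(_DummyEncoder(), hidden_size=64, action_dim=16)
    model.freeze_base_model()  # train only the policy/value heads

    dummy_audio = torch.randn(2, 100, 40)  # [batch, seq_len, features]
    actions, action_log_probs, values = model.sample_action(dummy_audio)
    print(actions.shape, action_log_probs.shape, values.shape)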