Spaces:

fishaudio
/

fish-speech-1

Running on A10G

App Files Files Community

fish-speech-1 / fish_speech /models /text2semantic /llama.py

PoTaTo721

Update to V1.4

28c720a 4 months ago

raw

history blame

25.8 kB

	import json
	import math
	from collections import OrderedDict
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Optional

	import torch
	import torch.nn as nn
	from einops import rearrange
	from loguru import logger
	from torch import Tensor
	from torch.nn import functional as F
	from torch.nn.attention import SDPBackend, sdpa_kernel
	from torch.utils.checkpoint import checkpoint
	from transformers import AutoTokenizer

	from fish_speech.conversation import SEMANTIC_TOKEN
	from fish_speech.utils import RankedLogger

	from .lora import LoraConfig, setup_lora

	# Module-level logger, built with rank_zero_only=True. Presumably this means
	# only rank 0 emits messages in distributed runs — confirm in fish_speech.utils.
	log = RankedLogger(__name__, rank_zero_only=True)


	def find_multiple(n: int, k: int) -> int:
	    """Return the smallest multiple of ``k`` that is >= ``n`` (for positive ``k``).

	    ``n`` is returned unchanged when it is already a multiple of ``k``.
	    """
	    remainder = n % k
	    return n if remainder == 0 else n + k - remainder


	@dataclass
	class BaseModelArgs:
	model_type: str = "base"

	vocab_size: int = 32000
	n_layer: int = 32
	n_head: int = 32
	dim: int = 4096
	intermediate_size: int = None
	n_local_heads: int = -1
	head_dim: int = 64
	rope_base: float = 10000
	norm_eps: float = 1e-5
	max_seq_len: int = 2048
	dropout: float = 0.0
	tie_word_embeddings: bool = True
	attention_qkv_bias: bool = False

	# Codebook configs
	codebook_size: int = 160
	num_codebooks: int = 4

	# Gradient checkpointing
	use_gradient_checkpointing: bool = True

	# Initialize the model
	initializer_range: float = 0.02

	def __post_init__(self):
	if self.n_local_heads == -1:
	self.n_local_heads = self.n_head
	if self.intermediate_size is None:
	hidden_dim = 4 * self.dim
	n_hidden = int(2 * hidden_dim / 3)
	self.intermediate_size = find_multiple(n_hidden, 256)
	self.head_dim = self.dim // self.n_head

	@staticmethod
	def from_pretrained(path: str):
	path = Path(path)

	if path.is_dir():
	path = path / "config.json"

	with open(path, "r", encoding="utf-8") as f:
	data = json.load(f)

	match data["model_type"]:
	case "naive":
	cls = NaiveModelArgs
	case "dual_ar":
	cls = DualARModelArgs
	case _:
	raise ValueError(f"Unknown model type: {data['model_type']}")

	return cls(**data)

	def save(self, path: str):
	with open(path, "w") as f:
	json.dump(self.__dict__, f, indent=4, sort_keys=True, ensure_ascii=False)


	@dataclass
	class NaiveModelArgs(BaseModelArgs):
	    """Config for the "naive" model; only overrides the ``model_type`` tag."""

	    model_type: str = "naive"


	@dataclass
	class DualARModelArgs(BaseModelArgs):
	    """Config for the "dual_ar" model; adds the fast-transformer depth."""

	    model_type: str = "dual_ar"
	    # Number of fast-transformer layers.
	    n_fast_layer: int = 4


	class KVCache(nn.Module):
	    """Pre-allocated key/value buffers used during incremental decoding.

	    Both buffers have shape ``(max_batch_size, n_heads, max_seq_len, head_dim)``
	    and start out zero-filled.
	    """

	    def __init__(
	        self, max_batch_size, max_seq_len, n_heads, head_dim, dtype=torch.bfloat16
	    ):
	        super().__init__()
	        shape = (max_batch_size, n_heads, max_seq_len, head_dim)
	        for buffer_name in ("k_cache", "v_cache"):
	            self.register_buffer(buffer_name, torch.zeros(shape, dtype=dtype))

	    def update(self, input_pos, k_val, v_val):
	        """Write new keys/values at ``input_pos`` and return the full caches.

	        The write is in place; the returned tensors are the buffers themselves.
	        """
	        # input_pos: [S], k_val: [B, H, S, D]
	        assert input_pos.shape[0] == k_val.shape[2]

	        self.k_cache[:, :, input_pos] = k_val
	        self.v_cache[:, :, input_pos] = v_val

	        return self.k_cache, self.v_cache


	@dataclass
	class TransformerForwardResult:
	    """Output of the full models: text-token logits plus codebook logits."""

	    # Logits over the text vocabulary.
	    token_logits: Tensor
	    # Logits over the codebook entries.
	    codebook_logits: Tensor


	@dataclass
	class BaseTransformerForwardResult:
	    """Output of the slow transformer: token logits plus its hidden states."""

	    # Logits over the text vocabulary.
	    logits: Tensor
	    # Hidden states handed on to the codebook decoders.
	    hidden_states: Tensor


	class BaseTransformer(nn.Module):
	def __init__(
	self, config: BaseModelArgs, tokenizer: AutoTokenizer, init_weights: bool = True
	) -> None:
	super().__init__()
	self.config = config
	self.tokenizer = tokenizer

	self.semantic_token_id = tokenizer.convert_tokens_to_ids(SEMANTIC_TOKEN)

	# Slow transformer
	self.embeddings = nn.Embedding(
	config.vocab_size,
	config.dim,
	)
	self.codebook_embeddings = nn.Embedding(
	config.codebook_size * config.num_codebooks,
	config.dim,
	)
	self.layers = nn.ModuleList(
	TransformerBlock(config, use_sdpa=True) for _ in range(config.n_layer)
	)
	self.norm = RMSNorm(config.dim, eps=config.norm_eps)

	if self.config.tie_word_embeddings is False:
	self.output = nn.Linear(
	config.dim,
	config.vocab_size,
	bias=False,
	)

	self.register_buffer(
	"freqs_cis",
	precompute_freqs_cis(
	config.max_seq_len,
	config.dim // config.n_head,
	config.rope_base,
	),
	persistent=False,
	)
	self.register_buffer(
	"causal_mask",
	torch.tril(
	torch.ones(
	config.max_seq_len,
	config.max_seq_len,
	dtype=torch.bool,
	)
	),
	persistent=False,
	)

	# For kv cache
	self.max_batch_size = -1
	self.max_seq_len = -1

	if init_weights:
	self.apply(self._init_weights)

	def setup_caches(
	self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
	):
	if self.max_seq_len >= max_seq_len and self.max_batch_size >= max_batch_size:
	return

	head_dim = self.config.dim // self.config.n_head
	max_seq_len = find_multiple(max_seq_len, 8)
	self.max_seq_len = max_seq_len
	self.max_batch_size = max_batch_size

	for b in self.layers:
	b.attention.kv_cache = KVCache(
	max_batch_size,
	max_seq_len,
	self.config.n_local_heads,
	head_dim,
	dtype=dtype,
	)

	def embed(self, x: Tensor) -> Tensor:
	vocab_embeds = [self.embeddings(x[:, 0])]
	for i in range(self.config.num_codebooks):
	emb = self.codebook_embeddings(x[:, i + 1] + i * self.config.codebook_size)
	emb[x[:, 0] != self.semantic_token_id] = 0
	vocab_embeds.append(emb)

	x = torch.stack(vocab_embeds, dim=3)
	x = x.sum(dim=3)

	return x

	def forward(
	self,
	inp: Tensor,
	key_padding_mask: Optional[Tensor] = None,
	) -> BaseTransformerForwardResult:
	seq_len = inp.size(2)

	# Here we want to merge the embeddings of the codebooks
	x = self.embed(inp)

	freqs_cis = self.freqs_cis[:seq_len]

	# Not that the causal mask here follows the definition of scaled_dot_product_attention
	# That is, FALSE means masked out
	# To maintain consistency, key_padding_mask use TRUE to mask out
	mask = None
	if key_padding_mask is not None:
	mask = self.causal_mask[None, None, :seq_len, :seq_len] # (B, N, Q, K)
	mask = mask & key_padding_mask[:, None, None, :].logical_not()

	for layer in self.layers:
	if self.config.use_gradient_checkpointing and self.training:
	x = checkpoint(layer, x, freqs_cis, mask, use_reentrant=True)
	else:
	x = layer(x, freqs_cis, mask)

	# We got slow_out here
	slow_out = self.norm(x)

	if self.config.tie_word_embeddings:
	token_logits = F.linear(slow_out, self.embeddings.weight)
	else:
	token_logits = self.output(slow_out)

	return BaseTransformerForwardResult(
	logits=token_logits,
	hidden_states=x,
	)

	def forward_generate(
	self,
	x: Tensor,
	input_pos: Optional[Tensor] = None,
	return_all: bool = False,
	) -> BaseTransformerForwardResult:
	# This is used for generation, optimized for torch compile
	assert (
	self.max_seq_len != -1 and self.max_batch_size != -1
	), "Please call setup_caches before forward_generate"

	x = self.embed(x)

	mask = self.causal_mask[
	None, None, input_pos, : self.max_seq_len
	] # (B, N, Q, K)
	freqs_cis = self.freqs_cis[input_pos]

	for layer in self.layers:
	x = layer(x, freqs_cis, mask, input_pos=input_pos)

	# If prefill, we only calculate the logits of last token
	if x.size(1) > 1 and not return_all:
	x = x[:, -1:]

	# We got slow_out here
	slow_out = self.norm(x)

	if self.config.tie_word_embeddings:
	token_logits = F.linear(slow_out, self.embeddings.weight)
	else:
	token_logits = self.output(slow_out)

	return BaseTransformerForwardResult(
	logits=token_logits,
	hidden_states=x,
	)

	def _init_weights(self, module):
	std = self.config.initializer_range
	if isinstance(module, nn.Linear):
	module.weight.data.normal_(mean=0.0, std=std)
	if module.bias is not None:
	module.bias.data.zero_()
	elif isinstance(module, nn.Embedding):
	module.weight.data.normal_(mean=0.0, std=std)
	if module.padding_idx is not None:
	module.weight.data[module.padding_idx].zero_()

	@staticmethod
	def from_pretrained(
	path: str,
	load_weights: bool = False,
	max_length: int \| None = None,
	lora_config: LoraConfig \| None = None,
	rope_base: int \| None = None,
	) -> "BaseTransformer":
	config = BaseModelArgs.from_pretrained(str(path))
	if max_length is not None:
	config.max_seq_len = max_length
	log.info(f"Override max_seq_len to {max_length}")

	if rope_base is not None:
	config.rope_base = rope_base
	log.info(f"Override rope_base to {rope_base}")

	match config.model_type:
	case "naive":
	model_cls = NaiveTransformer
	case "dual_ar":
	model_cls = DualARTransformer
	case _:
	raise ValueError(f"Unknown model type: {config.model_type}")

	tokenizer = AutoTokenizer.from_pretrained(str(path))
	log.info(f"Loading model from {path}, config: {config}")
	model = model_cls(config, tokenizer=tokenizer)

	if lora_config is not None:
	setup_lora(model, lora_config)
	log.info(f"LoRA setup: {lora_config}")

	if load_weights is False:
	log.info("Randomly initialized model")
	else:

	if "int8" in str(Path(path)):
	logger.info("Using int8 weight-only quantization!")
	from tools.llama.quantize import WeightOnlyInt8QuantHandler

	simple_quantizer = WeightOnlyInt8QuantHandler(model)
	model = simple_quantizer.convert_for_runtime()

	if "int4" in str(Path(path)):
	logger.info("Using int4 quantization!")
	path_comps = path.name.split("-")
	assert path_comps[-2].startswith("g")
	groupsize = int(path_comps[-2][1:])
	from tools.llama.quantize import WeightOnlyInt4QuantHandler

	simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
	model = simple_quantizer.convert_for_runtime()

	weights = torch.load(
	Path(path) / "model.pth", map_location="cpu", mmap=True
	)

	if "state_dict" in weights:
	logger.warning(
	"Using a TextToSemantic LightningModule checkpoint, "
	"please make sure it is a full model, not a LoRA model."
	)
	weights = weights["state_dict"]

	if next(iter(weights.keys())).startswith("model."):
	logger.info(
	f"Remove prefix 'model.' created by TextToSemantic LightningModule from keys"
	)
	new_weights = OrderedDict()
	for k, v in weights.items():
	new_weights[k.replace("model.", "")] = v
	weights = new_weights

	# Verify the name and shape of parameters since strict=False in load_state_dict.
	for k, v in model.named_parameters():
	if k not in weights:
	logger.warning(f"No weight for {k}")
	elif v.shape != weights[k].shape:
	logger.warning(
	f"Shape mismatch for {k}: {v.shape} vs {weights[k].shape}"
	)

	err = model.load_state_dict(weights, strict=False, assign=True)
	log.info(f"Loaded weights with error: {err}")

	return model

	def save_pretrained(self, path: str, drop_lora: bool = False):
	path = Path(path)
	path.mkdir(parents=True, exist_ok=True)

	self.config.save(path / "config.json")
	state_dict = self.state_dict()

	if drop_lora:
	for key in list(state_dict.keys()):
	if "lora" not in key:
	continue

	state_dict.pop(key)
	log.info(f"Drop LoRA parameter: {key}")

	torch.save(state_dict, path / "model.pth")
	self.tokenizer.save_pretrained(path)


	class NaiveTransformer(BaseTransformer):
	    """Base transformer with one linear head predicting all codebooks at once."""

	    def __init__(self, config: NaiveModelArgs, tokenizer: AutoTokenizer) -> None:
	        # Weight init is deferred until the extra heads below exist.
	        super().__init__(config, init_weights=False, tokenizer=tokenizer)

	        self.codebook_norm = RMSNorm(config.dim, eps=config.norm_eps)
	        total_codebook_entries = config.codebook_size * config.num_codebooks
	        self.codebook_output = nn.Linear(config.dim, total_codebook_entries, bias=False)

	        self.apply(self._init_weights)

	    def decode(self, result: BaseTransformerForwardResult) -> TransformerForwardResult:
	        """Project the slow transformer's hidden states to per-codebook logits."""
	        # Codebook
	        normed_hidden = self.codebook_norm(result.hidden_states)
	        flat_logits = self.codebook_output(normed_hidden)
	        # Split the flat last axis into (codebook, entry).
	        per_codebook_logits = rearrange(
	            flat_logits, "b n (c d) -> b n c d", c=self.config.num_codebooks
	        )

	        return TransformerForwardResult(
	            token_logits=result.logits,
	            codebook_logits=per_codebook_logits,
	        )

	    def forward(
	        self,
	        inp: Tensor,
	        key_padding_mask: Optional[Tensor] = None,
	    ) -> TransformerForwardResult:
	        """Run the base forward pass, then decode codebook logits."""
	        base_result = super().forward(inp=inp, key_padding_mask=key_padding_mask)
	        return self.decode(base_result)

	    def forward_generate(
	        self, x: Tensor, input_pos: Optional[Tensor] = None
	    ) -> TransformerForwardResult:
	        """Run the cached generation pass, then decode codebook logits."""
	        base_result = super().forward_generate(x, input_pos)
	        return self.decode(base_result)


	class DualARTransformer(BaseTransformer):
	    """Base (slow) transformer plus a small fast transformer over codebooks.

	    The fast transformer consumes, per time step, the slow hidden state
	    followed by codebook embeddings, and emits one set of codebook logits per
	    codebook position.
	    """

	    def __init__(self, config: DualARModelArgs, tokenizer: AutoTokenizer) -> None:
	        # ``config.n_fast_layer`` is read below, so a DualARModelArgs is required.
	        # Weight init is deferred until the fast modules exist.
	        super().__init__(config, init_weights=False, tokenizer=tokenizer)

	        # Fast transformer
	        self.fast_embeddings = nn.Embedding(config.codebook_size, config.dim)

	        # The equivalent bs is so large that sdpa doesn't work
	        self.fast_layers = nn.ModuleList(
	            TransformerBlock(config, use_sdpa=False) for _ in range(config.n_fast_layer)
	        )
	        self.fast_norm = RMSNorm(config.dim, eps=config.norm_eps)
	        self.fast_output = nn.Linear(
	            config.dim,
	            config.codebook_size,
	            bias=False,
	        )

	        self.apply(self._init_weights)

	    def setup_caches(
	        self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
	    ):
	        """Set up the slow caches, then (re)create the fast-layer caches.

	        The fast caches are rebuilt on every call, even when the base class
	        decides its own caches are already large enough.
	        """
	        super().setup_caches(max_batch_size, max_seq_len, dtype)

	        head_dim = self.config.dim // self.config.n_head

	        # Fast transformer
	        # The max seq len here is the number of codebooks
	        for b in self.fast_layers:
	            b.attention.kv_cache = KVCache(
	                max_batch_size,
	                self.config.num_codebooks,
	                self.config.n_local_heads,
	                head_dim,
	                dtype=dtype,
	            )

	    def forward(
	        self,
	        inp: Tensor,
	        key_padding_mask: Optional[Tensor] = None,
	    ) -> TransformerForwardResult:
	        """Training/eval pass producing token logits and codebook logits.

	        Args:
	            inp: token ids; dimension 1 holds the text row followed by the
	                codebook rows, dimension 2 is the sequence.
	            key_padding_mask: forwarded to the slow transformer.

	        Returns:
	            ``codebook_logits`` is laid out as ``(b, s, num_codebooks, d)``;
	            rows of time steps that were skipped as padding are all zeros.
	        """
	        parent_result = super().forward(inp, key_padding_mask)
	        token_logits = parent_result.logits
	        x = parent_result.hidden_states

	        # Fast transformer
	        fast_seq_len = self.config.num_codebooks
	        fast_mask = self.causal_mask[
	            None, None, :fast_seq_len, :fast_seq_len
	        ]  # (B, N, Q, K)
	        fast_freqs_cis = self.freqs_cis[:fast_seq_len]

	        # Drop the last token and rotate left
	        # Rows 1..-1 skip the text row and the last codebook; columns 1: shift
	        # the sequence left by one, and the pad restores the original length.
	        codebooks = inp[:, 1:-1, 1:]
	        codebooks = F.pad(codebooks, (0, 1), value=0)
	        codebook_embeddings = self.fast_embeddings(codebooks)
	        # The slow hidden state becomes position 0 of each fast sequence.
	        x = torch.cat([x[:, None], codebook_embeddings], dim=1)
	        b, s = x.size(0), x.size(2)
	        x = rearrange(x, "b n s d -> (b s) n d")  # flatten the batch and seq_len

	        # Remove padded part
	        # A time step counts as padding when every codebook id in it is 0.
	        codebooks = rearrange(codebooks, "b n s -> (b s) n")
	        codebook_mask = (codebooks == 0).all(dim=-1)

	        if torch.all(codebook_mask):
	            # If all codebooks are padded, we keep first 8 to make sure the model runs
	            codebook_mask[:8] = False

	        x_bs, x_len = x.size(0), x.size(1)
	        x = x[~codebook_mask]

	        for layer in self.fast_layers:
	            if self.config.use_gradient_checkpointing and self.training:
	                x = checkpoint(layer, x, fast_freqs_cis, fast_mask, use_reentrant=True)
	            else:
	                x = layer(x, fast_freqs_cis, fast_mask)

	        # unflatten the batch and num_codebooks
	        fast_out = self.fast_norm(x)
	        codebook_logits = self.fast_output(fast_out)

	        # Re-pad the codebook_logits
	        # Scatter the computed rows back; skipped rows stay zero.
	        buffer = torch.zeros(
	            x_bs,
	            x_len,
	            codebook_logits.size(-1),
	            device=codebook_logits.device,
	            dtype=codebook_logits.dtype,
	        )
	        buffer[~codebook_mask] = codebook_logits
	        codebook_logits = buffer

	        assert codebook_logits.shape[1] == self.config.num_codebooks
	        codebook_logits = rearrange(
	            codebook_logits,
	            "(b s) n d -> b s n d",
	            b=b,
	            s=s,
	            n=self.config.num_codebooks,
	        )

	        return TransformerForwardResult(
	            token_logits=token_logits,
	            codebook_logits=codebook_logits,
	        )

	    def forward_generate_fast(
	        self, x: Tensor, input_pos: Optional[Tensor] = None
	    ) -> Tensor:
	        """Run one cached step of the fast transformer.

	        ``x`` is reshaped to ``(1, 1, -1)``, i.e. a single position of a single
	        sequence. Returns the codebook logits for that position.
	        """
	        # Fast transformer
	        x = x.view(1, 1, -1)

	        fast_mask = self.causal_mask[
	            None, None, input_pos, : self.config.num_codebooks
	        ]  # (B, N, Q, K)
	        fast_freqs_cis = self.freqs_cis[input_pos]

	        for layer in self.fast_layers:
	            x = layer(x, fast_freqs_cis, fast_mask, input_pos=input_pos)

	        # unflatten the batch and num_codebooks
	        fast_out = self.fast_norm(x)  # only take the last token
	        codebook_logits = self.fast_output(fast_out)

	        return codebook_logits


	class TransformerBlock(nn.Module):
	    """Pre-norm transformer layer: attention and feed-forward, each residual."""

	    def __init__(self, config: BaseModelArgs, use_sdpa: bool = True) -> None:
	        super().__init__()
	        self.attention = Attention(config, use_sdpa=use_sdpa)
	        self.feed_forward = FeedForward(config)
	        self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
	        self.attention_norm = RMSNorm(config.dim, config.norm_eps)

	    def forward(
	        self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Tensor = None
	    ) -> Tensor:
	        """Apply the attention sub-layer, then the feed-forward sub-layer."""
	        attn_out = self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
	        after_attention = x + attn_out
	        ffn_out = self.feed_forward(self.ffn_norm(after_attention))
	        return after_attention + ffn_out


	class Attention(nn.Module):
	def __init__(self, config: BaseModelArgs, use_sdpa: bool = True):
	super().__init__()
	assert config.dim % config.n_head == 0

	total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
	# key, query, value projections for all heads, but in a batch
	self.wqkv = nn.Linear(
	config.dim, total_head_dim, bias=config.attention_qkv_bias
	)
	self.wo = nn.Linear(config.dim, config.dim, bias=False)
	self.kv_cache = None

	self.dropout = config.dropout
	self.n_head = config.n_head
	self.head_dim = config.head_dim
	self.n_local_heads = config.n_local_heads
	self.dim = config.dim
	self.use_sdpa = use_sdpa
	self._register_load_state_dict_pre_hook(self.load_hook)

	def load_hook(self, state_dict, prefix, *args):
	if prefix + "wq.weight" in state_dict:
	wq = state_dict.pop(prefix + "wq.weight")
	wk = state_dict.pop(prefix + "wk.weight")
	wv = state_dict.pop(prefix + "wv.weight")
	state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])

	def forward(
	self,
	x: Tensor,
	freqs_cis: Tensor,
	mask: Tensor,
	input_pos: Optional[Tensor] = None,
	) -> Tensor:
	bsz, seqlen, _ = x.shape

	kv_size = self.n_local_heads * self.head_dim
	q, k, v = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)

	q = q.view(bsz, seqlen, self.n_head, self.head_dim)
	k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)
	v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)

	q = apply_rotary_emb(q, freqs_cis)
	k = apply_rotary_emb(k, freqs_cis)

	q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))

	if self.kv_cache is not None:
	k, v = self.kv_cache.update(input_pos, k, v)

	k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
	v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)

	if self.use_sdpa:
	if mask is None:
	with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
	y = F.scaled_dot_product_attention(
	q,
	k,
	v,
	dropout_p=self.dropout if self.training else 0.0,
	is_causal=True,
	# No third party attn_mask here to use flash_attention
	)
	else:
	y = F.scaled_dot_product_attention(
	q,
	k,
	v,
	attn_mask=mask,
	dropout_p=self.dropout if self.training else 0.0,
	)
	else:
	y = self.eq_scaled_dot_product_attention(
	q,
	k,
	v,
	attn_mask=mask,
	dropout_p=self.dropout if self.training else 0.0,
	)

	y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)

	return self.wo(y)

	def eq_scaled_dot_product_attention(
	self,
	query,
	key,
	value,
	attn_mask=None,
	dropout_p=0.0,
	) -> torch.Tensor:
	# This is a standard scaled dot product attention
	# It's low efficient, but it doesn't raise cuda error

	L, S = query.size(-2), key.size(-2)
	scale_factor = 1 / math.sqrt(query.size(-1))
	attn_bias = torch.zeros(1, 1, L, S, dtype=query.dtype, device=query.device)

	if attn_mask is not None:
	if attn_mask.dtype == torch.bool:
	attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
	else:
	attn_bias += attn_mask

	attn_weight = query @ key.transpose(-2, -1) * scale_factor
	attn_weight += attn_bias
	attn_weight = torch.softmax(attn_weight, dim=-1)
	attn_weight = torch.dropout(attn_weight, dropout_p, train=True)

	return attn_weight @ value


	class FeedForward(nn.Module):
	    """Gated feed-forward layer: ``w2(silu(w1(x)) * w3(x))``, all bias-free."""

	    def __init__(self, config: BaseModelArgs) -> None:
	        super().__init__()
	        model_dim, hidden_dim = config.dim, config.intermediate_size
	        self.w1 = nn.Linear(model_dim, hidden_dim, bias=False)
	        self.w3 = nn.Linear(model_dim, hidden_dim, bias=False)
	        self.w2 = nn.Linear(hidden_dim, model_dim, bias=False)

	    def forward(self, x: Tensor) -> Tensor:
	        gate = F.silu(self.w1(x))
	        up = self.w3(x)
	        return self.w2(gate * up)


	class RMSNorm(nn.Module):
	    """Root-mean-square normalisation over the last axis with a learned scale."""

	    def __init__(self, dim: int, eps: float = 1e-5):
	        super().__init__()
	        self.eps = eps
	        self.weight = nn.Parameter(torch.ones(dim))

	    def _norm(self, x):
	        # Divide by sqrt(mean(x^2) + eps), computed along the last axis.
	        mean_square = torch.mean(x * x, dim=-1, keepdim=True)
	        inv_rms = torch.rsqrt(mean_square + self.eps)
	        return x * inv_rms

	    def forward(self, x: Tensor) -> Tensor:
	        # Normalise in float32, cast back to the input dtype, then scale.
	        normed = self._norm(x.float())
	        return normed.type_as(x) * self.weight


	def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000) -> Tensor:
	    """Build the rotary-embedding table.

	    Returns a bfloat16 tensor of shape ``(seq_len, n_elem // 2, 2)`` whose last
	    axis holds the (real, imaginary) parts of the unit rotation for each
	    position/frequency pair.
	    """
	    half = n_elem // 2
	    exponents = torch.arange(0, n_elem, 2)[:half].float() / n_elem
	    inv_freq = 1.0 / (base**exponents)

	    positions = torch.arange(seq_len, device=inv_freq.device)
	    angles = torch.outer(positions, inv_freq)

	    # Unit-magnitude complex numbers at the given angles.
	    rotations = torch.polar(torch.ones_like(angles), angles)
	    table = torch.stack([rotations.real, rotations.imag], dim=-1)
	    return table.to(dtype=torch.bfloat16)


	def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
	    """Rotate consecutive feature pairs of ``x`` by the angles in ``freqs_cis``.

	    ``x`` is read as 4-D with the sequence on axis 1 and features on the last
	    axis; ``freqs_cis`` must hold ``seq * (features // 2) * 2`` values laid out
	    as (position, pair, [real, imag]). The result has the shape and dtype of ``x``.
	    """
	    # Group the last axis into (pair, 2) and compute in float32.
	    pairs = x.float().reshape(*x.shape[:-1], -1, 2)
	    # Broadcast the table over the batch and head axes.
	    table = freqs_cis.view(1, pairs.size(1), 1, pairs.size(3), 2)

	    x_even, x_odd = pairs[..., 0], pairs[..., 1]
	    real, imag = table[..., 0], table[..., 1]

	    # Complex multiplication (x_even + i*x_odd) * (real + i*imag).
	    rotated = torch.stack(
	        [
	            x_even * real - x_odd * imag,
	            x_odd * real + x_even * imag,
	        ],
	        -1,
	    )

	    # Merge the (pair, 2) axes back into the feature axis.
	    return rotated.flatten(3).type_as(x)