import os
from dataclasses import dataclass, field

import torch
from transformers import PretrainedConfig


@dataclass
class CosyVoice2LLMConfig:
    architectures: list[str] = field(default_factory=lambda: ["Qwen2ForCausalLM"])
    attention_dropout: float = 0.0
    bos_token_id: int = 151643
    eos_token_id: int = 6561  # speech eos
    hidden_act: str = "silu"
    hidden_size: int = 896
    initializer_range: float = 0.02
    intermediate_size: int = 4864
    max_position_embeddings: int = 32768
    max_window_layers: int = 24
    model_type: str = "qwen2"
    num_attention_heads: int = 14
    num_hidden_layers: int = 24
    num_key_value_heads: int = 2
    head_dim: int = 64
    rms_norm_eps: float = 1e-06
    rope_scaling: dict | None = None
    rope_theta: float = 1000000.0
    sliding_window: int = 32768
    tie_word_embeddings: bool = False
    torch_dtype: torch.dtype = torch.bfloat16
    transformers_version: str = "4.52.0.dev0"
    use_cache: bool = True
    use_sliding_window: bool = False
    vocab_size: int = 158500  # text_vocab_size + speech_vocab_size + 2 (eos and task_id)
    text_vocab_size: int = 151936
    speech_vocab_size: int = 6562  # actually 6564; we only target non-streaming inference, so tokens 6562 and 6563, which are used only for streaming TTS, are cut off
    lm_head_bias: bool = True
    qkv_bias: bool = True
    fp16_flow: bool = True  # run the flow-matching (token-to-mel) model in fp16
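

# A hedged sketch, not part of the original file: the fields above mirror a
# Hugging Face Qwen2 config, so the dataclass can seed a transformers
# Qwen2Config for offline sanity checks. Extra fields such as fp16_flow are
# simply stored as attributes by PretrainedConfig; the helper name is ours.
def _to_qwen2_config_sketch(cfg: CosyVoice2LLMConfig):
    from dataclasses import asdict

    from transformers import Qwen2Config

    return Qwen2Config(**asdict(cfg))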


@dataclass
class SamplingParams:
    temperature: float = 1.0
    min_tokens: int = 2
    max_tokens: int = 64
    ignore_eos: bool = False
    top_k: int = 25
    # RasSampler (repetition aware sampling) parameters
    use_ras: bool = False
    win_size: int = 10  # number of most recent tokens inspected for repetition
    tau_r: float = 0.1  # repetition ratio in the window that triggers re-sampling
    top_p: float = 0.8
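

# A hedged sketch, not part of the original file, of how the RAS parameters
# above are commonly used: repetition aware sampling (per the CosyVoice paper)
# first draws a token with top-k/top-p (nucleus) sampling, then falls back to
# plain top-k sampling when the draw repeats too often in the recent window.
# The repo's actual RasSampler may differ in details.
def _ras_sample_sketch(logits: torch.Tensor, decoded: list[int], params: SamplingParams) -> int:
    probs = torch.softmax(logits / params.temperature, dim=-1)
    topk_probs, topk_ids = probs.topk(params.top_k)
    # nucleus mask: keep tokens whose preceding cumulative mass is <= top_p
    # (the highest-probability token is always kept)
    keep = topk_probs.cumsum(-1) - topk_probs <= params.top_p
    nucleus = topk_probs * keep
    token = topk_ids[torch.multinomial(nucleus / nucleus.sum(), 1)].item()
    window = decoded[-params.win_size:]
    if window and window.count(token) / len(window) >= params.tau_r:
        # too repetitive: re-draw from the full top-k distribution instead
        token = topk_ids[torch.multinomial(topk_probs / topk_probs.sum(), 1)].item()
    return token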


@dataclass
class Config:
    model: str
    max_num_batched_tokens: int = 1572864  # max_num_seqs * max_model_len (1024 * 1536)
    max_num_seqs: int = 1024
    max_model_len: int = 1536  # 15 s prompt + 30 s of generated audio at the 25 Hz audio tokenizer rate
    gpu_memory_utilization: float = 0.9
    tensor_parallel_size: int = 1
    enforce_eager: bool = False
    hf_config: CosyVoice2LLMConfig | PretrainedConfig = field(default_factory=CosyVoice2LLMConfig)
    eos: int = -1
    kvcache_block_size: int = 256
    num_kvcache_blocks: int = -1  # -1: computed at runtime from gpu_memory_utilization
    min_token_text_ratio: int = 2  # lower bound on generated speech tokens per input text token
    max_token_text_ratio: int = 20  # upper bound on generated speech tokens per input text token
    rank: int = 0

    def __post_init__(self):
        assert os.path.isdir(self.model)
        assert self.kvcache_block_size % 256 == 0
        assert 1 <= self.tensor_parallel_size <= 8

        max_pos = getattr(self.hf_config, "max_position_embeddings", 4096)
        self.max_model_len = min(self.max_model_len, max_pos)
        assert self.max_num_batched_tokens >= self.max_model_len
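

if __name__ == "__main__":
    # Minimal smoke test (assumed usage, not part of the original file):
    # pass a model directory to inspect the derived limits. The default "."
    # merely satisfies the isdir assert; a real run needs a checkpoint dir.
    import sys

    cfg = Config(model=sys.argv[1] if len(sys.argv) > 1 else ".")
    print("max_model_len:", cfg.max_model_len)
    print("vocab_size:", cfg.hf_config.vocab_size)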