import os
from dataclasses import dataclass, field
from typing import Dict

from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.layers.bark.model import GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPTConfig
from TTS.tts.models.bark import BarkAudioConfig
from TTS.utils.generic_utils import get_user_data_dir


@dataclass
class BarkConfig(BaseTTSConfig):
    """Bark TTS configuration

    Args:
        model (str): model name that registers the model. Defaults to "bark".
        audio (BarkAudioConfig): audio configuration. Defaults to BarkAudioConfig().
        num_chars (int): number of characters in the alphabet. Defaults to 0.
        semantic_config (GPTConfig): semantic model configuration. Defaults to GPTConfig().
        fine_config (FineGPTConfig): fine model configuration. Defaults to FineGPTConfig().
        coarse_config (GPTConfig): coarse model configuration. Defaults to GPTConfig().
        CONTEXT_WINDOW_SIZE (int): GPT context window size. Defaults to 1024.
        SEMANTIC_RATE_HZ (float): semantic tokens rate in Hz. Defaults to 49.9.
        SEMANTIC_VOCAB_SIZE (int): semantic vocabulary size. Defaults to 10_000.
        CODEBOOK_SIZE (int): encodec codebook size. Defaults to 1024.
        N_COARSE_CODEBOOKS (int): number of coarse codebooks. Defaults to 2.
        N_FINE_CODEBOOKS (int): number of fine codebooks. Defaults to 8.
        COARSE_RATE_HZ (int): coarse tokens rate in Hz. Defaults to 75.
        SAMPLE_RATE (int): sample rate. Defaults to 24_000.
        USE_SMALLER_MODELS (bool): use smaller models. Defaults to False.
        TEXT_ENCODING_OFFSET (int): text encoding offset. Defaults to 10_048.
        SEMANTIC_PAD_TOKEN (int): semantic pad token. Defaults to 10_000.
        TEXT_PAD_TOKEN (int): text pad token. Defaults to 129_595.
        TEXT_EOS_TOKEN (int): text end of sentence token. Defaults to 10_049.
        TEXT_SOS_TOKEN (int): text start of sentence token. Defaults to 10_050.
        SEMANTIC_INFER_TOKEN (int): semantic infer token. Defaults to 129_599.
        COARSE_SEMANTIC_PAD_TOKEN (int): coarse semantic pad token. Defaults to 12_048.
        COARSE_INFER_TOKEN (int): coarse infer token. Defaults to 12_050.
        REMOTE_BASE_URL (str): remote base URL for the model files. Defaults to "https://huggingface.co/erogol/bark/tree/main/".
        REMOTE_MODEL_PATHS (Dict): remote model paths. Defaults to None and is populated in `__post_init__()`.
        LOCAL_MODEL_PATHS (Dict): local model paths. Defaults to None and is populated in `__post_init__()`.
        SMALL_REMOTE_MODEL_PATHS (Dict): small remote model paths. Defaults to None and is populated in `__post_init__()`.
        CACHE_DIR (str): local cache directory for downloaded models. Defaults to get_user_data_dir("tts/suno/bark_v0").
        DEF_SPEAKER_DIR (str): default directory to store speaker prompts for voice cloning. Defaults to get_user_data_dir("tts/bark_v0/speakers").
    """

    model: str = "bark"
    audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
    num_chars: int = 0
    semantic_config: GPTConfig = field(default_factory=GPTConfig)
    fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
    coarse_config: GPTConfig = field(default_factory=GPTConfig)
    CONTEXT_WINDOW_SIZE: int = 1024
    SEMANTIC_RATE_HZ: float = 49.9
    SEMANTIC_VOCAB_SIZE: int = 10_000
    CODEBOOK_SIZE: int = 1024
    N_COARSE_CODEBOOKS: int = 2
    N_FINE_CODEBOOKS: int = 8
    COARSE_RATE_HZ: int = 75
    SAMPLE_RATE: int = 24_000
    USE_SMALLER_MODELS: bool = False

    TEXT_ENCODING_OFFSET: int = 10_048
    SEMANTIC_PAD_TOKEN: int = 10_000
    TEXT_PAD_TOKEN: int = 129_595
    SEMANTIC_INFER_TOKEN: int = 129_599
    COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
    COARSE_INFER_TOKEN: int = 12_050

    REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
    REMOTE_MODEL_PATHS: Dict = None
    LOCAL_MODEL_PATHS: Dict = None
    SMALL_REMOTE_MODEL_PATHS: Dict = None
    CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0"))
    DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers"))

    def __post_init__(self):
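        # Derive the remote download URLs (with checksums), the matching local cache
        # paths, and the smaller-model variants from REMOTE_BASE_URL and CACHE_DIR,
        # then mirror SAMPLE_RATE into the `sample_rate` attribute used elsewhere.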
        self.REMOTE_MODEL_PATHS = {
            "text": {
                "path": os.path.join(self.REMOTE_BASE_URL, "text_2.pt"),
                "checksum": "54afa89d65e318d4f5f80e8e8799026a",
            },
            "coarse": {
                "path": os.path.join(self.REMOTE_BASE_URL, "coarse_2.pt"),
                "checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
            },
            "fine": {
                "path": os.path.join(self.REMOTE_BASE_URL, "fine_2.pt"),
                "checksum": "59d184ed44e3650774a2f0503a48a97b",
            },
        }
        self.LOCAL_MODEL_PATHS = {
            "text": os.path.join(self.CACHE_DIR, "text_2.pt"),
            "coarse": os.path.join(self.CACHE_DIR, "coarse_2.pt"),
            "fine": os.path.join(self.CACHE_DIR, "fine_2.pt"),
            "hubert_tokenizer": os.path.join(self.CACHE_DIR, "tokenizer.pth"),
            "hubert": os.path.join(self.CACHE_DIR, "hubert.pt"),
        }
        self.SMALL_REMOTE_MODEL_PATHS = {
            "text": {"path": os.path.join(self.REMOTE_BASE_URL, "text.pt")},
            "coarse": {"path": os.path.join(self.REMOTE_BASE_URL, "coarse.pt")},
            "fine": {"path": os.path.join(self.REMOTE_BASE_URL, "fine.pt")},
        }
        self.sample_rate = self.SAMPLE_RATE  # pylint: disable=attribute-defined-outside-init
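

# Minimal usage sketch (illustrative only, not part of the library API): build the
# config with its defaults and inspect the paths derived in __post_init__. Actual
# values depend on the local user data directory returned by get_user_data_dir().
if __name__ == "__main__":
    config = BarkConfig()
    print(config.sample_rate)  # 24000, mirrored from SAMPLE_RATE in __post_init__
    print(config.REMOTE_MODEL_PATHS["text"]["path"])  # remote URL of the text model checkpoint
    print(config.LOCAL_MODEL_PATHS["fine"])  # local cache path, e.g. "<CACHE_DIR>/fine_2.pt"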