Spaces:
Running
on
T4
Running
on
T4
# coding=utf-8
# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BARK model generation configuration."""
import copy | |
from typing import Dict | |
from ...generation.configuration_utils import GenerationConfig | |
from ...utils import logging | |
logger = logging.get_logger(__name__) | |
class BarkSemanticGenerationConfig(GenerationConfig):
    model_type = "semantic"

    def __init__(
        self,
        eos_token_id=10_000,
        renormalize_logits=True,
        max_new_tokens=768,
        output_scores=False,
        return_dict_in_generate=False,
        output_hidden_states=False,
        output_attentions=False,
        temperature=1.0,
        do_sample=False,
        text_encoding_offset=10_048,
        text_pad_token=129_595,
        semantic_infer_token=129_599,
        semantic_vocab_size=10_000,
        max_input_semantic_length=256,
        semantic_rate_hz=49.9,
        **kwargs,
    ):
        """Generation configuration used by [`BarkSemanticModel`].

        This class inherits from [`GenerationConfig`]; the generic generation options listed below are forwarded to
        the parent initializer, while the Bark-specific semantic options are stored as attributes on the instance.
        Refer to the [`GenerationConfig`] documentation for details on the inherited options.

        Args:
            eos_token_id (`int`, *optional*, defaults to 10_000):
                Id of the *end-of-sequence* token. The same value is also stored as `semantic_pad_token`.
            renormalize_logits (`bool`, *optional*, defaults to `True`):
                Whether to renormalize the logits once all logits processors or warpers (custom ones included) have
                been applied. Setting this to `True` is highly recommended: the search algorithms assume normalized
                score logits, but some processors or warpers break the normalization.
            max_new_tokens (`int`, *optional*, defaults to 768):
                Maximum number of tokens to generate, not counting the tokens of the prompt.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether to return a [`~utils.ModelOutput`] rather than a plain tuple.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more details.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return the attention tensors of all attention layers. See `attentions` under returned
                tensors for more details.
            temperature (`float`, *optional*, defaults to 1.0):
                Value used to modulate the next token probabilities.
            do_sample (`bool`, *optional*, defaults to `False`):
                Whether to use sampling; greedy decoding is used otherwise.
            text_encoding_offset (`int`, *optional*, defaults to 10_048):
                Text encoding offset.
            text_pad_token (`int`, *optional*, defaults to 129_595):
                Text pad token.
            semantic_infer_token (`int`, *optional*, defaults to 129_599):
                Semantic infer token.
            semantic_vocab_size (`int`, *optional*, defaults to 10_000):
                Semantic vocab size.
            max_input_semantic_length (`int`, *optional*, defaults to 256):
                Max length of semantic input vector.
            semantic_rate_hz (`float`, *optional*, defaults to 49.9):
                Semantic rate in Hertz.
            **kwargs:
                Any further keyword arguments, passed through to the [`GenerationConfig`] initializer.
        """
        # Generic options owned by the parent class; everything else in
        # `kwargs` is handed over to it unchanged.
        generation_kwargs = {
            "temperature": temperature,
            "do_sample": do_sample,
            "eos_token_id": eos_token_id,
            "renormalize_logits": renormalize_logits,
            "max_new_tokens": max_new_tokens,
            "output_scores": output_scores,
            "return_dict_in_generate": return_dict_in_generate,
            "output_hidden_states": output_hidden_states,
            "output_attentions": output_attentions,
        }
        super().__init__(**generation_kwargs, **kwargs)

        # Bark-specific settings of the semantic stage.
        self.text_encoding_offset = text_encoding_offset
        self.text_pad_token = text_pad_token
        # The semantic pad token reuses the end-of-sequence id.
        self.semantic_pad_token = eos_token_id
        self.semantic_infer_token = semantic_infer_token
        self.semantic_vocab_size = semantic_vocab_size
        self.max_input_semantic_length = max_input_semantic_length
        self.semantic_rate_hz = semantic_rate_hz
class BarkCoarseGenerationConfig(GenerationConfig):
    model_type = "coarse_acoustics"

    def __init__(
        self,
        renormalize_logits=True,
        output_scores=False,
        return_dict_in_generate=False,
        output_hidden_states=False,
        output_attentions=False,
        temperature=1.0,
        do_sample=False,
        coarse_semantic_pad_token=12_048,
        coarse_rate_hz=75,
        n_coarse_codebooks=2,
        coarse_infer_token=12_050,
        max_coarse_input_length=256,
        max_coarse_history: int = 630,
        sliding_window_len: int = 60,
        **kwargs,
    ):
        """Generation configuration used by [`BarkCoarseModel`].

        This class inherits from [`GenerationConfig`]; the generic generation options listed below are forwarded to
        the parent initializer, while the Bark-specific coarse options are stored as attributes on the instance.
        Refer to the [`GenerationConfig`] documentation for details on the inherited options.

        Args:
            renormalize_logits (`bool`, *optional*, defaults to `True`):
                Whether to renormalize the logits once all logits processors or warpers (custom ones included) have
                been applied. Setting this to `True` is highly recommended: the search algorithms assume normalized
                score logits, but some processors or warpers break the normalization.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether to return a [`~utils.ModelOutput`] rather than a plain tuple.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more details.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return the attention tensors of all attention layers. See `attentions` under returned
                tensors for more details.
            temperature (`float`, *optional*, defaults to 1.0):
                Value used to modulate the next token probabilities.
            do_sample (`bool`, *optional*, defaults to `False`):
                Whether to use sampling; greedy decoding is used otherwise.
            coarse_semantic_pad_token (`int`, *optional*, defaults to 12_048):
                Coarse semantic pad token.
            coarse_rate_hz (`int`, *optional*, defaults to 75):
                Coarse rate in Hertz.
            n_coarse_codebooks (`int`, *optional*, defaults to 2):
                Number of coarse codebooks.
            coarse_infer_token (`int`, *optional*, defaults to 12_050):
                Coarse infer token.
            max_coarse_input_length (`int`, *optional*, defaults to 256):
                Max length of input coarse vector.
            max_coarse_history (`int`, *optional*, defaults to 630):
                Max length of the output of the coarse acoustics model used in the fine generation step.
            sliding_window_len (`int`, *optional*, defaults to 60):
                The coarse generation step uses a sliding window to generate raw audio.
            **kwargs:
                Any further keyword arguments, passed through to the [`GenerationConfig`] initializer.
        """
        # Generic options owned by the parent class; everything else in
        # `kwargs` is handed over to it unchanged.
        generation_kwargs = {
            "temperature": temperature,
            "do_sample": do_sample,
            "renormalize_logits": renormalize_logits,
            "output_scores": output_scores,
            "return_dict_in_generate": return_dict_in_generate,
            "output_hidden_states": output_hidden_states,
            "output_attentions": output_attentions,
        }
        super().__init__(**generation_kwargs, **kwargs)

        # Bark-specific settings of the coarse acoustics stage.
        self.coarse_semantic_pad_token = coarse_semantic_pad_token
        self.coarse_rate_hz = coarse_rate_hz
        self.n_coarse_codebooks = n_coarse_codebooks
        self.coarse_infer_token = coarse_infer_token
        self.max_coarse_input_length = max_coarse_input_length
        self.max_coarse_history = max_coarse_history
        self.sliding_window_len = sliding_window_len
class BarkFineGenerationConfig(GenerationConfig):
    model_type = "fine_acoustics"

    def __init__(
        self,
        temperature=1.0,
        max_fine_history_length=512,
        max_fine_input_length=1024,
        n_fine_codebooks=8,
        **kwargs,
    ):
        """Generation configuration used by [`BarkFineModel`].

        [`BarkFineModel`] is an autoencoder model and is therefore not usually used for generation on its own.
        Under the hood, however, it relies on `temperature` when it is driven by [`BarkModel`].

        This class inherits from [`GenerationConfig`]; refer to the [`GenerationConfig`] documentation for details
        on the inherited options.

        Args:
            temperature (`float`, *optional*, defaults to 1.0):
                Value used to modulate the next token probabilities.
            max_fine_history_length (`int`, *optional*, defaults to 512):
                Max length of the fine history vector.
            max_fine_input_length (`int`, *optional*, defaults to 1024):
                Max length of fine input vector.
            n_fine_codebooks (`int`, *optional*, defaults to 8):
                Number of codebooks used.
            **kwargs:
                Accepted so that extra keys do not raise, but not forwarded to the parent initializer.
        """
        # Only `temperature` reaches the parent class; `kwargs` is
        # deliberately left out of this call.
        super().__init__(temperature=temperature)

        # Bark-specific settings of the fine acoustics stage.
        self.max_fine_history_length = max_fine_history_length
        self.max_fine_input_length = max_fine_input_length
        self.n_fine_codebooks = n_fine_codebooks

    def validate(self, **kwargs):
        """
        No-op override of `GenerationConfig.validate`: `BarkFineGenerationConfig` does not use any parameter other
        than `temperature`, so nothing is checked here.
        """
class BarkGenerationConfig(GenerationConfig):
    model_type = "bark"
    is_composition = True

    # TODO (joao): nested from_dict

    def __init__(
        self,
        semantic_config: Dict = None,
        coarse_acoustics_config: Dict = None,
        fine_acoustics_config: Dict = None,
        sample_rate=24_000,
        codebook_size=1024,
        **kwargs,
    ):
        """Class that holds a generation configuration for [`BarkModel`].

        The [`BarkModel`] does not have a `generate` method, but uses this class to generate speeches with a nested
        [`BarkGenerationConfig`] which uses [`BarkSemanticGenerationConfig`], [`BarkCoarseGenerationConfig`],
        [`BarkFineGenerationConfig`].

        This configuration inherits from [`GenerationConfig`] and can be used to control the model generation. Read
        the documentation from [`GenerationConfig`] for more information.

        Args:
            semantic_config (`Dict`, *optional*):
                Semantic generation configuration, given as keyword arguments for
                [`BarkSemanticGenerationConfig`]. Defaults are used when `None`.
            coarse_acoustics_config (`Dict`, *optional*):
                Coarse generation configuration, given as keyword arguments for
                [`BarkCoarseGenerationConfig`]. Defaults are used when `None`.
            fine_acoustics_config (`Dict`, *optional*):
                Fine generation configuration, given as keyword arguments for
                [`BarkFineGenerationConfig`]. Defaults are used when `None`.
            sample_rate (`int`, *optional*, defaults to 24_000):
                Sample rate.
            codebook_size (`int`, *optional*, defaults to 1024):
                Vector length for each codebook.
            **kwargs:
                Accepted for signature compatibility; not used by this initializer.
        """
        # NOTE(review): the parent initializer is not invoked here and `kwargs`
        # is ignored, so only the attributes set below exist on the instance.
        # This is preserved as-is because `to_dict` serializes `__dict__`.
        if semantic_config is None:
            semantic_config = {}
            logger.info("semantic_config is None. initializing the semantic model with default values.")

        if coarse_acoustics_config is None:
            coarse_acoustics_config = {}
            logger.info("coarse_acoustics_config is None. initializing the coarse model with default values.")

        if fine_acoustics_config is None:
            fine_acoustics_config = {}
            logger.info("fine_acoustics_config is None. initializing the fine model with default values.")

        # Each sub-configuration is rebuilt from its dictionary form.
        self.semantic_config = BarkSemanticGenerationConfig(**semantic_config)
        self.coarse_acoustics_config = BarkCoarseGenerationConfig(**coarse_acoustics_config)
        self.fine_acoustics_config = BarkFineGenerationConfig(**fine_acoustics_config)

        self.sample_rate = sample_rate
        self.codebook_size = codebook_size

    # Alternate constructor: it receives `cls` and must therefore be a
    # classmethod so it can be called directly on the class.
    @classmethod
    def from_sub_model_configs(
        cls,
        semantic_config: BarkSemanticGenerationConfig,
        coarse_acoustics_config: BarkCoarseGenerationConfig,
        fine_acoustics_config: BarkFineGenerationConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`BarkGenerationConfig`] (or a derived class) from bark sub-models generation configuration.

        Args:
            semantic_config ([`BarkSemanticGenerationConfig`]):
                Generation configuration of the semantic sub-model.
            coarse_acoustics_config ([`BarkCoarseGenerationConfig`]):
                Generation configuration of the coarse acoustics sub-model.
            fine_acoustics_config ([`BarkFineGenerationConfig`]):
                Generation configuration of the fine acoustics sub-model.
            **kwargs:
                Additional keyword arguments forwarded to the class initializer.

        Returns:
            [`BarkGenerationConfig`]: An instance of a configuration object
        """
        # The initializer expects plain dictionaries, so every sub-config is
        # serialized before being handed over.
        return cls(
            semantic_config=semantic_config.to_dict(),
            coarse_acoustics_config=coarse_acoustics_config.to_dict(),
            fine_acoustics_config=fine_acoustics_config.to_dict(),
            **kwargs,
        )

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, with the
            three nested sub-configurations converted to dictionaries and a `model_type` entry added.
        """
        output = copy.deepcopy(self.__dict__)

        # Replace the nested configuration objects with their dictionary form.
        output["semantic_config"] = self.semantic_config.to_dict()
        output["coarse_acoustics_config"] = self.coarse_acoustics_config.to_dict()
        output["fine_acoustics_config"] = self.fine_acoustics_config.to_dict()

        output["model_type"] = self.__class__.model_type
        return output