Spaces:

llizhx
/

TinyGPT-V

Runtime error

TinyGPT-V / modified /generation /configuration_utils.py

Li Zhaoxu

init

122057f 11 months ago

52.4 kB

	# coding=utf-8
	# Copyright 2022 The HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	""" Generation configuration class and utilities."""

	import copy
	import json
	import os
	import warnings
	from typing import Any, Dict, Optional, Union

	from .. import __version__
	from ..configuration_utils import PretrainedConfig
	from ..utils import (
	GENERATION_CONFIG_NAME,
	PushToHubMixin,
	cached_file,
	download_url,
	extract_commit_hash,
	is_remote_url,
	logging,
	)


	# Module-level logger, obtained through the library's own `logging` helper imported from `..utils`.
	logger = logging.get_logger(__name__)
	# Names of the bookkeeping attributes of a generation config, as opposed to its generation parameters.
	# NOTE(review): presumably these are the keys dropped when serializing with `ignore_metadata=True`; the code that
	# consumes this tuple is not visible in this part of the file -- confirm.
	METADATA_FIELDS = ("_from_model_config", "_commit_hash", "_original_object_hash", "transformers_version")


	class GenerationConfig(PushToHubMixin):
	# no-format
	r"""
	Class that holds a configuration for a generation task. A `generate` call supports the following generation methods
	for text-decoder, text-to-text, speech-to-text, and vision-to-text models:

	- greedy decoding by calling [`~generation.GenerationMixin.greedy_search`] if `num_beams=1` and
	`do_sample=False`
	- contrastive search by calling [`~generation.GenerationMixin.contrastive_search`] if `penalty_alpha>0.`
	and `top_k>1`
	- multinomial sampling by calling [`~generation.GenerationMixin.sample`] if `num_beams=1` and
	`do_sample=True`
	- beam-search decoding by calling [`~generation.GenerationMixin.beam_search`] if `num_beams>1` and
	`do_sample=False`
	- beam-search multinomial sampling by calling [`~generation.GenerationMixin.beam_sample`] if
	`num_beams>1` and `do_sample=True`
	- diverse beam-search decoding by calling [`~generation.GenerationMixin.group_beam_search`], if
	`num_beams>1` and `num_beam_groups>1`
	- constrained beam-search decoding by calling [`~generation.GenerationMixin.constrained_beam_search`], if
	`constraints!=None` or `force_words_ids!=None`
	- assisted decoding by calling [`~generation.GenerationMixin.assisted_decoding`], if
	`assistant_model` is passed to `.generate()`

	You do not need to call any of the above methods directly. Pass custom parameter values to '.generate()'. To learn
	more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).

	<Tip>

	A large number of these flags control the logits or the stopping criteria of the generation. Make sure you check
	the [generate-related classes](https://huggingface.co/docs/transformers/internal/generation_utils) for a full
	description of the possible manipulations, as well as examples of their usage.

	</Tip>

	Arg:
	> Parameters that control the length of the output

	max_length (`int`, optional, defaults to 20):
	The maximum length the generated tokens can have. Corresponds to the length of the input prompt +
	`max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
	max_new_tokens (`int`, optional):
	The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.
	min_length (`int`, optional, defaults to 0):
	The minimum length of the sequence to be generated. Corresponds to the length of the input prompt +
	`min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.
	min_new_tokens (`int`, optional):
	The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.
	early_stopping (`bool` or `str`, optional, defaults to `False`):
	Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values:
	`True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an
	heuristic is applied and the generation stops when it is very unlikely to find better candidates;
	`"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical
	beam search algorithm).
	max_time(`float`, optional):
	The maximum amount of time you allow the computation to run for in seconds. generation will still finish
	the current pass after allocated time has been passed.

	> Parameters that control the generation strategy used

	do_sample (`bool`, optional, defaults to `False`):
	Whether or not to use sampling ; use greedy decoding otherwise.
	num_beams (`int`, optional, defaults to 1):
	Number of beams for beam search. 1 means no beam search.
	num_beam_groups (`int`, optional, defaults to 1):
	Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
	See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
	penalty_alpha (`float`, optional):
	The values balance the model confidence and the degeneration penalty in contrastive search decoding.
	use_cache (`bool`, optional, defaults to `True`):
	Whether or not the model should use the past last key/values attentions (if applicable to the model) to
	speed up decoding.

	> Parameters for manipulation of the model output logits

	temperature (`float`, optional, defaults to 1.0):
	The value used to modulate the next token probabilities.
	top_k (`int`, optional, defaults to 50):
	The number of highest probability vocabulary tokens to keep for top-k-filtering.
	top_p (`float`, optional, defaults to 1.0):
	If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to
	`top_p` or higher are kept for generation.
	typical_p (`float`, optional, defaults to 1.0):
	Local typicality measures how similar the conditional probability of predicting a target token next is to
	the expected conditional probability of predicting a random token next, given the partial text already
	generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that
	add up to `typical_p` or higher are kept for generation. See [this
	paper](https://arxiv.org/pdf/2202.00666.pdf) for more details.
	epsilon_cutoff (`float`, optional, defaults to 0.0):
	If set to float strictly between 0 and 1, only tokens with a conditional probability greater than
	`epsilon_cutoff` will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the
	size of the model. See [Truncation Sampling as Language Model
	Desmoothing](https://arxiv.org/abs/2210.15191) for more details.
	eta_cutoff (`float`, optional, defaults to 0.0):
	Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between
	0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) *
	exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token
	probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3,
	depending on the size of the model. See [Truncation Sampling as Language Model
	Desmoothing](https://arxiv.org/abs/2210.15191) for more details.
	diversity_penalty (`float`, optional, defaults to 0.0):
	This value is subtracted from a beam's score if it generates a token same as any beam from other group at a
	particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
	repetition_penalty (`float`, optional, defaults to 1.0):
	The parameter for repetition penalty. 1.0 means no penalty. See [this
	paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
	encoder_repetition_penalty (`float`, optional, defaults to 1.0):
	The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
	original input. 1.0 means no penalty.
	length_penalty (`float`, optional, defaults to 1.0):
	Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
	the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
	likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
	`length_penalty` < 0.0 encourages shorter sequences.
	no_repeat_ngram_size (`int`, optional, defaults to 0):
	If set to int > 0, all ngrams of that size can only occur once.
	bad_words_ids(`List[List[int]]`, optional):
	List of list of token ids that are not allowed to be generated. Check
	[`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples.
	force_words_ids(`List[List[int]]` or `List[List[List[int]]]`, optional):
	List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of
	words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`, this
	triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one
	can allow different forms of each word.
	renormalize_logits (`bool`, optional, defaults to `False`):
	Whether to renormalize the logits after applying all the logits processors or warpers (including the custom
	ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits
	are normalized but some logit processors or warpers break the normalization.
	constraints (`List[Constraint]`, optional):
	Custom constraints that can be added to the generation to ensure that the output will contain the use of
	certain tokens as defined by `Constraint` objects, in the most sensible way possible.
	forced_bos_token_id (`int`, optional, defaults to `model.config.forced_bos_token_id`):
	The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
	multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
	language token.
	forced_eos_token_id (`Union[int, List[int]]`, optional, defaults to `model.config.forced_eos_token_id`):
	The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a
	list to set multiple end-of-sequence tokens.
	remove_invalid_values (`bool`, optional, defaults to `model.config.remove_invalid_values`):
	Whether to remove possible nan and inf outputs of the model to prevent the generation method to crash.
	Note that using `remove_invalid_values` can slow down generation.
	exponential_decay_length_penalty (`tuple(int, float)`, optional):
	This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
	generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where
	penalty starts and `decay_factor` represents the factor of exponential decay
	suppress_tokens (`List[int]`, optional):
	A list of tokens that will be suppressed at generation. The `SuppressTokens` logit processor will set their
	log probs to `-inf` so that they are not sampled.
	begin_suppress_tokens (`List[int]`, optional):
	A list of tokens that will be suppressed at the beginning of the generation. The `SuppressBeginTokens` logit
	processor will set their log probs to `-inf` so that they are not sampled.
	forced_decoder_ids (`List[List[int]]`, optional):
	A list of pairs of integers which indicates a mapping from generation indices to token indices that will be
	forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token
	of index 123.
	sequence_bias (`Dict[Tuple[int], float]`, optional):
	Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the
	sequence being selected, while negative biases do the opposite. Check
	[`~generation.SequenceBiasLogitsProcessor`] for further documentation and examples.
	guidance_scale (`float`, optional):
	The guidance scale for classifier free guidance (CFG). CFG is enabled by setting `guidance_scale > 1`.
	Higher guidance scale encourages the model to generate samples that are more closely linked to the input
	prompt, usually at the expense of poorer quality.
	low_memory (`bool`, optional):
	Switch to sequential topk for contrastive search to reduce peak memory. Used with contrastive search.


	> Parameters that define the output variables of `generate`

	num_return_sequences(`int`, optional, defaults to 1):
	The number of independently computed returned sequences for each element in the batch.
	output_attentions (`bool`, optional, defaults to `False`):
	Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
	tensors for more details.
	output_hidden_states (`bool`, optional, defaults to `False`):
	Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
	more details.
	output_scores (`bool`, optional, defaults to `False`):
	Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
	return_dict_in_generate (`bool`, optional, defaults to `False`):
	Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

	> Special tokens that can be used at generation time

	pad_token_id (`int`, optional):
	The id of the padding token.
	bos_token_id (`int`, optional):
	The id of the beginning-of-sequence token.
	eos_token_id (`Union[int, List[int]]`, optional):
	The id of the end-of-sequence token. Optionally, use a list to set multiple end-of-sequence tokens.

	> Generation parameters exclusive to encoder-decoder models

	encoder_no_repeat_ngram_size (`int`, optional, defaults to 0):
	If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
	`decoder_input_ids`.
	decoder_start_token_id (`int`, optional):
	If an encoder-decoder model starts decoding with a different token than bos, the id of that token.

	> Generation parameters exclusive to [assistant generation](https://arxiv.org/abs/2211.17192)

	num_assistant_tokens (`int`, optional, defaults to 5):
	Defines the number of _speculative tokens_ that shall be generated by the assistant model before being
	checked by the target model at each iteration. Higher values for `num_assistant_tokens` make the generation
	more _speculative_ : If the assistant model is performant larger speed-ups can be reached, if the assistant
	model requires lots of corrections, lower speed-ups are reached.

	num_assistant_tokens_schedule (`str`, optional, defaults to `"heuristic"`):
	Defines the schedule at which max assistant tokens shall be changed during inference.
	- `"heuristic"`: When all _speculative_ tokens are correct, increase `num_assistant_tokens` by 2 else
	reduce by 1
	- `"constant"`: `num_assistant_tokens` stays unchanged during generation

	> Wild card

	generation_kwargs:
	Additional generation kwargs will be forwarded to the `generate` function of the model. Kwargs that are not
	present in `generate`'s signature will be used in the model forward pass.
	"""

	def __init__(self, **kwargs):
	    """
	    Build a generation configuration from keyword arguments.

	    Every known generation parameter is popped from `kwargs`, falling back to its default when absent. Unless
	    `_from_model_config` is truthy, whatever is left in `kwargs` afterwards is stored on the instance as an extra
	    attribute. The instance is validated as the last step, through `validate(is_init=True)`.

	    Raises:
	        AttributeError: re-raised, after being logged, when an extra keyword argument cannot be set.
	    """
	    # (attribute name, default value) pairs. They are consumed from `kwargs` in this exact order.
	    known_parameters = (
	        # Parameters that control the length of the output
	        # if the default `max_length` is updated here, make sure to update the `generate` tests following https://github.com/huggingface/transformers/pull/25030
	        ("max_length", 20),
	        ("max_new_tokens", None),
	        ("min_length", 0),
	        ("min_new_tokens", None),
	        ("early_stopping", False),
	        ("max_time", None),
	        # Parameters that control the generation strategy used
	        ("do_sample", False),
	        ("num_beams", 1),
	        ("num_beam_groups", 1),
	        ("penalty_alpha", None),
	        ("use_cache", True),
	        # Parameters for manipulation of the model output logits
	        ("temperature", 1.0),
	        ("top_k", 50),
	        ("top_p", 1.0),
	        ("typical_p", 1.0),
	        ("epsilon_cutoff", 0.0),
	        ("eta_cutoff", 0.0),
	        ("diversity_penalty", 0.0),
	        ("repetition_penalty", 1.0),
	        ("encoder_repetition_penalty", 1.0),
	        ("length_penalty", 1.0),
	        ("no_repeat_ngram_size", 0),
	        ("bad_words_ids", None),
	        ("force_words_ids", None),
	        ("renormalize_logits", False),
	        ("constraints", None),
	        ("forced_bos_token_id", None),
	        ("forced_eos_token_id", None),
	        ("remove_invalid_values", False),
	        ("exponential_decay_length_penalty", None),
	        ("suppress_tokens", None),
	        ("begin_suppress_tokens", None),
	        ("forced_decoder_ids", None),
	        ("sequence_bias", None),
	        ("guidance_scale", None),
	        ("low_memory", None),
	        # Parameters that define the output variables of `generate`
	        ("num_return_sequences", 1),
	        ("output_attentions", False),
	        ("output_hidden_states", False),
	        ("output_scores", False),
	        ("return_dict_in_generate", False),
	        # Special tokens that can be used at generation time
	        ("pad_token_id", None),
	        ("bos_token_id", None),
	        ("eos_token_id", None),
	        # Generation parameters exclusive to encoder-decoder models
	        ("encoder_no_repeat_ngram_size", 0),
	        ("decoder_start_token_id", None),
	        # Assistant generation
	        ("num_assistant_tokens", 5),
	        ("num_assistant_tokens_schedule", "heuristic"),
	    )
	    for attribute_name, default_value in known_parameters:
	        setattr(self, attribute_name, kwargs.pop(attribute_name, default_value))

	    # Wild card. Kept out of the table above so that every instance gets its own fresh dict as default.
	    self.generation_kwargs = kwargs.pop("generation_kwargs", {})

	    # The remaining attributes do not parametrize `.generate()`, but are informative and/or used by the hub
	    # interface.
	    self._from_model_config = kwargs.pop("_from_model_config", False)
	    self._commit_hash = kwargs.pop("_commit_hash", None)
	    self.transformers_version = kwargs.pop("transformers_version", __version__)

	    # Additional attributes without default values.
	    # We don't want to copy values from the model config if we're initializing a `GenerationConfig` from a
	    # model's default configuration file, hence the empty mapping in that case.
	    extra_attributes = {} if self._from_model_config else kwargs
	    for key, value in extra_attributes.items():
	        try:
	            setattr(self, key, value)
	        except AttributeError as exc:
	            logger.error(f"Can't set {key} with value {value} for {self}")
	            raise exc

	    # Validate the values of the attributes
	    self.validate(is_init=True)

	def __hash__(self):
	    """Return the hash of the JSON string produced by `to_json_string(ignore_metadata=True)`."""
	    serialized = self.to_json_string(ignore_metadata=True)
	    return hash(serialized)

	def __eq__(self, other):
	    """
	    Compare two generation configurations through their JSON serialization.

	    Both sides are serialized with `use_diff=False` and `ignore_metadata=True`; the configurations are equal when
	    the resulting strings are identical. Anything that is not a `GenerationConfig` compares unequal.
	    """
	    if isinstance(other, GenerationConfig):
	        serialization_options = {"use_diff": False, "ignore_metadata": True}
	        own_json = self.to_json_string(**serialization_options)
	        other_json = other.to_json_string(**serialization_options)
	        return own_json == other_json
	    return False

	def __repr__(self):
	    """Return the class name followed by the JSON string from `to_json_string(ignore_metadata=True)`."""
	    class_name = self.__class__.__name__
	    serialized = self.to_json_string(ignore_metadata=True)
	    return f"{class_name} {serialized}"

	def validate(self, is_init=False):
	"""
	Validates the values of the attributes of the [`GenerationConfig`] instance. Raises exceptions in the presence
	of parameterization that can be detected as incorrect from the configuration instance alone.

	Note that some parameters are best validated at generate runtime, as they may depend on other inputs and/or the
	model, such as parameters related to the generation length.
	"""

	# Validation of individual attributes
	if self.early_stopping not in {True, False, "never"}:
	raise ValueError(f"`early_stopping` must be a boolean or 'never', but is {self.early_stopping}.")

	# Validation of attribute relations:
	fix_location = ""
	if is_init:
	fix_location = (
	" This was detected when initializing the generation config instance, which means the corresponding "
	"file may hold incorrect parameterization and should be fixed."
	)

	# 1. detect sampling-only parameterization when not in sampling mode
	if self.do_sample is False:
	greedy_wrong_parameter_msg = (
	"`do_sample` is set to `False`. However, `{flag_name}` is set to `{flag_value}` -- this flag is only "
	"used in sample-based generation modes. You should set `do_sample=True` or unset `{flag_name}`."
	+ fix_location
	)
	if self.temperature != 1.0:
	warnings.warn(
	greedy_wrong_parameter_msg.format(flag_name="temperature", flag_value=self.temperature),
	UserWarning,
	)
	if self.top_p != 1.0:
	warnings.warn(
	greedy_wrong_parameter_msg.format(flag_name="top_p", flag_value=self.top_p),
	UserWarning,
	)
	if self.typical_p != 1.0:
	warnings.warn(
	greedy_wrong_parameter_msg.format(flag_name="typical_p", flag_value=self.typical_p),
	UserWarning,
	)
	if self.top_k != 50 and self.penalty_alpha is None: # contrastive search uses top_k
	warnings.warn(
	greedy_wrong_parameter_msg.format(flag_name="top_k", flag_value=self.top_k),
	UserWarning,
	)
	if self.epsilon_cutoff != 0.0:
	warnings.warn(
	greedy_wrong_parameter_msg.format(flag_name="epsilon_cutoff", flag_value=self.epsilon_cutoff),
	UserWarning,
	)
	if self.eta_cutoff != 0.0:
	warnings.warn(
	greedy_wrong_parameter_msg.format(flag_name="eta_cutoff", flag_value=self.eta_cutoff),
	UserWarning,
	)

	# 2. detect beam-only parameterization when not in beam mode
	if self.num_beams is None:
	warnings.warn("`num_beams` is set to None - defaulting to 1.", UserWarning)
	self.num_beams = 1

	if self.num_beams == 1:
	single_beam_wrong_parameter_msg = (
	"`num_beams` is set to 1. However, `{flag_name}` is set to `{flag_value}` -- this flag is only used "
	"in beam-based generation modes. You should set `num_beams>1` or unset `{flag_name}`." + fix_location
	)
	if self.early_stopping is not False:
	warnings.warn(
	single_beam_wrong_parameter_msg.format(flag_name="early_stopping", flag_value=self.early_stopping),
	UserWarning,
	)
	if self.num_beam_groups != 1:
	warnings.warn(
	single_beam_wrong_parameter_msg.format(
	flag_name="num_beam_groups", flag_value=self.num_beam_groups
	),
	UserWarning,
	)
	if self.diversity_penalty != 0.0:
	warnings.warn(
	single_beam_wrong_parameter_msg.format(
	flag_name="diversity_penalty", flag_value=self.diversity_penalty
	),
	UserWarning,
	)
	if self.length_penalty != 1.0:
	warnings.warn(
	single_beam_wrong_parameter_msg.format(flag_name="length_penalty", flag_value=self.length_penalty),
	UserWarning,
	)
	if self.constraints is not None:
	warnings.warn(
	single_beam_wrong_parameter_msg.format(flag_name="constraints", flag_value=self.constraints),
	UserWarning,
	)

	# 3. detect incorrect paramaterization specific to advanced beam modes
	else:
	# constrained beam search
	if self.constraints is not None:
	constrained_wrong_parameter_msg = (
	"`constraints` is not `None`, triggering constrained beam search. However, `{flag_name}` is set "
	"to `{flag_value}`, which is incompatible with this generation mode. Set `constraints=None` or "
	"unset `{flag_name}` to continue." + fix_location
	)
	if self.do_sample is True:
	raise ValueError(
	constrained_wrong_parameter_msg.format(flag_name="do_sample", flag_value=self.do_sample)
	)
	if self.num_beam_groups != 1:
	raise ValueError(
	constrained_wrong_parameter_msg.format(
	flag_name="num_beam_groups", flag_value=self.num_beam_groups
	)
	)
	# group beam search
	if self.diversity_penalty != 0.0 or self.num_beam_groups != 1:
	group_error_prefix = (
	"`diversity_penalty` is not 0.0 or `num_beam_groups` is not 1, triggering group beam search. In "
	"this generation mode, "
	)
	if self.do_sample is True:
	raise ValueError(group_error_prefix + "`do_sample` must be set to `False`")
	if self.num_beams % self.num_beam_groups != 0:
	raise ValueError(group_error_prefix + "`num_beams` should be divisible by `num_beam_groups`")
	if self.diversity_penalty == 0.0:
	raise ValueError(
	group_error_prefix
	+ "`diversity_penalty` should be greater than `0.0`, otherwise your groups will be identical."
	)

	# 4. check `num_return_sequences`
	if self.num_return_sequences != 1:
	if self.num_beams == 1:
	if self.do_sample is False:
	raise ValueError(
	"Greedy methods without beam search do not support `num_return_sequences` different than 1 "
	f"(got {self.num_return_sequences})."
	)
	elif self.num_return_sequences > self.num_beams:
	raise ValueError(
	f"`num_return_sequences` ({self.num_return_sequences}) has to be smaller or equal to `num_beams` "
	f"({self.num_beams})."
	)

	# 5. check common issue: passing `generate` arguments inside the generation config
	generate_arguments = (
	"logits_processor",
	"stopping_criteria",
	"prefix_allowed_tokens_fn",
	"synced_gpus",
	"assistant_model",
	"streamer",
	"negative_prompt_ids",
	"negative_prompt_attention_mask",
	)
	for arg in generate_arguments:
	if hasattr(self, arg):
	raise ValueError(
	f"Argument `{arg}` is not a valid argument of `GenerationConfig`. It should be passed to "
	"`generate()` (or a pipeline) directly."
	)

	def save_pretrained(
	self,
	save_directory: Union[str, os.PathLike],
	config_file_name: Optional[Union[str, os.PathLike]] = None,
	push_to_hub: bool = False,
	**kwargs,
	):
	r"""
	Save a generation configuration object to the directory `save_directory`, so that it can be re-loaded using the
	[`~GenerationConfig.from_pretrained`] class method.

	Args:
	save_directory (`str` or `os.PathLike`):
	Directory where the configuration JSON file will be saved (will be created if it does not exist).
	config_file_name (`str` or `os.PathLike`, optional, defaults to `"generation_config.json"`):
	Name of the generation configuration JSON file to be saved in `save_directory`.
	push_to_hub (`bool`, optional, defaults to `False`):
	Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
	repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
	namespace).
	kwargs (`Dict[str, Any]`, optional):
	Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
	"""

	# At save time, validate the instance -- if any warning/exception is thrown, we refuse to save the instance
	try:
	with warnings.catch_warnings(record=True) as caught_warnings:
	self.validate()
	for w in caught_warnings:
	raise ValueError(w.message)
	except ValueError as exc:
	warnings.warn(
	"The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions. "
	"Fix these issues to save the configuration. This warning will be raised to an exception in v4.34."
	"\n\nThrown during validation:\n" + str(exc),
	UserWarning,
	)
	return

	use_auth_token = kwargs.pop("use_auth_token", None)

	if use_auth_token is not None:
	warnings.warn(
	"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
	FutureWarning,
	)
	if kwargs.get("token", None) is not None:
	raise ValueError(
	"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
	)
	kwargs["token"] = use_auth_token

	config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME

	if os.path.isfile(save_directory):
	raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

	os.makedirs(save_directory, exist_ok=True)

	if push_to_hub:
	commit_message = kwargs.pop("commit_message", None)
	repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
	repo_id = self._create_repo(repo_id, **kwargs)
	files_timestamps = self._get_files_timestamps(save_directory)

	output_config_file = os.path.join(save_directory, config_file_name)

	self.to_json_file(output_config_file, use_diff=True)
	logger.info(f"Configuration saved in {output_config_file}")

	if push_to_hub:
	self._upload_modified_files(
	save_directory,
	repo_id,
	files_timestamps,
	commit_message=commit_message,
	token=kwargs.get("token"),
	)

	@classmethod
	def from_pretrained(
	cls,
	pretrained_model_name: Union[str, os.PathLike],
	config_file_name: Optional[Union[str, os.PathLike]] = None,
	cache_dir: Optional[Union[str, os.PathLike]] = None,
	force_download: bool = False,
	local_files_only: bool = False,
	token: Optional[Union[str, bool]] = None,
	revision: str = "main",
	**kwargs,
	) -> "GenerationConfig":
	r"""
	Instantiate a [`GenerationConfig`] from a generation configuration file.

	Args:
	pretrained_model_name (`str` or `os.PathLike`):
	This can be either:

	- a string, the model id of a pretrained model configuration hosted inside a model repo on
	huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
	namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
	- a path to a directory containing a configuration file saved using the
	[`~GenerationConfig.save_pretrained`] method, e.g., `./my_model_directory/`.
	config_file_name (`str` or `os.PathLike`, optional, defaults to `"generation_config.json"`):
	Name of the generation configuration JSON file to be loaded from `pretrained_model_name`.
	cache_dir (`str` or `os.PathLike`, optional):
	Path to a directory in which a downloaded pretrained model configuration should be cached if the
	standard cache should not be used.
	force_download (`bool`, optional, defaults to `False`):
	Whether or not to force to (re-)download the configuration files and override the cached versions if
	they exist.
	resume_download (`bool`, optional, defaults to `False`):
	Whether or not to delete incompletely received file. Attempts to resume the download if such a file
	exists.
	proxies (`Dict[str, str]`, optional):
	A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
	'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
	token (`str` or `bool`, optional):
	The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
	the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
	revision (`str`, optional, defaults to `"main"`):
	The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
	git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
	identifier allowed by git.

	<Tip>

	To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

	</Tip>

	return_unused_kwargs (`bool`, optional, defaults to `False`):
	If `False`, then this function returns just the final configuration object.

	If `True`, then this function returns a `Tuple(config, unused_kwargs)` where unused_kwargs is a
	dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the
	part of `kwargs` which has not been used to update `config` and is otherwise ignored.
	subfolder (`str`, optional, defaults to `""`):
	In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
	specify the folder name here.
	kwargs (`Dict[str, Any]`, optional):
	The values in kwargs of any keys which are configuration attributes will be used to override the loaded
	values. Behavior concerning key/value pairs whose keys are not configuration attributes is controlled
	by the `return_unused_kwargs` keyword parameter.

	Returns:
	[`GenerationConfig`]: The configuration object instantiated from this pretrained model.

	Examples:

	```python
	>>> from transformers import GenerationConfig

	>>> # Download configuration from huggingface.co and cache.
	>>> generation_config = GenerationConfig.from_pretrained("gpt2")

	>>> # E.g. config was saved using save_pretrained('./test/saved_model/')
	>>> generation_config.save_pretrained("./test/saved_model/")
	>>> generation_config = GenerationConfig.from_pretrained("./test/saved_model/")

	>>> # You can also specify configuration names to your generation configuration file
	>>> generation_config.save_pretrained("./test/saved_model/", config_file_name="my_configuration.json")
	>>> generation_config = GenerationConfig.from_pretrained("./test/saved_model/", "my_configuration.json")

	>>> # If you'd like to try a minor variation to an existing configuration, you can also pass generation
	>>> # arguments to `.from_pretrained()`. Be mindful that typos and unused arguments will be ignored
	>>> generation_config, unused_kwargs = GenerationConfig.from_pretrained(
	... "gpt2", top_k=1, foo=False, do_sample=True, return_unused_kwargs=True
	... )
	>>> generation_config.top_k
	1

	>>> unused_kwargs
	{'foo': False}
	```"""
	config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME

	resume_download = kwargs.pop("resume_download", False)
	proxies = kwargs.pop("proxies", None)
	use_auth_token = kwargs.pop("use_auth_token", None)
	subfolder = kwargs.pop("subfolder", "")
	from_pipeline = kwargs.pop("_from_pipeline", None)
	from_auto_class = kwargs.pop("_from_auto", False)
	commit_hash = kwargs.pop("_commit_hash", None)

	if use_auth_token is not None:
	warnings.warn(
	"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
	FutureWarning,
	)
	if token is not None:
	raise ValueError(
	"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
	)
	token = use_auth_token

	user_agent = {"file_type": "config", "from_auto_class": from_auto_class}
	if from_pipeline is not None:
	user_agent["using_pipeline"] = from_pipeline

	config_path = os.path.join(pretrained_model_name, config_file_name)
	config_path = str(config_path)

	is_local = os.path.exists(config_path)
	if os.path.isfile(os.path.join(subfolder, config_path)):
	# Special case when config_path is a local file
	resolved_config_file = config_path
	is_local = True
	elif is_remote_url(config_path):
	configuration_file = config_path
	resolved_config_file = download_url(config_path)
	else:
	configuration_file = config_file_name
	try:
	# Load from local folder or from cache or download from model Hub and cache
	resolved_config_file = cached_file(
	pretrained_model_name,
	configuration_file,
	cache_dir=cache_dir,
	force_download=force_download,
	proxies=proxies,
	resume_download=resume_download,
	local_files_only=local_files_only,
	token=token,
	user_agent=user_agent,
	revision=revision,
	subfolder=subfolder,
	_commit_hash=commit_hash,
	)
	commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
	except EnvironmentError:
	# Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
	# the original exception.
	raise
	except Exception:
	# For any other exception, we throw a generic error.
	raise EnvironmentError(
	f"Can't load the configuration of '{pretrained_model_name}'. If you were trying to load it"
	" from 'https://huggingface.co/models', make sure you don't have a local directory with the same"
	f" name. Otherwise, make sure '{pretrained_model_name}' is the correct path to a directory"
	f" containing a {configuration_file} file"
	)

	try:
	# Load config dict
	config_dict = cls._dict_from_json_file(resolved_config_file)
	config_dict["_commit_hash"] = commit_hash
	except (json.JSONDecodeError, UnicodeDecodeError):
	raise EnvironmentError(
	f"It looks like the config file at '{resolved_config_file}' is not a valid JSON file."
	)

	if is_local:
	logger.info(f"loading configuration file {resolved_config_file}")
	else:
	logger.info(f"loading configuration file {configuration_file} from cache at {resolved_config_file}")

	if kwargs.get("return_unused_kwargs") is True:
	config, unused_kwargs = cls.from_dict(config_dict, **kwargs)
	config._original_object_hash = hash(config) # Hash to detect whether the instance was modified
	return config, unused_kwargs
	else:
	config = cls.from_dict(config_dict, **kwargs)
	config._original_object_hash = hash(config) # Hash to detect whether the instance was modified
	return config

	@classmethod
	def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
	with open(json_file, "r", encoding="utf-8") as reader:
	text = reader.read()
	return json.loads(text)

	@classmethod
	def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "GenerationConfig":
	    """
	    Instantiates a [`GenerationConfig`] from a Python dictionary of parameters.

	    Args:
	        config_dict (`Dict[str, Any]`):
	            Dictionary that will be used to instantiate the configuration object.
	        kwargs (`Dict[str, Any]`):
	            Additional parameters from which to initialize the configuration object. When a key is present in
	            both `config_dict` and `kwargs`, the value from `kwargs` is the one passed to the constructor
	            (except for `_commit_hash`, see below). The special key `return_unused_kwargs` (defaults to
	            `False`) selects the return shape, and the telemetry keys `_from_auto` and `_from_pipeline` are
	            discarded.

	    Returns:
	        [`GenerationConfig`]: The configuration object instantiated from those parameters. If
	        `return_unused_kwargs` is truthy, a `(config, unused_kwargs)` tuple is returned instead, where
	        `unused_kwargs` is the dictionary returned by `config.update(**kwargs)`.
	    """
	    return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
	    # Those arguments may be passed along for our internal telemetry.
	    # We remove them so they don't appear in `return_unused_kwargs`.
	    kwargs.pop("_from_auto", None)
	    kwargs.pop("_from_pipeline", None)
	    # The commit hash might have been updated in the `config_dict`, we don't want the kwargs to erase that update.
	    if "_commit_hash" in kwargs and "_commit_hash" in config_dict:
	        kwargs["_commit_hash"] = config_dict["_commit_hash"]

	    # The line below allows model-specific config to be loaded as well through kwargs, with safety checks.
	    # See https://github.com/huggingface/transformers/pull/21269
	    # Both mappings are merged (kwargs win on duplicate keys) and expanded as keyword arguments; passing
	    # `{config_dict, **kwargs}` is not valid Python because `**` cannot appear inside a set display.
	    config = cls(**{**config_dict, **kwargs})
	    unused_kwargs = config.update(**kwargs)

	    logger.info(f"Generate config {config}")
	    if return_unused_kwargs:
	        return config, unused_kwargs
	    return config

	def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None:
	"""
	Checks whether the passed dictionary and its nested dicts have a torch_dtype key and if it's not None,
	converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into "float32"
	string, which can then be stored in the json format.
	"""
	if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
	d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]
	for value in d.values():
	if isinstance(value, dict):
	self.dict_torch_dtype_to_str(value)

	def to_diff_dict(self) -> Dict[str, Any]:
	    """
	    Serializes this instance to a Python dictionary, keeping only the entries that differ from a default
	    `GenerationConfig()`, for better readability.

	    An entry is kept when its key is absent from the default configuration, when the key is
	    `"transformers_version"`, or when its value is not equal to the default value.

	    Returns:
	        `Dict[str, Any]`: Dictionary of the attributes of this configuration instance that are not at their
	        default value.
	    """
	    current = self.to_dict()

	    # Reference values: what an argument-less configuration serializes to.
	    defaults = GenerationConfig().to_dict()

	    # Only keep values that differ from the default config (the version entry is always kept).
	    diff = {
	        name: val
	        for name, val in current.items()
	        if name not in defaults or name == "transformers_version" or val != defaults[name]
	    }

	    self.dict_torch_dtype_to_str(diff)
	    return diff

	def to_dict(self) -> Dict[str, Any]:
	    """
	    Serializes this instance to a Python dictionary.

	    The result is a deep copy of the instance attributes, without the `_commit_hash` and
	    `_original_object_hash` entries, and with `transformers_version` set to the current library version.

	    Returns:
	        `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
	    """
	    serialized = copy.deepcopy(self.__dict__)

	    # Fields to ignore at serialization time
	    for ignored_field in ("_commit_hash", "_original_object_hash"):
	        serialized.pop(ignored_field, None)

	    # Transformers version when serializing this file
	    serialized["transformers_version"] = __version__

	    self.dict_torch_dtype_to_str(serialized)
	    return serialized

	def to_json_string(self, use_diff: bool = True, ignore_metadata: bool = False) -> str:
	    """
	    Serializes this instance to a JSON string.

	    Args:
	        use_diff (`bool`, optional, defaults to `True`):
	            If set to `True`, only the difference between the config instance and the default
	            `GenerationConfig()` is serialized to JSON string. Any other value serializes every attribute.
	        ignore_metadata (`bool`, optional, defaults to `False`):
	            Whether to drop the fields listed in `METADATA_FIELDS` from the output.

	    Returns:
	        `str`: String containing the attributes of this configuration instance in JSON format (2-space indent,
	        keys sorted), terminated by a newline.
	    """
	    # The identity check is deliberate: only the literal `True` selects the diff serialization.
	    payload = self.to_diff_dict() if use_diff is True else self.to_dict()

	    if ignore_metadata:
	        for field_name in METADATA_FIELDS:
	            payload.pop(field_name, None)

	    serialized = json.dumps(payload, indent=2, sort_keys=True)
	    return serialized + "\n"

	def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True):
	    """
	    Save this instance to a JSON file.

	    Args:
	        json_file_path (`str` or `os.PathLike`):
	            Path to the JSON file in which this configuration instance's parameters will be saved. The file is
	            written as UTF-8 text and overwritten if it already exists.
	        use_diff (`bool`, optional, defaults to `True`):
	            If set to `True`, only the difference between the config instance and the default `GenerationConfig()`
	            is serialized to JSON file.
	    """
	    # Serialize before opening the file: mode "w" truncates the target as soon as it is opened, so a
	    # serialization error raised afterwards would leave an empty file in place of a previously valid one.
	    json_string = self.to_json_string(use_diff=use_diff)
	    with open(json_file_path, "w", encoding="utf-8") as writer:
	        writer.write(json_string)

	@classmethod
	def from_model_config(cls, model_config: PretrainedConfig) -> "GenerationConfig":
	    """
	    Builds a [`GenerationConfig`] out of a [`PretrainedConfig`]. Useful to turn legacy [`PretrainedConfig`]
	    objects, which may carry generation parameters, into a stand-alone [`GenerationConfig`].

	    Args:
	        model_config (`PretrainedConfig`):
	            The model config that will be used to instantiate the generation config.

	    Returns:
	        [`GenerationConfig`]: The configuration object instantiated from those parameters.
	    """
	    model_dict = model_config.to_dict()
	    model_dict.pop("_from_model_config", None)
	    config = cls.from_dict(model_dict, return_unused_kwargs=False, _from_model_config=True)

	    # Special case: some models have generation attributes set in a nested config. Use them for every
	    # attribute that still holds its default value in the generation config.
	    for sub_config_key in ("decoder", "generator", "text_config"):
	        if sub_config_key not in model_dict:
	            continue
	        reference = GenerationConfig()
	        sub_config = model_dict[sub_config_key]
	        for field_name in config.to_dict():
	            if field_name not in sub_config:
	                continue
	            if getattr(config, field_name) == getattr(reference, field_name):
	                setattr(config, field_name, sub_config[field_name])

	    # Hash to detect whether the instance was modified
	    config._original_object_hash = hash(config)
	    return config

	def update(self, **kwargs):
	"""
	Updates attributes of this class instance with attributes from `kwargs` if they match existing atributtes,
	returning all the unused kwargs.

	Args:
	kwargs (`Dict[str, Any]`):
	Dictionary of attributes to tentatively update this class.

	Returns:
	`Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
	"""
	to_remove = []
	for key, value in kwargs.items():
	if hasattr(self, key):
	setattr(self, key, value)
	to_remove.append(key)

	# remove all the attributes that were updated, without modifying the input dict
	unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove}
	return unused_kwargs