# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import os
import re

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import direct_transformers_import

# All paths are set with the intent you should run this script from the root of the repo with the command
# python utils/check_config_attributes.py
PATH_TO_TRANSFORMERS = "src/transformers"


# This is to make sure the transformers module imported is the one in the repo.
transformers = direct_transformers_import(PATH_TO_TRANSFORMERS)

CONFIG_MAPPING = transformers.models.auto.configuration_auto.CONFIG_MAPPING
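
# In `SPECIAL_CASES_TO_ALLOW` below, a value of `True` skips the unused-attribute check entirely for that
# configuration class, while a list of attribute names allows only those specific attributes to go unused
# (see `check_attribute_being_used` below).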
SPECIAL_CASES_TO_ALLOW = {
    # used to compute the property `self.chunk_length`
    "EncodecConfig": ["overlap"],
    # used as `self.bert_model = BertModel(config, ...)`
    "DPRConfig": True,
    # not used in modeling files, but it's important information
    "FSMTConfig": ["langs"],
    # used internally in the configuration class file
    "GPTNeoConfig": ["attention_types"],
    # used internally in the configuration class file
    "EsmConfig": ["is_folding_model"],
    # used during training (although we don't have a training script for these models yet)
    "Mask2FormerConfig": ["ignore_value"],
    # `ignore_value` used during training (although we don't have a training script for these models yet)
    # `norm` used in the conversion script (although not used in the modeling file)
    "OneFormerConfig": ["ignore_value", "norm"],
    # used during preprocessing and collation, see `collating_graphormer.py`
    "GraphormerConfig": ["spatial_pos_max"],
    # used internally in the configuration class file
    "T5Config": ["feed_forward_proj"],
    # used internally in the configuration class file
    # `tokenizer_class` gets the default value `T5Tokenizer` intentionally
    "MT5Config": ["feed_forward_proj", "tokenizer_class"],
    "UMT5Config": ["feed_forward_proj", "tokenizer_class"],
    # used internally in the configuration class file
    "LongT5Config": ["feed_forward_proj"],
    # used internally in the configuration class file
    "Pop2PianoConfig": ["feed_forward_proj"],
    # used internally in the configuration class file
    "SwitchTransformersConfig": ["feed_forward_proj"],
    # having default values other than `1e-5` - we can't fix them without breaking
    "BioGptConfig": ["layer_norm_eps"],
    # having default values other than `1e-5` - we can't fix them without breaking
    "GLPNConfig": ["layer_norm_eps"],
    # having default values other than `1e-5` - we can't fix them without breaking
    "SegformerConfig": ["layer_norm_eps"],
    # having default values other than `1e-5` - we can't fix them without breaking
    "CvtConfig": ["layer_norm_eps"],
    # having default values other than `1e-5` - we can't fix them without breaking
    "PerceiverConfig": ["layer_norm_eps"],
    # used internally to calculate the feature size
    "InformerConfig": ["num_static_real_features", "num_time_features"],
    # used internally to calculate the feature size
    "TimeSeriesTransformerConfig": ["num_static_real_features", "num_time_features"],
    # used internally to calculate the feature size
    "AutoformerConfig": ["num_static_real_features", "num_time_features"],
    # used internally to calculate `mlp_dim`
    "SamVisionConfig": ["mlp_ratio"],
    # For (head) training, but so far not implemented
    "ClapAudioConfig": ["num_classes"],
    # Not used, but providing useful information to users
    "SpeechT5HifiGanConfig": ["sampling_rate"],
}

# TODO (ydshieh): Check the failing cases, try to fix them or move some cases to the above block once we are sure
SPECIAL_CASES_TO_ALLOW.update(
    {
        "CLIPSegConfig": True,
        "DeformableDetrConfig": True,
        "DetaConfig": True,
        "DinatConfig": True,
        "DonutSwinConfig": True,
        "EfficientFormerConfig": True,
        "FSMTConfig": True,
        "JukeboxConfig": True,
        "LayoutLMv2Config": True,
        "MaskFormerSwinConfig": True,
        "MT5Config": True,
        # For backward compatibility with trust remote code models
        "MptConfig": True,
        "MptAttentionConfig": True,
        "NatConfig": True,
        "OneFormerConfig": True,
        "PerceiverConfig": True,
        "RagConfig": True,
        "SpeechT5Config": True,
        "SwinConfig": True,
        "Swin2SRConfig": True,
        "Swinv2Config": True,
        "SwitchTransformersConfig": True,
        "TableTransformerConfig": True,
        "TapasConfig": True,
        "TransfoXLConfig": True,
        "UniSpeechConfig": True,
        "UniSpeechSatConfig": True,
        "WavLMConfig": True,
        "WhisperConfig": True,
        # TODO: @Arthur (for `alignment_head` and `alignment_layer`)
        "JukeboxPriorConfig": True,
        # TODO: @Younes (for `is_decoder`)
        "Pix2StructTextConfig": True,
        "IdeficsConfig": True,
        "IdeficsVisionConfig": True,
        "IdeficsPerceiverConfig": True,
    }
)


def check_attribute_being_used(config_class, attributes, default_value, source_strings):
    """Check whether any name in `attributes` is used in one of the strings in `source_strings`.

    Args:
        config_class (`type`):
            The configuration class for which the arguments in its `__init__` will be checked.
        attributes (`List[str]`):
            The name of an argument (or attribute) and its variant names, if any.
        default_value (`Any`):
            The default value assigned to the attribute in `attributes` in the `__init__` of `config_class`.
        source_strings (`List[str]`):
            The Python source code strings in the same modeling directory where `config_class` is defined. The file
            containing the definition of `config_class` should be excluded.
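
    Returns:
        `bool`: `True` if at least one attribute is found in `source_strings` or the case is explicitly allowed,
        `False` otherwise.

    Example (illustrative sketch; the source string below is made up):

        source = "self.proj = nn.Linear(config.hidden_size, 2)"
        check_attribute_being_used(transformers.BertConfig, ["hidden_size"], 768, [source])  # -> True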
""" | |
attribute_used = False | |
for attribute in attributes: | |
for modeling_source in source_strings: | |
# check if we can find `config.xxx`, `getattr(config, "xxx", ...)` or `getattr(self.config, "xxx", ...)` | |
if ( | |
f"config.{attribute}" in modeling_source | |
or f'getattr(config, "{attribute}"' in modeling_source | |
or f'getattr(self.config, "{attribute}"' in modeling_source | |
): | |
attribute_used = True | |
# Deal with multi-line cases | |
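            # (e.g. when a code formatter has split `getattr(config, "xxx", ...)` across several lines)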
            elif (
                re.search(
                    rf'getattr[ \t\v\n\r\f]*\([ \t\v\n\r\f]*(self\.)?config,[ \t\v\n\r\f]*"{attribute}"',
                    modeling_source,
                )
                is not None
            ):
                attribute_used = True
            # `SequenceSummary` is called with `SequenceSummary(config)`
            elif attribute in [
                "summary_type",
                "summary_use_proj",
                "summary_activation",
                "summary_last_dropout",
                "summary_proj_to_labels",
                "summary_first_dropout",
            ]:
                if "SequenceSummary" in modeling_source:
                    attribute_used = True
            if attribute_used:
                break
        if attribute_used:
            break

    # common and important attributes, even if they do not always appear in the modeling files
    attributes_to_allow = [
        "bos_index",
        "eos_index",
        "pad_index",
        "unk_index",
        "mask_index",
        "image_size",
        "use_cache",
        "out_features",
        "out_indices",
        "sampling_rate",
    ]
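    # attributes consumed by the generation utilities (`generate()`) rather than by the modeling files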
    attributes_used_in_generation = ["encoder_no_repeat_ngram_size"]

    # Special cases to be allowed
    case_allowed = True
    if not attribute_used:
        case_allowed = False
        for attribute in attributes:
            # Allow if the default value in the configuration class is different from the one in `PretrainedConfig`
            if attribute in ["is_encoder_decoder"] and default_value is True:
                case_allowed = True
            elif attribute in ["tie_word_embeddings"] and default_value is False:
                case_allowed = True
            # Allow cases without checking the default value in the configuration class
            elif attribute in attributes_to_allow + attributes_used_in_generation:
                case_allowed = True
            elif attribute.endswith("_token_id"):
                case_allowed = True
            # configuration class specific cases
            if not case_allowed:
                allowed_cases = SPECIAL_CASES_TO_ALLOW.get(config_class.__name__, [])
                case_allowed = allowed_cases is True or attribute in allowed_cases

    return attribute_used or case_allowed


def check_config_attributes_being_used(config_class):
    """Check whether the arguments in the `__init__` of `config_class` are used in the modeling files in the same
    directory.

    Args:
        config_class (`type`):
            The configuration class for which the arguments in its `__init__` will be checked.
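
    Returns:
        `List[str]`: The sorted names of the `__init__` arguments that do not appear to be used.

    Example (illustrative):

        check_config_attributes_being_used(transformers.BertConfig)  # -> [] when every argument is used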
""" | |
# Get the parameters in `__init__` of the configuration class, and the default values if any | |
signature = dict(inspect.signature(config_class.__init__).parameters) | |
parameter_names = [x for x in list(signature.keys()) if x not in ["self", "kwargs"]] | |
parameter_defaults = [signature[param].default for param in parameter_names] | |
# If `attribute_map` exists, an attribute can have different names to be used in the modeling files, and as long | |
# as one variant is used, the test should pass | |
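    # (e.g. with `attribute_map = {"hidden_size": "d_model"}`, either `config.d_model` or `config.hidden_size`
    # counts as usage of the `d_model` argument)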
    reversed_attribute_map = {}
    if len(config_class.attribute_map) > 0:
        reversed_attribute_map = {v: k for k, v in config_class.attribute_map.items()}

    # Get the path to modeling source files
    config_source_file = inspect.getsourcefile(config_class)
    model_dir = os.path.dirname(config_source_file)
    # Let's check against all frameworks: as long as one framework uses an attribute, we are good.
    modeling_paths = [os.path.join(model_dir, fn) for fn in os.listdir(model_dir) if fn.startswith("modeling_")]

    # Get the source code strings
    modeling_sources = []
    for path in modeling_paths:
        if os.path.isfile(path):
            with open(path, encoding="utf8") as fp:
                modeling_sources.append(fp.read())

    unused_attributes = []
    for config_param, default_value in zip(parameter_names, parameter_defaults):
        # `attributes` here is all the variant names for `config_param`
        attributes = [config_param]
        # some configuration classes have non-empty `attribute_map`, and both names could be used in the
        # corresponding modeling files. As long as one of them appears, it is fine.
        if config_param in reversed_attribute_map:
            attributes.append(reversed_attribute_map[config_param])

        if not check_attribute_being_used(config_class, attributes, default_value, modeling_sources):
            unused_attributes.append(attributes[0])

    return sorted(unused_attributes)


def check_config_attributes():
    """Check whether the arguments in the `__init__` of all configuration classes are used in the modeling files."""
    configs_with_unused_attributes = {}
    for _config_class in list(CONFIG_MAPPING.values()):
        # Skip deprecated models
        if "models.deprecated" in _config_class.__module__:
            continue
        # Some config classes are not in `CONFIG_MAPPING` (e.g. `CLIPVisionConfig`, `Blip2VisionConfig`, etc.)
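        # Here we also collect every `PretrainedConfig` subclass defined in the same module as `_config_class`, so
        # those nested vision/text sub-configs are checked too.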
        config_classes_in_module = [
            cls
            for name, cls in inspect.getmembers(
                inspect.getmodule(_config_class),
                lambda x: inspect.isclass(x)
                and issubclass(x, PretrainedConfig)
                and inspect.getmodule(x) == inspect.getmodule(_config_class),
            )
        ]
        for config_class in config_classes_in_module:
            unused_attributes = check_config_attributes_being_used(config_class)
            if len(unused_attributes) > 0:
                configs_with_unused_attributes[config_class.__name__] = unused_attributes

    if len(configs_with_unused_attributes) > 0:
        error = "The following configuration classes contain unused attributes in the corresponding modeling files:\n"
        for name, attributes in configs_with_unused_attributes.items():
            error += f"{name}: {attributes}\n"
        raise ValueError(error)


if __name__ == "__main__":
    check_config_attributes()