# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import inspect
import os
import re
import warnings
from collections import OrderedDict
from difflib import get_close_matches
from pathlib import Path

from diffusers.models.auto import get_values
from diffusers.utils import ENV_VARS_TRUE_VALUES, is_flax_available, is_tf_available, is_torch_available

# All paths are set with the intent you should run this script from the root of the repo with the command
# python utils/check_repo.py
PATH_TO_DIFFUSERS = "src/diffusers"
PATH_TO_TESTS = "tests"
PATH_TO_DOC = "docs/source/en"

# Update this list with models that are supposed to be private.
PRIVATE_MODELS = [
    "DPRSpanPredictor",
    "RealmBertModel",
    "T5Stack",
    "TFDPRSpanPredictor",
]

# Update this list for models that are not tested with a comment explaining the reason it should not be.
# Being in this list is an exception and should **not** be the rule.
IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
    # models to ignore for not tested
    "OPTDecoder",  # Building part of bigger (tested) model.
    "DecisionTransformerGPT2Model",  # Building part of bigger (tested) model.
    "SegformerDecodeHead",  # Building part of bigger (tested) model.
    "PLBartEncoder",  # Building part of bigger (tested) model.
    "PLBartDecoder",  # Building part of bigger (tested) model.
    "PLBartDecoderWrapper",  # Building part of bigger (tested) model.
    "BigBirdPegasusEncoder",  # Building part of bigger (tested) model.
    "BigBirdPegasusDecoder",  # Building part of bigger (tested) model.
    "BigBirdPegasusDecoderWrapper",  # Building part of bigger (tested) model.
    "DetrEncoder",  # Building part of bigger (tested) model.
    "DetrDecoder",  # Building part of bigger (tested) model.
    "DetrDecoderWrapper",  # Building part of bigger (tested) model.
    "M2M100Encoder",  # Building part of bigger (tested) model.
    "M2M100Decoder",  # Building part of bigger (tested) model.
    "Speech2TextEncoder",  # Building part of bigger (tested) model.
    "Speech2TextDecoder",  # Building part of bigger (tested) model.
    "LEDEncoder",  # Building part of bigger (tested) model.
    "LEDDecoder",  # Building part of bigger (tested) model.
    "BartDecoderWrapper",  # Building part of bigger (tested) model.
    "BartEncoder",  # Building part of bigger (tested) model.
    "BertLMHeadModel",  # Needs to be setup as decoder.
    "BlenderbotSmallEncoder",  # Building part of bigger (tested) model.
    "BlenderbotSmallDecoderWrapper",  # Building part of bigger (tested) model.
    "BlenderbotEncoder",  # Building part of bigger (tested) model.
    "BlenderbotDecoderWrapper",  # Building part of bigger (tested) model.
    "MBartEncoder",  # Building part of bigger (tested) model.
    "MBartDecoderWrapper",  # Building part of bigger (tested) model.
    "MegatronBertLMHeadModel",  # Building part of bigger (tested) model.
    "MegatronBertEncoder",  # Building part of bigger (tested) model.
    "MegatronBertDecoder",  # Building part of bigger (tested) model.
    "MegatronBertDecoderWrapper",  # Building part of bigger (tested) model.
    "PegasusEncoder",  # Building part of bigger (tested) model.
    "PegasusDecoderWrapper",  # Building part of bigger (tested) model.
    "DPREncoder",  # Building part of bigger (tested) model.
    "ProphetNetDecoderWrapper",  # Building part of bigger (tested) model.
    "RealmBertModel",  # Building part of bigger (tested) model.
    "RealmReader",  # Not regular model.
    "RealmScorer",  # Not regular model.
    "RealmForOpenQA",  # Not regular model.
    "ReformerForMaskedLM",  # Needs to be setup as decoder.
    "Speech2Text2DecoderWrapper",  # Building part of bigger (tested) model.
    "TFDPREncoder",  # Building part of bigger (tested) model.
    "TFElectraMainLayer",  # Building part of bigger (tested) model (should it be a TFModelMixin ?)
    "TFRobertaForMultipleChoice",  # TODO: fix
    "TrOCRDecoderWrapper",  # Building part of bigger (tested) model.
    "SeparableConv1D",  # Building part of bigger (tested) model.
    "FlaxBartForCausalLM",  # Building part of bigger (tested) model.
    "FlaxBertForCausalLM",  # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM.
    "OPTDecoderWrapper",
]

# Update this list with test files that don't have a tester with an `all_model_classes` variable and which don't
# trigger the common tests.
TEST_FILES_WITH_NO_COMMON_TESTS = [
    "models/decision_transformer/test_modeling_decision_transformer.py",
    "models/camembert/test_modeling_camembert.py",
    "models/mt5/test_modeling_flax_mt5.py",
    "models/mbart/test_modeling_mbart.py",
    "models/mt5/test_modeling_mt5.py",
    "models/pegasus/test_modeling_pegasus.py",
    "models/camembert/test_modeling_tf_camembert.py",
    "models/mt5/test_modeling_tf_mt5.py",
    "models/xlm_roberta/test_modeling_tf_xlm_roberta.py",
    "models/xlm_roberta/test_modeling_flax_xlm_roberta.py",
    "models/xlm_prophetnet/test_modeling_xlm_prophetnet.py",
    "models/xlm_roberta/test_modeling_xlm_roberta.py",
    "models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py",
    "models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py",
]

# Update this list for models that are not in any of the auto MODEL_XXX_MAPPING. Being in this list is an exception and
# should **not** be the rule.
IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    # models to ignore for model xxx mapping
    "DPTForDepthEstimation",
    "DecisionTransformerGPT2Model",
    "GLPNForDepthEstimation",
    "ViltForQuestionAnswering",
    "ViltForImagesAndTextClassification",
    "ViltForImageAndTextRetrieval",
    "ViltForMaskedLM",
    "XGLMEncoder",
    "XGLMDecoder",
    "XGLMDecoderWrapper",
    "PerceiverForMultimodalAutoencoding",
    "PerceiverForOpticalFlow",
    "SegformerDecodeHead",
    "FlaxBeitForMaskedImageModeling",
    "PLBartEncoder",
    "PLBartDecoder",
    "PLBartDecoderWrapper",
    "BeitForMaskedImageModeling",
    "CLIPTextModel",
    "CLIPVisionModel",
    "TFCLIPTextModel",
    "TFCLIPVisionModel",
    "FlaxCLIPTextModel",
    "FlaxCLIPVisionModel",
    "FlaxWav2Vec2ForCTC",
    "DetrForSegmentation",
    "DPRReader",
    "FlaubertForQuestionAnswering",
    "FlavaImageCodebook",
    "FlavaTextModel",
    "FlavaImageModel",
    "FlavaMultimodalModel",
    "GPT2DoubleHeadsModel",
    "LukeForMaskedLM",
    "LukeForEntityClassification",
    "LukeForEntityPairClassification",
    "LukeForEntitySpanClassification",
    "OpenAIGPTDoubleHeadsModel",
    "RagModel",
    "RagSequenceForGeneration",
    "RagTokenForGeneration",
    "RealmEmbedder",
    "RealmForOpenQA",
    "RealmScorer",
    "RealmReader",
    "TFDPRReader",
    "TFGPT2DoubleHeadsModel",
    "TFOpenAIGPTDoubleHeadsModel",
    "TFRagModel",
    "TFRagSequenceForGeneration",
    "TFRagTokenForGeneration",
    "Wav2Vec2ForCTC",
    "HubertForCTC",
    "SEWForCTC",
    "SEWDForCTC",
    "XLMForQuestionAnswering",
    "XLNetForQuestionAnswering",
    "SeparableConv1D",
    "VisualBertForRegionToPhraseAlignment",
    "VisualBertForVisualReasoning",
    "VisualBertForQuestionAnswering",
    "VisualBertForMultipleChoice",
    "TFWav2Vec2ForCTC",
    "TFHubertForCTC",
    "MaskFormerForInstanceSegmentation",
]

# Update this list for models that have multiple model types for the same
# model doc
MODEL_TYPE_TO_DOC_MAPPING = OrderedDict(
    [
        ("data2vec-text", "data2vec"),
        ("data2vec-audio", "data2vec"),
        ("data2vec-vision", "data2vec"),
    ]
)
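
# With the mapping above, the model types "data2vec-text", "data2vec-audio" and "data2vec-vision"
# all resolve to the single doc page `model_doc/data2vec.mdx` when `check_model_type_doc_match`
# (defined below) compares doc pages against model types.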

# This is to make sure the diffusers module imported is the one in the repo.
spec = importlib.util.spec_from_file_location(
    "diffusers",
    os.path.join(PATH_TO_DIFFUSERS, "__init__.py"),
    submodule_search_locations=[PATH_TO_DIFFUSERS],
)
diffusers = spec.loader.load_module()
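
# Note: `spec.loader.load_module()` is deprecated in recent Python versions. A minimal sketch of
# the modern replacement, reusing the same `spec` as above, would be:
#
#   diffusers = importlib.util.module_from_spec(spec)
#   spec.loader.exec_module(diffusers)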


def check_model_list():
    """Check the model list inside the diffusers library."""
    # Get the models from the directory structure of `src/diffusers/models/`
    models_dir = os.path.join(PATH_TO_DIFFUSERS, "models")
    _models = []
    for model in os.listdir(models_dir):
        model_dir = os.path.join(models_dir, model)
        if os.path.isdir(model_dir) and "__init__.py" in os.listdir(model_dir):
            _models.append(model)

    # Get the models exposed in the init of `diffusers.models`
    models = [model for model in dir(diffusers.models) if not model.startswith("__")]

    missing_models = sorted(set(_models).difference(models))
    if missing_models:
        raise Exception(
            f"The following models should be included in {models_dir}/__init__.py: {','.join(missing_models)}."
        )


# If some modeling modules should be ignored for all checks, they should be added in the nested list
# _ignore_modules of this function.
def get_model_modules():
    """Get the model modules inside the diffusers library."""
    _ignore_modules = [
        "modeling_auto",
        "modeling_encoder_decoder",
        "modeling_marian",
        "modeling_mmbt",
        "modeling_outputs",
        "modeling_retribert",
        "modeling_utils",
        "modeling_flax_auto",
        "modeling_flax_encoder_decoder",
        "modeling_flax_utils",
        "modeling_speech_encoder_decoder",
        "modeling_flax_speech_encoder_decoder",
        "modeling_flax_vision_encoder_decoder",
        "modeling_transfo_xl_utilities",
        "modeling_tf_auto",
        "modeling_tf_encoder_decoder",
        "modeling_tf_outputs",
        "modeling_tf_pytorch_utils",
        "modeling_tf_utils",
        "modeling_tf_transfo_xl_utilities",
        "modeling_tf_vision_encoder_decoder",
        "modeling_vision_encoder_decoder",
    ]
    modules = []
    for model in dir(diffusers.models):
        # There are some magic dunder attributes in the dir, we ignore them
        if not model.startswith("__"):
            model_module = getattr(diffusers.models, model)
            for submodule in dir(model_module):
                if submodule.startswith("modeling") and submodule not in _ignore_modules:
                    modeling_module = getattr(model_module, submodule)
                    if inspect.ismodule(modeling_module):
                        modules.append(modeling_module)
    return modules


def get_models(module, include_pretrained=False):
    """Get the objects in module that are models."""
    models = []
    model_classes = (diffusers.ModelMixin, diffusers.TFModelMixin, diffusers.FlaxModelMixin)
    for attr_name in dir(module):
        if not include_pretrained and ("Pretrained" in attr_name or "PreTrained" in attr_name):
            continue
        attr = getattr(module, attr_name)
        if isinstance(attr, type) and issubclass(attr, model_classes) and attr.__module__ == module.__name__:
            models.append((attr_name, attr))
    return models
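
# For reference, `get_models` returns `(name, class)` pairs for the model classes a modeling module
# defines itself (classes merely imported into the module are skipped via the `__module__` check),
# e.g. (with purely hypothetical module and model names):
#
#   get_models(diffusers.models.unet_2d)
#   # -> [("UNet2DModel", <class 'diffusers.models.unet_2d.UNet2DModel'>)]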


def is_a_private_model(model):
    """Returns True if the model should not be in the main init."""
    if model in PRIVATE_MODELS:
        return True

    # Wrappers, Encoders and Decoders are all private
    if model.endswith("Wrapper"):
        return True
    if model.endswith("Encoder"):
        return True
    if model.endswith("Decoder"):
        return True
    return False


def check_models_are_in_init():
    """Checks that all models defined in the library are in the main init."""
    models_not_in_init = []
    dir_diffusers = dir(diffusers)
    for module in get_model_modules():
        models_not_in_init += [
            model[0] for model in get_models(module, include_pretrained=True) if model[0] not in dir_diffusers
        ]

    # Remove private models
    models_not_in_init = [model for model in models_not_in_init if not is_a_private_model(model)]
    if len(models_not_in_init) > 0:
        raise Exception(f"The following models should be in the main init: {','.join(models_not_in_init)}.")


# If some test_modeling files should be ignored when checking models are all tested, they should be added in the
# nested list _ignore_files of this function.
def get_model_test_files():
    """Get the model test files.

    The returned files do not contain the `tests` prefix (i.e. `PATH_TO_TESTS` defined in this script); they are
    paths relative to `tests`. A caller has to use `os.path.join(PATH_TO_TESTS, ...)` to access the files.
    """
    _ignore_files = [
        "test_modeling_common",
        "test_modeling_encoder_decoder",
        "test_modeling_flax_encoder_decoder",
        "test_modeling_flax_speech_encoder_decoder",
        "test_modeling_marian",
        "test_modeling_tf_common",
        "test_modeling_tf_encoder_decoder",
    ]
    test_files = []
    # Check both `PATH_TO_TESTS` and `PATH_TO_TESTS/models`
    model_test_root = os.path.join(PATH_TO_TESTS, "models")
    model_test_dirs = []
    for x in os.listdir(model_test_root):
        x = os.path.join(model_test_root, x)
        if os.path.isdir(x):
            model_test_dirs.append(x)

    for target_dir in [PATH_TO_TESTS] + model_test_dirs:
        for file_or_dir in os.listdir(target_dir):
            path = os.path.join(target_dir, file_or_dir)
            if os.path.isfile(path):
                filename = os.path.split(path)[-1]
                if "test_modeling" in filename and os.path.splitext(filename)[0] not in _ignore_files:
                    file = os.path.join(*path.split(os.sep)[1:])
                    test_files.append(file)

    return test_files
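
# For example, the file `tests/models/mbart/test_modeling_mbart.py` is returned as
# `models/mbart/test_modeling_mbart.py`, the same form used in `TEST_FILES_WITH_NO_COMMON_TESTS`.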


# This is a bit hacky but I didn't find a way to import the test_file as a module and read inside the tester class
# for the all_model_classes variable.
def find_tested_models(test_file):
    """Parse the content of test_file to detect what's in all_model_classes"""
    with open(os.path.join(PATH_TO_TESTS, test_file), "r", encoding="utf-8", newline="\n") as f:
        content = f.read()
    all_models = re.findall(r"all_model_classes\s+=\s+\(\s*\(([^\)]*)\)", content)
    # Check with one less parenthesis as well
    all_models += re.findall(r"all_model_classes\s+=\s+\(([^\)]*)\)", content)
    if len(all_models) > 0:
        model_tested = []
        for entry in all_models:
            for line in entry.split(","):
                name = line.strip()
                if len(name) > 0:
                    model_tested.append(name)
        return model_tested
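
# For reference, the regexes above capture declarations like the following (illustrative snippet
# from a hypothetical test file):
#
#   all_model_classes = (BertModel, BertForMaskedLM) if is_torch_available() else ()
#
# for which `find_tested_models` returns ["BertModel", "BertForMaskedLM"]. If no declaration is
# found, the function implicitly returns None, which the caller treats as "no common tests".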


def check_models_are_tested(module, test_file):
    """Check models defined in module are tested in test_file."""
    # XxxModelMixin are not tested
    defined_models = get_models(module)
    tested_models = find_tested_models(test_file)
    if tested_models is None:
        if test_file.replace(os.path.sep, "/") in TEST_FILES_WITH_NO_COMMON_TESTS:
            return
        return [
            f"{test_file} should define `all_model_classes` to apply common tests to the models it tests. "
            + "If this is intentional, add the test filename to `TEST_FILES_WITH_NO_COMMON_TESTS` in the file "
            + "`utils/check_repo.py`."
        ]
    failures = []
    for model_name, _ in defined_models:
        if model_name not in tested_models and model_name not in IGNORE_NON_TESTED:
            failures.append(
                f"{model_name} is defined in {module.__name__} but is not tested in "
                + f"{os.path.join(PATH_TO_TESTS, test_file)}. Add it to the all_model_classes in that file. "
                + "If common tests should not be applied to that model, add its name to `IGNORE_NON_TESTED` "
                + "in the file `utils/check_repo.py`."
            )
    return failures


def check_all_models_are_tested():
    """Check all models are properly tested."""
    modules = get_model_modules()
    test_files = get_model_test_files()
    failures = []
    for module in modules:
        test_file = [file for file in test_files if f"test_{module.__name__.split('.')[-1]}.py" in file]
        if len(test_file) == 0:
            failures.append(f"{module.__name__} does not have a corresponding test file.")
        elif len(test_file) > 1:
            failures.append(f"{module.__name__} has several test files: {test_file}.")
        else:
            test_file = test_file[0]
            new_failures = check_models_are_tested(module, test_file)
            if new_failures is not None:
                failures += new_failures
    if len(failures) > 0:
        raise Exception(f"There were {len(failures)} failures:\n" + "\n".join(failures))


def get_all_auto_configured_models():
    """Return the list of all models in at least one auto class."""
    result = set()  # To avoid duplicates we concatenate all model classes in a set.
    if is_torch_available():
        for attr_name in dir(diffusers.models.auto.modeling_auto):
            if attr_name.startswith("MODEL_") and attr_name.endswith("MAPPING_NAMES"):
                result = result | set(get_values(getattr(diffusers.models.auto.modeling_auto, attr_name)))
    if is_tf_available():
        for attr_name in dir(diffusers.models.auto.modeling_tf_auto):
            if attr_name.startswith("TF_MODEL_") and attr_name.endswith("MAPPING_NAMES"):
                result = result | set(get_values(getattr(diffusers.models.auto.modeling_tf_auto, attr_name)))
    if is_flax_available():
        for attr_name in dir(diffusers.models.auto.modeling_flax_auto):
            if attr_name.startswith("FLAX_MODEL_") and attr_name.endswith("MAPPING_NAMES"):
                result = result | set(get_values(getattr(diffusers.models.auto.modeling_flax_auto, attr_name)))
    return list(result)
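
# The attribute names matched above follow the `MODEL_XXX_MAPPING_NAMES` naming convention of the
# auto modules, e.g. (illustrative names) `MODEL_MAPPING_NAMES` or
# `TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES`; `get_values` flattens each mapping into the model class
# names it contains.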


def ignore_unautoclassed(model_name):
    """Rules to determine if `model_name` should be in an auto class."""
    # Special whitelist
    if model_name in IGNORE_NON_AUTO_CONFIGURED:
        return True
    # Encoder and Decoder should be ignored
    if "Encoder" in model_name or "Decoder" in model_name:
        return True
    return False


def check_models_are_auto_configured(module, all_auto_models):
    """Check models defined in module are each in an auto class."""
    defined_models = get_models(module)
    failures = []
    for model_name, _ in defined_models:
        if model_name not in all_auto_models and not ignore_unautoclassed(model_name):
            failures.append(
                f"{model_name} is defined in {module.__name__} but is not present in any of the auto mappings. "
                "If that is intended behavior, add its name to `IGNORE_NON_AUTO_CONFIGURED` in the file "
                "`utils/check_repo.py`."
            )
    return failures


def check_all_models_are_auto_configured():
    """Check all models are each in an auto class."""
    missing_backends = []
    if not is_torch_available():
        missing_backends.append("PyTorch")
    if not is_tf_available():
        missing_backends.append("TensorFlow")
    if not is_flax_available():
        missing_backends.append("Flax")
    if len(missing_backends) > 0:
        missing = ", ".join(missing_backends)
        if os.getenv("TRANSFORMERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES:
            raise Exception(
                "Full quality checks require all backends to be installed (with `pip install -e .[dev]` in the "
                f"Diffusers repo). The following are missing: {missing}."
            )
        else:
            warnings.warn(
                "Full quality checks require all backends to be installed (with `pip install -e .[dev]` in the "
                f"Diffusers repo). The following are missing: {missing}. While it's probably fine as long as you "
                "didn't make any changes to one of those backends' modeling files, you should probably execute the "
                "command above to be on the safe side."
            )
    modules = get_model_modules()
    all_auto_models = get_all_auto_configured_models()
    failures = []
    for module in modules:
        new_failures = check_models_are_auto_configured(module, all_auto_models)
        if new_failures is not None:
            failures += new_failures
    if len(failures) > 0:
        raise Exception(f"There were {len(failures)} failures:\n" + "\n".join(failures))


_re_decorator = re.compile(r"^\s*@(\S+)\s+$")


def check_decorator_order(filename):
    """Check that in the test file `filename`, the slow decorator is always last (i.e. `parameterized`
    decorators always come first)."""
    with open(filename, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()
    decorator_before = None
    errors = []
    for i, line in enumerate(lines):
        search = _re_decorator.search(line)
        if search is not None:
            decorator_name = search.groups()[0]
            if decorator_before is not None and decorator_name.startswith("parameterized"):
                errors.append(i)
            decorator_before = decorator_name
        elif decorator_before is not None:
            decorator_before = None
    return errors
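
# For reference, the ordering this check enforces looks like (illustrative):
#
#   @parameterized.expand(...)  # parameterized must come first
#   @slow                       # slow (and other decorators) come after
#   def test_something(self):
#       ...
#
# Reversing the two decorators would be reported as an error.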


def check_all_decorator_order():
    """Check that in all test files, the slow decorator is always last."""
    errors = []
    for fname in os.listdir(PATH_TO_TESTS):
        if fname.endswith(".py"):
            filename = os.path.join(PATH_TO_TESTS, fname)
            new_errors = check_decorator_order(filename)
            errors += [f"- {filename}, line {i}" for i in new_errors]
    if len(errors) > 0:
        msg = "\n".join(errors)
        raise ValueError(
            "The parameterized decorator (and its variants) should always be first, but this is not the case in the"
            f" following files:\n{msg}"
        )


def find_all_documented_objects():
    """Parse the content of all doc files to detect which classes and functions they document."""
    documented_obj = []
    for doc_file in Path(PATH_TO_DOC).glob("**/*.rst"):
        with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
            content = f.read()
        raw_doc_objs = re.findall(r"(?:autoclass|autofunction):: diffusers.(\S+)\s+", content)
        documented_obj += [obj.split(".")[-1] for obj in raw_doc_objs]
    for doc_file in Path(PATH_TO_DOC).glob("**/*.mdx"):
        with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
            content = f.read()
        raw_doc_objs = re.findall(r"\[\[autodoc\]\]\s+(\S+)\s+", content)
        documented_obj += [obj.split(".")[-1] for obj in raw_doc_objs]
    return documented_obj
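
# For reference, the two directive styles parsed above look like (with an illustrative object name):
#
#   .. autoclass:: diffusers.UNet2DModel    <- legacy rst docs
#   [[autodoc]] UNet2DModel                 <- mdx docs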


# One good reason for not being documented is to be deprecated. Put in this list deprecated objects.
DEPRECATED_OBJECTS = [
    "AutoModelWithLMHead",
    "BartPretrainedModel",
    "DataCollator",
    "DataCollatorForSOP",
    "GlueDataset",
    "GlueDataTrainingArguments",
    "LineByLineTextDataset",
    "LineByLineWithRefDataset",
    "LineByLineWithSOPTextDataset",
    "PretrainedBartModel",
    "PretrainedFSMTModel",
    "SingleSentenceClassificationProcessor",
    "SquadDataTrainingArguments",
    "SquadDataset",
    "SquadExample",
    "SquadFeatures",
    "SquadV1Processor",
    "SquadV2Processor",
    "TFAutoModelWithLMHead",
    "TFBartPretrainedModel",
    "TextDataset",
    "TextDatasetForNextSentencePrediction",
    "Wav2Vec2ForMaskedLM",
    "Wav2Vec2Tokenizer",
    "glue_compute_metrics",
    "glue_convert_examples_to_features",
    "glue_output_modes",
    "glue_processors",
    "glue_tasks_num_labels",
    "squad_convert_examples_to_features",
    "xnli_compute_metrics",
    "xnli_output_modes",
    "xnli_processors",
    "xnli_tasks_num_labels",
    "TFTrainer",
    "TFTrainingArguments",
]

# Exceptionally, some objects should not be documented after all rules passed.
# ONLY PUT SOMETHING IN THIS LIST AS A LAST RESORT!
UNDOCUMENTED_OBJECTS = [
    "AddedToken",  # This is a tokenizers class.
    "BasicTokenizer",  # Internal, should never have been in the main init.
    "CharacterTokenizer",  # Internal, should never have been in the main init.
    "DPRPretrainedReader",  # Like an Encoder.
    "DummyObject",  # Just picked by mistake sometimes.
    "MecabTokenizer",  # Internal, should never have been in the main init.
    "ModelCard",  # Internal type.
    "SqueezeBertModule",  # Internal building block (should have been called SqueezeBertLayer)
    "TFDPRPretrainedReader",  # Like an Encoder.
    "TransfoXLCorpus",  # Internal type.
    "WordpieceTokenizer",  # Internal, should never have been in the main init.
    "absl",  # External module
    "add_end_docstrings",  # Internal, should never have been in the main init.
    "add_start_docstrings",  # Internal, should never have been in the main init.
    "cached_path",  # Internal used for downloading models.
    "convert_tf_weight_name_to_pt_weight_name",  # Internal used to convert model weights
    "logger",  # Internal logger
    "logging",  # External module
    "requires_backends",  # Internal function
]

# This list should be empty. Objects in it should get their own doc page.
SHOULD_HAVE_THEIR_OWN_PAGE = [
    # Benchmarks
    "PyTorchBenchmark",
    "PyTorchBenchmarkArguments",
    "TensorFlowBenchmark",
    "TensorFlowBenchmarkArguments",
]


def ignore_undocumented(name):
    """Rules to determine if `name` should be undocumented."""
    # NOT DOCUMENTED ON PURPOSE.
    # Uppercase constants are not documented.
    if name.isupper():
        return True
    # ModelMixins / Encoders / Decoders / Layers / Embeddings / Attention are not documented.
    if (
        name.endswith("ModelMixin")
        or name.endswith("Decoder")
        or name.endswith("Encoder")
        or name.endswith("Layer")
        or name.endswith("Embeddings")
        or name.endswith("Attention")
    ):
        return True
    # Submodules are not documented.
    if os.path.isdir(os.path.join(PATH_TO_DIFFUSERS, name)) or os.path.isfile(
        os.path.join(PATH_TO_DIFFUSERS, f"{name}.py")
    ):
        return True
    # Load functions are not documented.
    if name.startswith("load_tf") or name.startswith("load_pytorch"):
        return True
    # is_xxx_available functions are not documented.
    if name.startswith("is_") and name.endswith("_available"):
        return True
    # Deprecated objects are not documented.
    if name in DEPRECATED_OBJECTS or name in UNDOCUMENTED_OBJECTS:
        return True
    # MMBT model does not really work.
    if name.startswith("MMBT"):
        return True
    if name in SHOULD_HAVE_THEIR_OWN_PAGE:
        return True
    return False


def check_all_objects_are_documented():
    """Check all objects are properly documented."""
    documented_objs = find_all_documented_objects()
    modules = diffusers._modules
    objects = [c for c in dir(diffusers) if c not in modules and not c.startswith("_")]
    undocumented_objs = [c for c in objects if c not in documented_objs and not ignore_undocumented(c)]
    if len(undocumented_objs) > 0:
        raise Exception(
            "The following objects are in the public init so should be documented:\n - "
            + "\n - ".join(undocumented_objs)
        )
    check_docstrings_are_in_md()
    check_model_type_doc_match()


def check_model_type_doc_match():
    """Check all doc pages have a corresponding model type."""
    model_doc_folder = Path(PATH_TO_DOC) / "model_doc"
    model_docs = [m.stem for m in model_doc_folder.glob("*.mdx")]

    model_types = list(diffusers.models.auto.configuration_auto.MODEL_NAMES_MAPPING.keys())
    model_types = [MODEL_TYPE_TO_DOC_MAPPING[m] if m in MODEL_TYPE_TO_DOC_MAPPING else m for m in model_types]

    errors = []
    for m in model_docs:
        if m not in model_types and m != "auto":
            close_matches = get_close_matches(m, model_types)
            error_message = f"{m} is not a proper model identifier."
            if len(close_matches) > 0:
                close_matches = "/".join(close_matches)
                error_message += f" Did you mean {close_matches}?"
            errors.append(error_message)

    if len(errors) > 0:
        raise ValueError(
            "Some model doc pages do not match any existing model type:\n"
            + "\n".join(errors)
            + "\nYou can add any missing model type to the `MODEL_NAMES_MAPPING` constant in "
            "models/auto/configuration_auto.py."
        )


# Re pattern to catch :obj:`xx`, :class:`xx`, :func:`xx` or :meth:`xx`.
_re_rst_special_words = re.compile(r":(?:obj|func|class|meth):`([^`]+)`")
# Re pattern to catch things between double backquotes.
_re_double_backquotes = re.compile(r"(^|[^`])``([^`]+)``([^`]|$)")
# Re pattern to catch example introduction.
_re_rst_example = re.compile(r"^\s*Example.*::\s*$", flags=re.MULTILINE)


def is_rst_docstring(docstring):
    """
    Returns `True` if `docstring` is written in rst.
    """
    if _re_rst_special_words.search(docstring) is not None:
        return True
    if _re_double_backquotes.search(docstring) is not None:
        return True
    if _re_rst_example.search(docstring) is not None:
        return True
    return False
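
# For reference, a docstring like the following (illustrative) would be flagged as rst, since it
# uses a `:class:` role, double backquotes, and an `Example::` introduction:
#
#   Returns a :class:`~diffusers.ModelMixin` loaded from ``pretrained_model_path``.
#
#   Example::
#
#       model = ...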


def check_docstrings_are_in_md():
    """Check all docstrings are in md."""
    files_with_rst = []
    for file in Path(PATH_TO_DIFFUSERS).glob("**/*.py"):
        with open(file, "r", encoding="utf-8") as f:
            code = f.read()
        docstrings = code.split('"""')

        for idx, docstring in enumerate(docstrings):
            if idx % 2 == 0 or not is_rst_docstring(docstring):
                continue
            files_with_rst.append(file)
            break

    if len(files_with_rst) > 0:
        raise ValueError(
            "The following files have docstrings written in rst:\n"
            + "\n".join([f"- {f}" for f in files_with_rst])
            + "\nTo fix this run `doc-builder convert path_to_py_file` after installing `doc-builder`\n"
            "(`pip install git+https://github.com/huggingface/doc-builder`)"
        )


def check_repo_quality():
    """Check all models are properly tested and documented."""
    print("Checking all models are included.")
    check_model_list()
    print("Checking all models are public.")
    check_models_are_in_init()
    print("Checking all models are properly tested.")
    check_all_decorator_order()
    check_all_models_are_tested()
    print("Checking all objects are properly documented.")
    check_all_objects_are_documented()
    print("Checking all models are in at least one auto class.")
    check_all_models_are_auto_configured()


if __name__ == "__main__":
    check_repo_quality()