# GR00T — scripts/validate_hf_config_alignment.py
# (Hugging Face repository page residue: uploaded by yqi19,
#  commit "add: source files (batch 4)", b88b79e, verified.)
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Validate HuggingFace config alignment against source-of-truth definitions.
Usage:
# Internal consistency checks only (no HF download required):
uv run python scripts/validate_hf_config_alignment.py
# Full check with HF configs (requires auth + local dirs):
uv run python scripts/validate_hf_config_alignment.py --hf-config-dir /tmp/hf_configs
"""
import argparse
import json
import math
from pathlib import Path
import re
import sys
REPO_ROOT = Path(__file__).resolve().parent.parent
PASS = "\033[92mβœ“ PASS\033[0m"
FAIL = "\033[91mβœ— FAIL\033[0m"
WARN = "\033[93m⚠ WARN\033[0m"
INFO = "\033[94mβ„Ή INFO\033[0m"
SKIP = "\033[90m⊘ SKIP\033[0m"
pass_count = 0
fail_count = 0
warn_count = 0
skip_count = 0
def check(condition, msg, *, warn_only=False, skip=False):
    """Record and print a single validation result.

    Precedence: skip overrides everything, then a truthy condition passes,
    then warn_only demotes a failure to a warning. Increments the matching
    global counter and returns False only for a hard failure.
    """
    global pass_count, fail_count, warn_count, skip_count
    if skip:
        skip_count += 1
        label, outcome = SKIP, True
    elif condition:
        pass_count += 1
        label, outcome = PASS, True
    elif warn_only:
        warn_count += 1
        label, outcome = WARN, True
    else:
        fail_count += 1
        label, outcome = FAIL, False
    print(f" {label} {msg}")
    return outcome
def info(msg):
    """Print an informational line; does not affect the pass/fail tallies."""
    print(" {} {}".format(INFO, msg))
# ──────────────────────── Source-of-Truth Loaders ────────────────────────
def load_modality_configs():
    """Return MODALITY_CONFIGS (from embodiment_configs.py) as plain serializable dicts."""
    # Make the repo importable when the script is run from anywhere.
    sys.path.insert(0, str(REPO_ROOT))
    from gr00t.configs.data.embodiment_configs import MODALITY_CONFIGS
    from gr00t.data.utils import to_json_serializable

    return to_json_serializable(MODALITY_CONFIGS)
def load_model_config_defaults():
    """Instantiate and return Gr00tN1d7Config with all default values."""
    sys.path.insert(0, str(REPO_ROOT))
    from gr00t.configs.model.gr00t_n1d7 import Gr00tN1d7Config

    return Gr00tN1d7Config()
def load_embodiment_tags():
    """Return (EmbodimentTag enum, PRETRAIN_TAGS, POSTTRAIN_TAGS) from the source tree."""
    sys.path.insert(0, str(REPO_ROOT))
    from gr00t.data.embodiment_tags import POSTTRAIN_TAGS, PRETRAIN_TAGS, EmbodimentTag

    return EmbodimentTag, PRETRAIN_TAGS, POSTTRAIN_TAGS
def load_projector_index():
    """Return the EMBODIMENT_TAG_TO_PROJECTOR_INDEX mapping from the processing module."""
    sys.path.insert(0, str(REPO_ROOT))
    from gr00t.model.gr00t_n1d7.processing_gr00t_n1d7 import (
        EMBODIMENT_TAG_TO_PROJECTOR_INDEX,
    )

    return EMBODIMENT_TAG_TO_PROJECTOR_INDEX
# ──────────────────────── HF Model Definitions ────────────────────────
# Registry of published checkpoints to validate.
#   hf_id:           HuggingFace repo id (also checked against README.md in E1).
#   type:            "base" or "finetuned" (informational label; not read by checks).
#   embodiment_tags: tags whose modality configs must appear in the checkpoint's
#                    processor_config.json (Dimensions A and D iterate these).
#   subdir:          optional sub-folder inside the downloaded config directory
#                    holding the JSON files (see load_hf_json), or None.
HF_MODELS = {
    "GR00T-N1.7-3B": {
        "hf_id": "nvidia/GR00T-N1.7-3B",
        "type": "base",
        "embodiment_tags": [
            "oxe_droid_relative_eef_relative_joint",
            "xdof_relative_eef_relative_joint",
            "xdof_relative_eef_relative_joint_subtask",
            "real_g1_relative_eef_relative_joints",
            "real_r1_pro_sharpa_relative_eef",
            "real_r1_pro_sharpa_relative_eef_human",
            "real_r1_pro_sharpa_relative_eef_maxinsights",
            "real_r1_pro_sharpa_relative_eef_mecka",
        ],
        "subdir": None,
    },
    "GR00T-N1.7-DROID": {
        "hf_id": "nvidia/GR00T-N1.7-DROID",
        "type": "finetuned",
        "embodiment_tags": ["oxe_droid_relative_eef_relative_joint"],
        "subdir": None,
    },
    "GR00T-N1.7-LIBERO": {
        "hf_id": "nvidia/GR00T-N1.7-LIBERO",
        "type": "finetuned",
        "embodiment_tags": ["libero_sim"],
        "subdir": "libero_10",
    },
    "SimplerEnv-Fractal": {
        "hf_id": "nvidia/GR00T-N1.7-SimplerEnv-Fractal",
        "type": "finetuned",
        "embodiment_tags": ["simpler_env_google"],
        "subdir": None,
    },
    "SimplerEnv-Bridge": {
        "hf_id": "nvidia/GR00T-N1.7-SimplerEnv-Bridge",
        "type": "finetuned",
        "embodiment_tags": ["simpler_env_widowx"],
        "subdir": None,
    },
}
# ──────────────────────── Dimension F & Internal Consistency ────────────────────────
def check_dim_f_internal_consistency():
    """Dimension F — Cross-file consistency (source-of-truth only)."""
    print("\n" + "=" * 70)
    print("DIMENSION F — Internal Source-of-Truth Consistency")
    print("=" * 70)
    modality_configs = load_modality_configs()
    model_cfg = load_model_config_defaults()
    EmbodimentTag, PRETRAIN_TAGS, POSTTRAIN_TAGS = load_embodiment_tags()
    projector_index = load_projector_index()

    # F3: each embodiment's actual action horizon must fit the model maximum.
    print("\n[F3] Action horizon ≤ model max capacity")
    max_horizon = model_cfg.action_horizon
    for emb_tag, emb_cfg in modality_configs.items():
        horizon = len(emb_cfg["action"]["delta_indices"])
        check(
            horizon <= max_horizon,
            f" {emb_tag}: actual={horizon} ≤ max={max_horizon}",
        )

    # F5: tags with modality configs must have projector indices, and every
    # projector-index key must be a real EmbodimentTag value.
    print("\n[F5] EMBODIMENT_TAG_TO_PROJECTOR_INDEX ↔ EmbodimentTag enum")
    for member in EmbodimentTag:
        if member.value not in modality_configs:
            continue
        check(
            member.value in projector_index,
            f" {member.value} in MODALITY_CONFIGS → has projector index: {projector_index.get(member.value, 'MISSING')}",
        )
    enum_values = {m.value for m in EmbodimentTag}
    for key in projector_index:
        check(
            key in enum_values,
            f" projector index key '{key}' → is valid EmbodimentTag value",
        )

    # F6: informational notes about known key-name differences between
    # the model config and the processor config.
    print("\n[F6] Known naming mismatches (informational)")
    info(f"Model config: action_horizon={model_cfg.action_horizon}")
    info("Processor uses: max_action_horizon (same value, different key name)")
    info(f"Model config: use_albumentations_transforms={model_cfg.use_albumentations_transforms}")
    info("Processor uses: use_albumentations (same semantics, different key name)")
def check_dim_e_documentation():
    """Dimension E — README & Documentation Consistency.

    E1: checkpoint ids appear in the top-level README; E2: example commands
    use EmbodimentTag enum NAMES; E4: DROID README mentions the DROID
    modality keys; E5: example modality.json keys match MODALITY_CONFIGS;
    E7: report --action-horizon values found in example commands.
    """
    print("\n" + "=" * 70)
    print("DIMENSION E — README & Documentation Consistency")
    print("=" * 70)
    EmbodimentTag, PRETRAIN_TAGS, POSTTRAIN_TAGS = load_embodiment_tags()
    modality_configs = load_modality_configs()

    # E1: Checkpoint table in README.md
    print("\n[E1] Checkpoint table in README.md")
    readme = (REPO_ROOT / "README.md").read_text()
    for model_name, model_info in HF_MODELS.items():
        check(
            model_info["hf_id"] in readme,
            f" {model_info['hf_id']} found in README.md",
        )

    # E2: --embodiment-tag in example commands uses enum NAMES
    print("\n[E2] --embodiment-tag uses enum NAMES in example commands")
    example_readmes = {
        "DROID": REPO_ROOT / "examples/DROID/README.md",
        "LIBERO": REPO_ROOT / "examples/LIBERO/README.md",
        "SimplerEnv": REPO_ROOT / "examples/SimplerEnv/README.md",
    }
    tag_name_to_value = {m.name: m.value for m in EmbodimentTag}
    for name, path in example_readmes.items():
        if not path.exists():
            check(False, f" {path} exists", skip=True)
            continue
        content = path.read_text()
        tags_in_commands = re.findall(r"--embodiment-tag\s+(\S+)", content)
        for tag in tags_in_commands:
            is_enum_name = tag in tag_name_to_value
            is_enum_value = tag in {m.value for m in EmbodimentTag}
            check(
                is_enum_name,
                f" {name}: --embodiment-tag {tag} is valid enum NAME"
                + (" (used value instead of name)" if is_enum_value and not is_enum_name else ""),
            )

    # E4: DROID modality table
    print("\n[E4] DROID modality table matches MODALITY_CONFIGS")
    droid_readme_path = REPO_ROOT / "examples/DROID/README.md"
    if not droid_readme_path.exists():
        # Fix: previously this read_text() ran unconditionally and crashed
        # with FileNotFoundError when the README was absent, even though the
        # E2 block above tolerates missing example READMEs. Record a skip.
        check(False, f" {droid_readme_path} exists", skip=True)
    else:
        droid_readme = droid_readme_path.read_text()
        droid_cfg = modality_configs.get("oxe_droid_relative_eef_relative_joint", {})
        if droid_cfg:
            for vkey in droid_cfg["video"]["modality_keys"]:
                check(vkey in droid_readme, f" Video key '{vkey}' mentioned in DROID README")
            for skey in droid_cfg["state"]["modality_keys"]:
                check(skey in droid_readme, f" State key '{skey}' mentioned in DROID README")
            # A lower-cased search covers both "17D" and "17d" (the original
            # also tested "17D" separately, which was redundant).
            check(
                "17d" in droid_readme.lower(),
                " 17D dimension mentioned in DROID README",
                warn_only=True,
            )

    # E5: Example modality.json files match MODALITY_CONFIGS
    print("\n[E5] Example modality.json ↔ MODALITY_CONFIGS key consistency")
    modality_json_map = {
        "simpler_env_google": REPO_ROOT / "examples/SimplerEnv/fractal_modality.json",
        "simpler_env_widowx": REPO_ROOT / "examples/SimplerEnv/bridge_modality.json",
        "libero_sim": REPO_ROOT / "examples/LIBERO/modality.json",
    }
    for tag, json_path in modality_json_map.items():
        if not json_path.exists():
            check(False, f" {json_path} exists", skip=True)
            continue
        with open(json_path) as f:
            mj = json.load(f)
        code_cfg = modality_configs.get(tag, {})
        if not code_cfg:
            check(False, f" {tag} in MODALITY_CONFIGS")
            continue
        # Key ORDER matters here: dict order in modality.json must equal the
        # list order in MODALITY_CONFIGS.
        mj_state_keys = list(mj.get("state", {}).keys())
        code_state_keys = code_cfg["state"]["modality_keys"]
        check(
            mj_state_keys == code_state_keys,
            f" {tag} state keys: modality.json={mj_state_keys} vs code={code_state_keys}",
        )
        mj_action_keys = list(mj.get("action", {}).keys())
        code_action_keys = code_cfg["action"]["modality_keys"]
        check(
            mj_action_keys == code_action_keys,
            f" {tag} action keys: modality.json={mj_action_keys} vs code={code_action_keys}",
        )
        mj_video_keys = list(mj.get("video", {}).keys())
        code_video_keys = code_cfg["video"]["modality_keys"]
        check(
            mj_video_keys == code_video_keys,
            f" {tag} video keys: modality.json={mj_video_keys} vs code={code_video_keys}",
        )

    # E7: --action-horizon in commands (reported, not asserted)
    print("\n[E7] --action-horizon in commands ≤ embodiment actual horizon")
    for name, path in example_readmes.items():
        if not path.exists():
            continue
        content = path.read_text()
        horizons = re.findall(r"--action-horizon\s+(\d+)", content)
        for h in horizons:
            info(f" {name}: --action-horizon {h} found in commands")
def check_dim_f2_modality_json():
    """Dimension F2 — MODALITY_CONFIGS ↔ examples/*/modality.json.

    Structural check only: compares state/action key COUNTS between each
    example modality.json and the code-side MODALITY_CONFIGS entry.
    """
    print("\n" + "=" * 70)
    print("DIMENSION F2 — MODALITY_CONFIGS ↔ modality.json Structural Check")
    print("=" * 70)
    modality_configs = load_modality_configs()
    modality_json_files = {
        "simpler_env_google": REPO_ROOT / "examples/SimplerEnv/fractal_modality.json",
        "simpler_env_widowx": REPO_ROOT / "examples/SimplerEnv/bridge_modality.json",
        "libero_sim": REPO_ROOT / "examples/LIBERO/modality.json",
    }
    for tag, json_path in modality_json_files.items():
        print(f"\n [{tag}]")
        if not json_path.exists():
            check(False, f" {json_path.name} exists", skip=True)
            continue
        with open(json_path) as f:
            mj = json.load(f)
        # Fix: the original indexed modality_configs[tag] directly, so a tag
        # missing from MODALITY_CONFIGS raised KeyError and aborted the whole
        # run; now it is recorded as a normal check failure (mirrors E5).
        code_cfg = modality_configs.get(tag)
        if code_cfg is None:
            check(False, f" {tag} in MODALITY_CONFIGS")
            continue
        code_state_count = len(code_cfg["state"]["modality_keys"])
        mj_state_count = len(mj.get("state", {}))
        check(
            code_state_count == mj_state_count,
            f" State key count: code={code_state_count} vs modality.json={mj_state_count}",
        )
        code_action_count = len(code_cfg["action"]["modality_keys"])
        mj_action_count = len(mj.get("action", {}))
        check(
            code_action_count == mj_action_count,
            f" Action key count: code={code_action_count} vs modality.json={mj_action_count}",
        )
# ──────────────────────── Dimension J β€” Enum Serialization ────────────────────────
def check_dim_j_enum_serialization():
    """Dimension J — Verify enum serialization uses names not values."""
    print("\n" + "=" * 70)
    print("DIMENSION J — Enum Serialization Format (code-level)")
    print("=" * 70)
    modality_configs = load_modality_configs()
    # Allowed serialized enum NAMES (values like "relative" would be wrong).
    valid_rep_names = {"RELATIVE", "ABSOLUTE"}
    valid_type_names = {"EEF", "NON_EEF"}
    valid_format_names = {"DEFAULT", "XYZ_ROT6D", "ROTATION_6D", "SCALAR"}
    for tag, cfg in modality_configs.items():
        entries = cfg.get("action", {}).get("action_configs")
        if not entries:
            # Tags without per-action configs have nothing to validate.
            continue
        print(f"\n [{tag}]")
        for i, entry in enumerate(entries):
            rep = entry.get("rep")
            rep_ok = rep in valid_rep_names
            check(
                rep_ok,
                f" action_configs[{i}].rep = '{rep}' (valid name: {rep_ok})",
            )
            atype = entry.get("type")
            type_ok = atype in valid_type_names
            check(
                type_ok,
                f" action_configs[{i}].type = '{atype}' (valid name: {type_ok})",
            )
            afmt = entry.get("format")
            if afmt:
                fmt_ok = afmt in valid_format_names
                check(
                    fmt_ok,
                    f" action_configs[{i}].format = '{afmt}' (valid name: {fmt_ok})",
                )
# ──────────────────────── HF Config Checks (require downloads) ────────────────────────
def load_hf_json(base_dir, model_name, filename, subdir=None):
    """Read a JSON file from a downloaded model directory.

    Looks under base_dir/model_name (plus optional subdir). Returns the
    parsed object, or None when the file does not exist.
    """
    parts = [model_name] + ([subdir] if subdir else []) + [filename]
    path = Path(base_dir).joinpath(*parts)
    if not path.exists():
        return None
    return json.loads(path.read_text())
def check_dim_a_processor_config(hf_dir, model_name, model_def):
    """Dimension A — processor_config.json checks for a single model.

    Compares the downloaded processor_config.json against the in-repo
    MODALITY_CONFIGS (per-tag checks A1-A9) and a table of expected scalar
    processor kwargs (A10-A31).

    Args:
        hf_dir: Root directory holding one sub-directory per model.
        model_name: Key into HF_MODELS (also the sub-directory name).
        model_def: The HF_MODELS entry (embodiment_tags, subdir, ...).
    """
    print(f"\n--- {model_name} ---")
    pc = load_hf_json(hf_dir, model_name, "processor_config.json", model_def.get("subdir"))
    if pc is None:
        # Config not downloaded locally — record a skip, not a failure.
        check(False, "processor_config.json found", skip=True)
        return
    modality_configs = load_modality_configs()
    # A10: processor_class must name the expected processor implementation.
    check(
        pc.get("processor_class") == "Gr00tN1d7Processor",
        f"[A10] processor_class = '{pc.get('processor_class')}' (expected 'Gr00tN1d7Processor')",
    )
    pk = pc.get("processor_kwargs", {})
    # A1: every embodiment tag this model claims must appear in the
    # published modality_configs.
    hf_modality_keys = list(pk.get("modality_configs", {}).keys())
    for expected_tag in model_def["embodiment_tags"]:
        check(
            expected_tag in hf_modality_keys,
            f"[A1] modality_configs contains '{expected_tag}'",
        )
    # Per-tag modality checks (A2-A9)
    for tag in model_def["embodiment_tags"]:
        hf_mc = pk.get("modality_configs", {}).get(tag)
        code_mc = modality_configs.get(tag)
        if not hf_mc:
            check(False, f"[A1] {tag} present in HF modality_configs")
            continue
        if not code_mc:
            # Tag has no code-side config; treated as a pretrain-only tag.
            info(f" {tag} not in code MODALITY_CONFIGS (pretrain-only tag, expected)")
            continue
        # A2: video.delta_indices must match exactly.
        check(
            hf_mc["video"]["delta_indices"] == code_mc["video"]["delta_indices"],
            f"[A2] {tag} video.delta_indices: HF={hf_mc['video']['delta_indices']} vs code={code_mc['video']['delta_indices']}",
        )
        # A3: video.modality_keys — only the COUNT is compared.
        hf_vkeys = hf_mc["video"]["modality_keys"]
        code_vkeys = code_mc["video"]["modality_keys"]
        check(
            len(hf_vkeys) == len(code_vkeys),
            f"[A3] {tag} video key count: HF={len(hf_vkeys)} ({hf_vkeys}) vs code={len(code_vkeys)} ({code_vkeys})",
        )
        # A4: state.delta_indices
        check(
            hf_mc["state"]["delta_indices"] == code_mc["state"]["delta_indices"],
            f"[A4] {tag} state.delta_indices: HF={hf_mc['state']['delta_indices']} vs code={code_mc['state']['delta_indices']}",
        )
        # A5: state.modality_keys (order-sensitive list equality)
        check(
            hf_mc["state"]["modality_keys"] == code_mc["state"]["modality_keys"],
            f"[A5] {tag} state.modality_keys match",
        )
        # A6: action.delta_indices
        check(
            hf_mc["action"]["delta_indices"] == code_mc["action"]["delta_indices"],
            f"[A6] {tag} action.delta_indices: HF len={len(hf_mc['action']['delta_indices'])} vs code len={len(code_mc['action']['delta_indices'])}",
        )
        # A7: action.modality_keys
        check(
            hf_mc["action"]["modality_keys"] == code_mc["action"]["modality_keys"],
            f"[A7] {tag} action.modality_keys match",
        )
        # A8: action.action_configs — presence, count, then per-entry fields.
        hf_ac = hf_mc["action"].get("action_configs")
        code_ac = code_mc["action"].get("action_configs")
        if code_ac:
            check(
                hf_ac is not None,
                f"[A8] {tag} action.action_configs present in HF",
            )
            if hf_ac:
                check(
                    len(hf_ac) == len(code_ac),
                    f"[A8] {tag} action_configs count: HF={len(hf_ac)} vs code={len(code_ac)}",
                )
                for i, (h, c) in enumerate(zip(hf_ac, code_ac)):
                    for field in ("rep", "type", "format"):
                        check(
                            h.get(field) == c.get(field),
                            f"[A8] {tag} action_configs[{i}].{field}: HF={h.get(field)} vs code={c.get(field)}",
                        )
        # A9: language.modality_keys
        check(
            hf_mc["language"]["modality_keys"] == code_mc["language"]["modality_keys"],
            f"[A9] {tag} language.modality_keys match",
        )
    # A11-A31: scalar parameters. A None expected value means "report only"
    # (no single source-of-truth value); a concrete value is asserted.
    scalar_checks = {
        "max_state_dim": ("A11", None),
        "max_action_dim": ("A12", None),
        "max_action_horizon": ("A13", None),
        "model_name": ("A14", "nvidia/Cosmos-Reason2-2B"),
        "model_type": ("A15", "qwen"),
        "use_percentiles": ("A16", None),
        "apply_sincos_state_encoding": ("A17", None),
        "use_relative_action": ("A18", None),
        "formalize_language": ("A19", True),
        "clip_outliers": ("A20", True),
        "use_mean_std": ("A21", False),
        "letter_box_transform": ("A22", None),
        "exclude_state": ("A23", None),
        "state_dropout_prob": ("A24", None),
        "image_crop_size": ("A25", None),
        "image_target_size": ("A26", None),
        "shortest_image_edge": ("A27", 256),
        "crop_fraction": ("A28", 0.95),
        "use_albumentations": ("A29", None),
        "random_rotation_angle": ("A30", None),
        "color_jitter_params": ("A31", None),
    }
    for field, (item_id, expected) in scalar_checks.items():
        actual = pk.get(field)
        if expected is not None:
            check(
                actual == expected,
                f"[{item_id}] {field}: HF={actual!r} (expected {expected!r})",
            )
        else:
            info(f"[{item_id}] {field} = {actual!r}")
def check_dim_b_config_json(hf_dir, model_name, model_def):
    """Dimension B — config.json checks for a single model.

    Compares the downloaded config.json against Gr00tN1d7Config defaults
    (B1-B30), the nested diffusion_model_cfg (B8-B10), legacy field names
    (I4), dtype reporting (B26/I2), and the architectures list (I1).

    Args:
        hf_dir: Root directory holding one sub-directory per model.
        model_name: Key into HF_MODELS (also the sub-directory name).
        model_def: The HF_MODELS entry (embodiment_tags, subdir, ...).
    """
    print(f"\n--- {model_name} ---")
    cfg = load_hf_json(hf_dir, model_name, "config.json", model_def.get("subdir"))
    if cfg is None:
        # Config not downloaded locally — record a skip, not a failure.
        check(False, "config.json found", skip=True)
        return
    model_cfg = load_model_config_defaults()
    # Item id -> (field name, expected value). A None expected value means
    # "report only"; otherwise the HF value must equal the code default.
    b_checks = {
        "B1": ("model_type", "Gr00tN1d7"),
        "B2": ("max_state_dim", None),
        "B3": ("max_action_dim", None),
        "B4": ("action_horizon", model_cfg.action_horizon),
        "B5": ("backbone_embedding_dim", model_cfg.backbone_embedding_dim),
        "B6": ("hidden_size", model_cfg.hidden_size),
        "B7": ("input_embedding_dim", model_cfg.input_embedding_dim),
        "B11": ("num_inference_timesteps", model_cfg.num_inference_timesteps),
        "B12": ("max_num_embodiments", model_cfg.max_num_embodiments),
        "B13": ("model_name", "nvidia/Cosmos-Reason2-2B"),
        "B14": ("select_layer", model_cfg.select_layer),
        "B15": ("state_history_length", model_cfg.state_history_length),
        "B16": ("noise_beta_alpha", model_cfg.noise_beta_alpha),
        "B17": ("noise_beta_beta", model_cfg.noise_beta_beta),
        "B18": ("noise_s", model_cfg.noise_s),
        "B19": ("num_timestep_buckets", model_cfg.num_timestep_buckets),
        "B20": ("add_pos_embed", model_cfg.add_pos_embed),
        "B21": ("attn_dropout", model_cfg.attn_dropout),
        "B22": ("use_vlln", model_cfg.use_vlln),
        "B23": ("max_seq_len", model_cfg.max_seq_len),
        "B24": ("use_alternate_vl_dit", model_cfg.use_alternate_vl_dit),
        "B25": ("attend_text_every_n_blocks", model_cfg.attend_text_every_n_blocks),
        "B27": ("backbone_model_type", model_cfg.backbone_model_type),
        "B28": ("reproject_vision", model_cfg.reproject_vision),
        "B29": ("use_percentiles", model_cfg.use_percentiles),
        "B30": ("use_relative_action", model_cfg.use_relative_action),
    }
    for item_id, (field, expected) in b_checks.items():
        actual = cfg.get(field)
        if expected is not None:
            check(
                actual == expected,
                f"[{item_id}] {field}: HF={actual!r} (expected {expected!r})",
            )
        else:
            info(f"[{item_id}] {field} = {actual!r}")
    # B8-B10: nested diffusion_model_cfg values (hard-coded expectations).
    diff_cfg = cfg.get("diffusion_model_cfg", {})
    check(
        diff_cfg.get("num_layers") == 16,
        f"[B8] diffusion_model_cfg.num_layers: {diff_cfg.get('num_layers')} (expected 16)",
    )
    check(
        diff_cfg.get("num_attention_heads") == 32,
        f"[B9] diffusion_model_cfg.num_attention_heads: {diff_cfg.get('num_attention_heads')} (expected 32)",
    )
    check(
        diff_cfg.get("attention_head_dim") == 48,
        f"[B10] diffusion_model_cfg.attention_head_dim: {diff_cfg.get('attention_head_dim')} (expected 48)",
    )
    # I4: legacy/internal field names must not leak into the published config.
    legacy_fields = ["vlm_model_path", "GrootN1d7"]
    for lf in legacy_fields:
        check(lf not in cfg, f"[I4] No legacy field '{lf}' in config.json")
    # B26 / I2: dtype is reported only — either key name is accepted.
    dtype_val = cfg.get("torch_dtype") or cfg.get("model_dtype")
    info(f"[B26/I2] torch_dtype/model_dtype = {dtype_val!r}")
    # I1: architectures list should include the model class when present.
    archs = cfg.get("architectures")
    if archs is not None:
        check(
            "Gr00tN1d7" in archs,
            f"[I1] architectures contains 'Gr00tN1d7': {archs}",
        )
    else:
        info("[I1] 'architectures' field not present")
def check_dim_c_embodiment_id(hf_dir, model_name, model_def):
    """Dimension C — embodiment_id.json checks.

    C1: every HF entry must agree with the code-side projector index
    (unknown tags are warnings). C2: every pretrain tag must be present.
    """
    print(f"\n--- {model_name} ---")
    eid = load_hf_json(hf_dir, model_name, "embodiment_id.json", model_def.get("subdir"))
    if eid is None:
        check(False, "embodiment_id.json found", skip=True)
        return
    projector_index = load_projector_index()
    # C1: all entries match code
    for tag, idx in eid.items():
        code_idx = projector_index.get(tag)
        if code_idx is None:
            check(
                False, f"[C1] {tag} not in code EMBODIMENT_TAG_TO_PROJECTOR_INDEX", warn_only=True
            )
        else:
            check(
                idx == code_idx,
                f"[C1] {tag}: HF={idx} vs code={code_idx}",
            )
    # C2: pretrain tags present (derived from source of truth)
    _, PRETRAIN_TAGS, _ = load_embodiment_tags()
    for tag_value in (t.value for t in PRETRAIN_TAGS):
        check(
            tag_value in eid,
            f"[C2] Pretrain tag '{tag_value}' present in embodiment_id.json",
        )
def check_dim_d_statistics(hf_dir, model_name, model_def):
    """Dimension D — statistics.json checks.

    D1/D2: per-embodiment structure; D3: modality key coverage against
    processor_config; D4: required normalization fields; D6: every numeric
    value in the statistics tree is finite.
    """
    print(f"\n--- {model_name} ---")
    stats = load_hf_json(hf_dir, model_name, "statistics.json", model_def.get("subdir"))
    pc = load_hf_json(hf_dir, model_name, "processor_config.json", model_def.get("subdir"))
    if stats is None:
        check(False, "statistics.json found", skip=True)
        return
    pk = pc.get("processor_kwargs", {}) if pc else {}
    # Percentile fields are required only when the processor normalizes with
    # percentiles (treated as True when the key is absent).
    use_percentiles = pk.get("use_percentiles", True)

    # D6 helper: recursively assert every numeric leaf is finite.
    # Fix: the original list branch only inspected top-level numeric items,
    # so NaN/Inf hiding in nested containers (e.g. a list of lists) was
    # silently skipped; non-numeric list items now recurse. Also hoisted out
    # of the per-tag loop (it was redefined on every iteration).
    def check_finite(obj, path=""):
        if isinstance(obj, dict):
            for k, v in obj.items():
                check_finite(v, f"{path}/{k}")
        elif isinstance(obj, list):
            for i, v in enumerate(obj):
                if isinstance(v, (int, float)):
                    check(
                        math.isfinite(v),
                        f"[D6] {path}[{i}] = {v} is finite",
                    )
                else:
                    check_finite(v, f"{path}[{i}]")
        elif isinstance(obj, (int, float)):
            check(math.isfinite(obj), f"[D6] {path} = {obj} is finite")

    for tag in model_def["embodiment_tags"]:
        tag_stats = stats.get(tag)
        check(tag_stats is not None, f"[D1] Top-level key '{tag}' in statistics.json")
        if not tag_stats:
            continue
        # D2: state/action sub-dicts
        check("state" in tag_stats, f"[D2] {tag} has 'state' sub-dict")
        check("action" in tag_stats, f"[D2] {tag} has 'action' sub-dict")
        # D3: every modality key promised by processor_config has statistics.
        hf_mc = pk.get("modality_configs", {}).get(tag, {})
        for modality in ("state", "action"):
            if modality not in tag_stats or modality not in hf_mc:
                continue
            expected_keys = hf_mc[modality].get("modality_keys", [])
            actual_keys = list(tag_stats[modality].keys())
            for ek in expected_keys:
                check(
                    ek in actual_keys,
                    f"[D3] {tag}/{modality}: key '{ek}' in statistics",
                )
        # D4: min/max always required; percentile fields when use_percentiles.
        for modality in ("state", "action"):
            if modality not in tag_stats:
                continue
            for key, key_stats in tag_stats[modality].items():
                check(
                    "min" in key_stats and "max" in key_stats,
                    f"[D4] {tag}/{modality}/{key}: has min/max",
                )
                if use_percentiles:
                    has_pct = "q01" in key_stats or "p01" in key_stats
                    check(
                        has_pct,
                        f"[D4] {tag}/{modality}/{key}: has percentile fields (use_percentiles={use_percentiles})",
                    )
        # D6: No NaN/Inf anywhere in this tag's statistics tree.
        check_finite(tag_stats, f"{tag}")
def check_dim_f1_cross_file(hf_dir, model_name, model_def):
    """Dimension F1 — config.json ↔ processor_config.json agreement."""
    print(f"\n--- {model_name} ---")
    cfg = load_hf_json(hf_dir, model_name, "config.json", model_def.get("subdir"))
    pc = load_hf_json(hf_dir, model_name, "processor_config.json", model_def.get("subdir"))
    if cfg is None or pc is None:
        check(False, "Both config.json and processor_config.json found", skip=True)
        return
    pk = pc.get("processor_kwargs", {})
    # (item id, config.json key, processor_kwargs key) — the two files must
    # agree on each pair; action_horizon is stored under a different key name
    # on the processor side.
    comparisons = [
        ("F1", "max_state_dim", "max_state_dim"),
        ("F1", "max_action_dim", "max_action_dim"),
        ("F1", "action_horizon", "max_action_horizon"),
        ("F7", "use_percentiles", "use_percentiles"),
        ("F7", "use_relative_action", "use_relative_action"),
        ("B13", "model_name", "model_name"),
    ]
    for item_id, cfg_key, pk_key in comparisons:
        cfg_val = cfg.get(cfg_key)
        pk_val = pk.get(pk_key)
        if cfg_key == pk_key:
            msg = f"[{item_id}] {cfg_key}: config.json={cfg_val} vs processor={pk_val}"
        else:
            msg = f"[{item_id}] {cfg_key}={cfg_val} vs {pk_key}={pk_val}"
        check(cfg_val == pk_val, msg)
# ──────────────────────── Test Fixture Check ────────────────────────
def check_test_fixture():
    """Check the test fixture processor_config against source of truth.

    Validates tests/fixtures/processor_config/ (processor_config.json,
    embodiment_id.json, statistics.json): processor class, the libero_sim
    modality config, notable scalar params (reported), newer fields that may
    be missing in older fixtures (warn-only), projector indices, and basic
    statistics structure. Skips entirely if any fixture file is absent.
    """
    print("\n" + "=" * 70)
    print("TEST FIXTURE — tests/fixtures/processor_config/ Check")
    print("=" * 70)
    fixture_dir = REPO_ROOT / "tests/fixtures/processor_config"
    pc_path = fixture_dir / "processor_config.json"
    eid_path = fixture_dir / "embodiment_id.json"
    stats_path = fixture_dir / "statistics.json"
    if not pc_path.exists():
        check(False, "Test fixture processor_config.json exists", skip=True)
        return
    with open(pc_path) as f:
        pc = json.load(f)
    if not eid_path.exists():
        check(False, "Test fixture embodiment_id.json exists", skip=True)
        return
    with open(eid_path) as f:
        eid = json.load(f)
    if not stats_path.exists():
        check(False, "Test fixture statistics.json exists", skip=True)
        return
    with open(stats_path) as f:
        stats = json.load(f)
    modality_configs = load_modality_configs()
    model_cfg = load_model_config_defaults()
    projector_index = load_projector_index()
    pk = pc.get("processor_kwargs", {})
    # processor_class must name the expected processor implementation.
    check(
        pc.get("processor_class") == "Gr00tN1d7Processor",
        f"processor_class = '{pc.get('processor_class')}'",
    )
    # modality_configs: the fixture is expected to carry the libero_sim tag.
    hf_mc = pk.get("modality_configs", {}).get("libero_sim")
    code_mc = modality_configs.get("libero_sim")
    check(hf_mc is not None, "modality_configs contains 'libero_sim'")
    if hf_mc and code_mc:
        # video delta_indices must match exactly
        check(
            hf_mc["video"]["delta_indices"] == code_mc["video"]["delta_indices"],
            f"video.delta_indices: fixture={hf_mc['video']['delta_indices']} vs code={code_mc['video']['delta_indices']}",
        )
        # video keys — only the COUNT is compared
        check(
            len(hf_mc["video"]["modality_keys"]) == len(code_mc["video"]["modality_keys"]),
            f"video key count: fixture={len(hf_mc['video']['modality_keys'])} vs code={len(code_mc['video']['modality_keys'])}",
        )
        # state keys (order-sensitive list equality)
        check(
            hf_mc["state"]["modality_keys"] == code_mc["state"]["modality_keys"],
            "state.modality_keys match",
        )
        # action delta_indices
        check(
            hf_mc["action"]["delta_indices"] == code_mc["action"]["delta_indices"],
            f"action.delta_indices: fixture len={len(hf_mc['action']['delta_indices'])} vs code len={len(code_mc['action']['delta_indices'])}",
        )
        # action keys
        check(
            hf_mc["action"]["modality_keys"] == code_mc["action"]["modality_keys"],
            "action.modality_keys match",
        )
        # language keys
        check(
            hf_mc["language"]["modality_keys"] == code_mc["language"]["modality_keys"],
            "language.modality_keys match",
        )
    # Scalar params — reported (not asserted) so mismatches are visible
    # without failing the run.
    print("\n Scalar Parameter Comparison (fixture vs model config defaults):")
    info(
        f"max_state_dim: fixture={pk.get('max_state_dim')} vs model_cfg default={model_cfg.max_state_dim}"
    )
    info(
        f"max_action_dim: fixture={pk.get('max_action_dim')} vs model_cfg default={model_cfg.max_action_dim}"
    )
    info(
        f"max_action_horizon: fixture={pk.get('max_action_horizon')} vs model_cfg.action_horizon={model_cfg.action_horizon}"
    )
    info(
        f"use_percentiles: fixture={pk.get('use_percentiles')} vs model_cfg={model_cfg.use_percentiles}"
    )
    info(
        f"apply_sincos_state_encoding: fixture={pk.get('apply_sincos_state_encoding')} vs model_cfg={model_cfg.apply_sincos_state_encoding}"
    )
    info(
        f"use_relative_action: fixture={pk.get('use_relative_action')} vs model_cfg={model_cfg.use_relative_action}"
    )
    # Fields added to save_pretrained later on may legitimately be absent
    # from older fixtures, hence warn-only.
    expected_fields = [
        "letter_box_transform",
        "exclude_state",
        "state_dropout_prob",
        "use_mean_std",
    ]
    print("\n New Fields Check (may be missing in older fixtures):")
    for field in expected_fields:
        present = field in pk
        check(present, f"Field '{field}' present in fixture processor_config", warn_only=True)
    # embodiment_id.json: every fixture entry must match the code-side index.
    print("\n Embodiment ID Check:")
    for tag, idx in eid.items():
        code_idx = projector_index.get(tag)
        check(
            code_idx is not None and idx == code_idx,
            f" {tag}: fixture={idx} vs code={code_idx}",
        )
    # statistics.json: each configured tag needs state/action statistics.
    print("\n Statistics Structure Check:")
    for tag in pk.get("modality_configs", {}).keys():
        check(tag in stats, f" statistics.json has key '{tag}'")
        if tag in stats:
            check("state" in stats[tag], f" {tag}/state present")
            check("action" in stats[tag], f" {tag}/action present")
# ──────────────────────── Main ────────────────────────
def main():
    """Run all alignment validations, print a summary, and exit 1 on failure."""
    parser = argparse.ArgumentParser(description="Validate HF config alignment")
    parser.add_argument(
        "--hf-config-dir",
        type=str,
        default=None,
        help="Directory containing downloaded HF configs (subdirs per model)",
    )
    args = parser.parse_args()
    print("╔" + "═" * 68 + "╗")
    print("║ HuggingFace Config Alignment Validation ║")
    print("╚" + "═" * 68 + "╝")
    # Internal consistency suites always run (no HF downloads required).
    for suite in (
        check_dim_f_internal_consistency,
        check_dim_e_documentation,
        check_dim_f2_modality_json,
        check_dim_j_enum_serialization,
        check_test_fixture,
    ):
        suite()
    if args.hf_config_dir:
        hf_dir = Path(args.hf_config_dir)
        if not hf_dir.exists():
            print(f"\n[ERROR] HF config directory not found: {hf_dir}")
            sys.exit(1)
        # Per-model suites: run each dimension across every registered model.
        hf_suites = [
            ("DIMENSION A — processor_config.json", check_dim_a_processor_config),
            ("DIMENSION B — config.json", check_dim_b_config_json),
            ("DIMENSION C — embodiment_id.json", check_dim_c_embodiment_id),
            ("DIMENSION D — statistics.json", check_dim_d_statistics),
            ("DIMENSION F1 — Cross-file", check_dim_f1_cross_file),
        ]
        for header, suite_fn in hf_suites:
            for model_name, model_def in HF_MODELS.items():
                print("\n" + "=" * 70)
                print(f"{header}: {model_name}")
                print("=" * 70)
                suite_fn(hf_dir, model_name, model_def)
    else:
        print("\n" + "=" * 70)
        print("HF CONFIG CHECKS SKIPPED — No --hf-config-dir provided")
        print("To run full checks, download HF configs first:")
        print(" uv run huggingface-cli login")
        print(" # Then download configs for each model (see checklist)")
        print(
            " uv run python scripts/validate_hf_config_alignment.py --hf-config-dir /tmp/hf_configs"
        )
        print("=" * 70)
    # Summary of global tallies accumulated by check().
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f" {PASS}: {pass_count}")
    print(f" {FAIL}: {fail_count}")
    print(f" {WARN}: {warn_count}")
    print(f" {SKIP}: {skip_count}")
    total = pass_count + fail_count
    if total > 0:
        print(f" Pass rate: {pass_count}/{total} ({100 * pass_count / total:.1f}%)")
    if fail_count > 0:
        sys.exit(1)
# Allow use both as a script and as an importable module.
if __name__ == "__main__":
    main()