pashto-language-resources / scripts /validate_resource_catalog.py

musaw

Sync main snapshot to Hugging Face (no local binary banner)

2f53244 2 months ago

8.78 kB

	"""Validate the machine-readable Pashto resource catalog.

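	Usage:
	    python scripts/validate_resource_catalog.py
	    python scripts/validate_resource_catalog.py --catalog resources/catalog/resources.json
	    python scripts/validate_resource_catalog.py --schema resources/schema/resource.schema.json

	Exits with status 0 when the catalog is valid and 1 otherwise.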
	"""

	from __future__ import annotations

	import argparse
	import json
	import re
	from datetime import date
	from pathlib import Path
	from typing import Any
	from urllib.parse import urlparse


	# Closed vocabularies for the "category", "source" and "status" resource fields.
	ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper", "project", "code"}
	ALLOWED_SOURCES = {
	    "huggingface",
	    "mozilla",
	    "kaggle",
	    "github",
	    "gitlab",
	    "arxiv",
	    "openalex",
	    "crossref",
	    "zenodo",
	    "dataverse",
	    "datacite",
	    "meta",
	    "other",
	}
	ALLOWED_STATUS = {"verified", "candidate"}
	# Resource ids: lowercase alphanumerics plus ".", "_" and "-", starting with an alphanumeric.
	RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
	# Categories whose entries must carry an explicit Pashto marker in title, URL or evidence.
	STRICT_PASHTO_CATEGORIES = {"model", "paper", "tool", "code", "project"}
	# "pushto" only as a standalone token, i.e. not embedded in a longer alphanumeric run.
	PASHTO_PUSHTO_WORD_RE = re.compile(r"(?<![a-z0-9])pushto(?![a-z0-9])")
	# Pashto language/locale codes (ps, ps_af, pus, pbt-arab variants) as whole words.
	# The alternation bars must stay unescaped: "\|" matches a literal "|" character,
	# which would make the pattern match only the text "ps|pus|pbt_arab" itself.
	PASHTO_CODE_RE = re.compile(r"\b(ps(_af)?|pus|pbt[_-]?arab)\b")


	def _load_json(path: Path) -> dict[str, Any]:
	    """Read *path* as UTF-8 text and return the decoded JSON document."""
	    raw_text = path.read_text(encoding="utf-8")
	    return json.loads(raw_text)


	def _is_valid_http_url(value: str) -> bool:
	    """Return True when *value* is an http(s) URL that has a network location.

	    ``urlparse`` raises ``ValueError`` for some malformed inputs (for example
	    an unbalanced IPv6 bracket such as ``http://[``). Such values are reported
	    as invalid instead of aborting the whole validation run.
	    """
	    try:
	        parsed = urlparse(value)
	    except ValueError:
	        return False
	    return parsed.scheme in {"http", "https"} and bool(parsed.netloc)


	def _validate_iso_date(value: str) -> bool:
	    """Return True when *value* is a real calendar date written as YYYY-MM-DD.

	    ``date.fromisoformat`` alone is not sufficient: from Python 3.11 on it also
	    accepts other ISO 8601 spellings such as ``20240115`` or ``2024-W03-1``.
	    The YYYY-MM-DD shape is therefore checked explicitly first, so the result
	    does not depend on the interpreter version.
	    """
	    if not isinstance(value, str):
	        return False
	    if not re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", value):
	        return False
	    try:
	        # Rejects well-shaped but impossible dates such as 2023-02-29.
	        date.fromisoformat(value)
	    except ValueError:
	        return False
	    return True


	def _contains_pashto_marker(value: str) -> bool:
	    """Tell whether *value* mentions Pashto by name, code or native spelling.

	    Non-string input is never a match. Latin spellings and the regex-based
	    checks run on the case-folded text; the Arabic-script spellings are looked
	    up in the text as given.
	    """
	    if not isinstance(value, str):
	        return False

	    folded = value.casefold()
	    for latin_spelling in ("pashto", "pukhto", "pakhto"):
	        if latin_spelling in folded:
	            return True

	    if PASHTO_PUSHTO_WORD_RE.search(folded) or PASHTO_CODE_RE.search(folded):
	        return True

	    return "پښتو" in value or "پشتو" in value


	def _has_pashto_evidence(evidence: dict[str, Any]) -> bool:
	    """Return True if any textual part of *evidence* contains a Pashto marker.

	    The parts inspected are the ``evidence_text`` and ``evidence_url`` values
	    and every string inside the ``markers`` list; values of other types are
	    ignored.
	    """
	    scalar_values = [evidence.get(key) for key in ("evidence_text", "evidence_url")]
	    candidates: list[str] = [item for item in scalar_values if isinstance(item, str)]

	    marker_list = evidence.get("markers")
	    if isinstance(marker_list, list):
	        candidates += [item for item in marker_list if isinstance(item, str)]

	    for candidate in candidates:
	        if _contains_pashto_marker(candidate):
	            return True
	    return False


	def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
	    """Validate a single catalog entry and collect human-readable errors.

	    Args:
	        resource: Decoded JSON object describing one resource.
	        index: Position of the resource in the catalog; used to prefix messages.

	    Returns:
	        A list of error messages, empty when the resource passes every check.
	        If required fields are missing, or ``pashto_evidence`` is not an
	        object, the remaining checks are skipped and the errors gathered so
	        far are returned.
	    """
	    errors: list[str] = []
	    prefix = f"resource[{index}]"

	    required_fields = {
	        "id",
	        "title",
	        "url",
	        "category",
	        "source",
	        "status",
	        "summary",
	        "primary_use",
	        "pashto_evidence",
	        "tags",
	    }
	    missing = sorted(required_fields - resource.keys())
	    if missing:
	        # The per-field checks below index the dict directly, so stop here.
	        errors.append(f"{prefix} missing required fields: {', '.join(missing)}")
	        return errors

	    rid = resource["id"]
	    if not isinstance(rid, str) or not RESOURCE_ID_RE.fullmatch(rid):
	        errors.append(f"{prefix}.id must match {RESOURCE_ID_RE.pattern}")

	    title = resource["title"]
	    if not isinstance(title, str) or len(title.strip()) < 3:
	        errors.append(f"{prefix}.title must be a non-empty string")

	    url = resource["url"]
	    if not isinstance(url, str) or not _is_valid_http_url(url):
	        errors.append(f"{prefix}.url must be a valid http/https URL")

	    # Enumerated fields are type-checked before the set lookup: a JSON array or
	    # object is unhashable and would make ``in`` raise TypeError instead of
	    # yielding a validation error.
	    category = resource["category"]
	    if not isinstance(category, str) or category not in ALLOWED_CATEGORIES:
	        errors.append(f"{prefix}.category must be one of {sorted(ALLOWED_CATEGORIES)}")

	    source = resource["source"]
	    if not isinstance(source, str) or source not in ALLOWED_SOURCES:
	        errors.append(f"{prefix}.source must be one of {sorted(ALLOWED_SOURCES)}")

	    status = resource["status"]
	    if not isinstance(status, str) or status not in ALLOWED_STATUS:
	        errors.append(f"{prefix}.status must be one of {sorted(ALLOWED_STATUS)}")

	    summary = resource["summary"]
	    if not isinstance(summary, str) or len(summary.strip()) < 10:
	        errors.append(f"{prefix}.summary must be at least 10 characters")

	    primary_use = resource["primary_use"]
	    if not isinstance(primary_use, str) or len(primary_use.strip()) < 3:
	        errors.append(f"{prefix}.primary_use must be a non-empty string")

	    # "tasks" is optional and may be an empty list; when present every item
	    # must be a non-blank string.
	    if "tasks" in resource and not (
	        isinstance(resource["tasks"], list)
	        and all(isinstance(item, str) and item.strip() for item in resource["tasks"])
	    ):
	        errors.append(f"{prefix}.tasks must be a list of strings")

	    tags = resource["tags"]
	    if not (
	        isinstance(tags, list)
	        and tags
	        and all(isinstance(tag, str) and tag.strip() for tag in tags)
	    ):
	        errors.append(f"{prefix}.tags must be a non-empty list of strings")

	    evidence = resource["pashto_evidence"]
	    if not isinstance(evidence, dict):
	        errors.append(f"{prefix}.pashto_evidence must be an object")
	        return errors

	    for key in ("evidence_text", "evidence_url", "markers"):
	        if key not in evidence:
	            errors.append(f"{prefix}.pashto_evidence missing '{key}'")

	    evidence_text = evidence.get("evidence_text")
	    if not isinstance(evidence_text, str) or len(evidence_text.strip()) < 3:
	        errors.append(f"{prefix}.pashto_evidence.evidence_text must be a string")

	    evidence_url = evidence.get("evidence_url")
	    if not isinstance(evidence_url, str) or not _is_valid_http_url(evidence_url):
	        errors.append(f"{prefix}.pashto_evidence.evidence_url must be a valid http/https URL")

	    markers = evidence.get("markers")
	    if not (
	        isinstance(markers, list)
	        and markers
	        and all(isinstance(marker, str) and marker.strip() for marker in markers)
	    ):
	        errors.append(f"{prefix}.pashto_evidence.markers must be a non-empty list of strings")

	    # Same hashability concern as above for the strict-category lookup.
	    is_strict = isinstance(category, str) and category in STRICT_PASHTO_CATEGORIES
	    if is_strict and not (
	        _contains_pashto_marker(title)
	        or _contains_pashto_marker(url)
	        or _has_pashto_evidence(evidence)
	    ):
	        errors.append(
	            f"{prefix} must be Pashto-centric for category '{category}' "
	            "(include a Pashto marker in title, URL, or pashto_evidence)"
	        )

	    return errors


	def validate_catalog(catalog: dict[str, Any]) -> list[str]:
	    """Validate the whole catalog document and return all error messages.

	    Args:
	        catalog: The decoded catalog JSON. It is expected to be an object with
	            ``version``, ``updated_on`` and ``resources`` keys.

	    Returns:
	        A list of error messages, empty when the catalog is valid. Structural
	        problems (non-object document, missing top-level keys, ``resources``
	        not being a list) end validation early.
	    """
	    if not isinstance(catalog, dict):
	        # json.loads may yield a list, string, number, bool or None; the key
	        # lookups below are only meaningful (and safe) on an object.
	        return ["catalog must be a JSON object"]

	    errors: list[str] = [
	        f"catalog missing required top-level key: {key}"
	        for key in ("version", "updated_on", "resources")
	        if key not in catalog
	    ]
	    if errors:
	        return errors

	    version = catalog["version"]
	    if not isinstance(version, str) or not re.fullmatch(r"^\d+\.\d+\.\d+$", version):
	        errors.append("catalog.version must look like '1.0.0'")

	    updated_on = catalog["updated_on"]
	    if not isinstance(updated_on, str) or not _validate_iso_date(updated_on):
	        errors.append("catalog.updated_on must be a valid ISO date (YYYY-MM-DD)")

	    resources = catalog["resources"]
	    if not isinstance(resources, list):
	        errors.append("catalog.resources must be a list")
	        return errors

	    seen_ids: set[str] = set()
	    for index, resource in enumerate(resources):
	        if not isinstance(resource, dict):
	            errors.append(f"resource[{index}] must be an object")
	            continue
	        errors.extend(validate_resource(resource, index))
	        # Duplicate detection only considers string ids; other id types were
	        # already reported by validate_resource.
	        resource_id = resource.get("id")
	        if isinstance(resource_id, str):
	            if resource_id in seen_ids:
	                errors.append(f"duplicate resource id: {resource_id}")
	            seen_ids.add(resource_id)

	    return errors


	def main() -> int:
	    """Run the command-line validator and return a process exit code.

	    Reads the ``--catalog`` and ``--schema`` paths from the command line,
	    loads both JSON files, sanity-checks the schema file and validates the
	    catalog, printing the outcome.

	    Returns:
	        0 when the catalog is valid; 1 when a file is missing or unreadable,
	        contains invalid JSON, the schema file lacks a ``$schema`` key, or the
	        catalog has validation errors.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--catalog", default="resources/catalog/resources.json")
	    parser.add_argument("--schema", default="resources/schema/resource.schema.json")
	    args = parser.parse_args()

	    catalog_path = Path(args.catalog)
	    schema_path = Path(args.schema)

	    # is_file() rather than exists(): a directory at either path cannot be
	    # read as JSON and is reported like a missing file instead of crashing.
	    if not catalog_path.is_file():
	        print(f"Missing catalog file: {catalog_path}")
	        return 1
	    if not schema_path.is_file():
	        print(f"Missing schema file: {schema_path}")
	        return 1

	    try:
	        schema = _load_json(schema_path)
	        catalog = _load_json(catalog_path)
	    except json.JSONDecodeError as exc:
	        print(f"Invalid JSON: {exc}")
	        return 1
	    except (OSError, UnicodeDecodeError) as exc:
	        # Unreadable file or bytes that are not valid UTF-8.
	        print(f"Could not read input file: {exc}")
	        return 1

	    # Basic schema sanity check (this script enforces the validation rules directly).
	    if not isinstance(schema, dict) or "$schema" not in schema:
	        print("Schema file must be a JSON object with a '$schema' key")
	        return 1

	    errors = validate_catalog(catalog)
	    if errors:
	        print("Resource catalog validation failed:")
	        for error in errors:
	            print(f"- {error}")
	        return 1

	    print(f"Resource catalog valid: {len(catalog['resources'])} resources")
	    return 0


	if __name__ == "__main__":
	    # Hand main()'s return value to the interpreter as the exit status.
	    exit_status = main()
	    raise SystemExit(exit_status)