{
  "model_id": "mindbomber/aana",
  "architecture": "Alignment-Aware Neural Architecture",
  "system_model": "S = (f_theta, E_phi, R, Pi_psi, G)",
  "results": [
    {
      "benchmark": "PIIMB",
      "submission": "https://huggingface.co/datasets/piimb/pii-masking-benchmark-results/discussions/3",
      "model_card": "https://huggingface.co/mindbomber/aana-presidio-piimb-policy-v1",
      "dataset": "piimb/pii-masking-benchmark",
      "dataset_revision": "df8299e90ff053fa6fd1d3678f6693a454f4ecc0",
      "subset": "sentences",
      "base_detector": "microsoft/presidio-analyzer",
      "base_average_masking_f2": 0.4492985573,
      "aana_average_masking_f2": 0.5629171363,
      "delta_average_masking_f2": 0.113618579,
      "base_average_recall": 0.4008557794,
      "aana_average_recall": 0.5159532273,
      "delta_average_recall": 0.1150974479,
      "scope": "official PIIMB submission showing AANA verifier/correction gain over the same specialist detector"
    },
    {
      "benchmark": "PIIMB",
      "submission": "https://huggingface.co/datasets/piimb/pii-masking-benchmark-results/discussions/2",
      "model_card": "https://huggingface.co/mindbomber/aana-piimb-policy-baseline",
      "dataset": "piimb/pii-masking-benchmark",
      "dataset_revision": "df8299e90ff053fa6fd1d3678f6693a454f4ecc0",
      "subset": "sentences",
      "aana_average_masking_f2": 0.5195345497,
      "scope": "official PIIMB submission for a zero-parameter deterministic policy baseline"
    },
    {
      "benchmark": "TruthfulQA",
      "dataset": "truthfulqa/truthful_qa",
      "configuration": "multiple_choice",
      "split": "validation",
      "sample_size": 100,
      "base_generator": "openai/gpt-4o-mini",
      "mc1_accuracy": 0.85,
      "scope": "local AANA-gated run and public artifact publication, not an official leaderboard submission"
    }
  ],
  "claim_limits": [
    "AANA is a runtime architecture, not a standalone neural-weight checkpoint.",
    "Current public results do not claim state-of-the-art performance.",
    "Current public results do not guarantee hallucination removal, PII removal, or production safety.",
    "Production readiness requires external deployment evidence beyond local benchmark results."
  ]
}
|