Token Classification
Transformers
ONNX
Safetensors
English
Irish
distilbert
pii
de-identification
ireland
irish
gaelic
diffusion-style
denoising
ppsn
eircode
int8
dynamic-quantization
cpu
Eval Results (legacy)
Instructions to use temsa/IrishCore-DiffMask-135M-v1-rc2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use temsa/IrishCore-DiffMask-135M-v1-rc2 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="temsa/IrishCore-DiffMask-135M-v1-rc2")

# Load model directly
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("temsa/IrishCore-DiffMask-135M-v1-rc2")
model = AutoModel.from_pretrained("temsa/IrishCore-DiffMask-135M-v1-rc2")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "release": "IrishCore-DiffMask-135M-v1-rc2", | |
| "base_model": "OpenMed/OpenMed-PII-mLiteClinical-Base-135M-v1", | |
| "public_references": { | |
| "rc5": "temsa/OpenMed-mLiteClinical-IrishCorePII-135M-v2-rc5", | |
| "rc8": "temsa/OpenMed-mLiteClinical-IrishCorePII-135M-v2-rc8" | |
| }, | |
| "task": "Irish core PII detection and masking in English and Irish Gaelic", | |
| "coverage": [ | |
| "PPSN", | |
| "ACCOUNT_NUMBER", | |
| "BANK_ROUTING_NUMBER", | |
| "CREDIT_DEBIT_CARD", | |
| "PASSPORT_NUMBER", | |
| "POSTCODE", | |
| "PHONE_NUMBER", | |
| "EMAIL", | |
| "FIRST_NAME", | |
| "LAST_NAME", | |
| "SWIFT_BIC" | |
| ], | |
| "architecture": { | |
| "family": "DistilBERT-size token-span extractor", | |
| "diffusion_style_training": true, | |
| "runtime_diffusion": false, | |
| "scanner_free": true, | |
| "validator_free": true, | |
| "heads": [ | |
| "token_presence_head", | |
| "typed_start_boundary_head", | |
| "typed_end_boundary_head" | |
| ] | |
| }, | |
| "training_data": { | |
| "published": [ | |
| "temsa/OpenMed-Irish-CorePII-TrainMix-v1", | |
| "temsa/OpenMed-Irish-PPSN-Eircode-Spec-v1", | |
| "joelniklaus/mapa", | |
| "gretelai/synthetic_pii_finance_multilingual" | |
| ], | |
| "local_synthetic_hardening_sets": [ | |
| "irish_dllm_hardening_v1", | |
| "dllm_gap_patch_v1", | |
| "dllm_gap_patch_v2", | |
| "dllm_gap_patch_v3", | |
| "dllm_gap_patch_v4", | |
| "dllm_uat_replay_v1", | |
| "dllm_uat_patch_v2", | |
| "irish_core_diffmask_v2_mix", | |
| "irish_core_diffmask_v3_mix", | |
| "irish_core_diffmask_v4_mix", | |
| "irish_core_diffmask_v5_mix", | |
| "irish_core_diffmask_focus_v1" | |
| ], | |
| "selection_note": "The published checkpoint was selected from multiple continuation and interpolation runs to balance Irish core, multilingual PPSN, hardening performance, and the UAT replay exact suite after fixing label contamination in the v5 mix." | |
| }, | |
| "training_recipe": { | |
| "noise_schedule_family": "linear masked denoising schedule", | |
| "runtime_diffusion": false, | |
| "train_time_diffusion_steps": 4, | |
| "start_noise_fraction": 0.65, | |
| "end_noise_fraction": 0.05, | |
| "loss": "average BCE losses over token presence and typed boundaries across noised passes" | |
| }, | |
| "release_selection": { | |
| "published_checkpoint": "selected interpolation blend used for rc2", | |
| "selection_strategy": "interpolation blend between the stronger broad-coverage DiffMask checkpoint and the cleaned v5 continuation", | |
| "reason": "This blend gave the best overall deployment-path tradeoff once the new UAT replay exact suite was added." | |
| }, | |
| "known_remaining_misses": [ | |
| "Second phone number inside the long Client Identity Services sentence: 071 967 2616", | |
| "Postcode inside the longer allocation-centre block: R93 EC57", | |
| "Email mailbox form: EPStamp4@enterprise.gov.ie", | |
| "One D02 XY45 address form from the UAT replay suite" | |
| ], | |
| "references": [ | |
| { | |
| "title": "BERT", | |
| "url": "https://arxiv.org/abs/1810.04805" | |
| }, | |
| { | |
| "title": "DistilBERT", | |
| "url": "https://arxiv.org/abs/1910.01108" | |
| }, | |
| { | |
| "title": "Boundary Smoothing for Named Entity Recognition", | |
| "url": "https://aclanthology.org/2022.acl-long.490/" | |
| }, | |
| { | |
| "title": "SPANNER: Named Entity Re-/Recognition as Span Prediction", | |
| "url": "https://aclanthology.org/2021.acl-long.558/" | |
| }, | |
| { | |
| "title": "LLaDA 2.0: Scaling Up Diffusion Language Models to 100B", | |
| "url": "https://arxiv.org/abs/2512.15745" | |
| }, | |
| { | |
| "title": "Scaling Diffusion Language Models via Adaptation from Autoregressive Models", | |
| "url": "https://arxiv.org/abs/2410.17891" | |
| } | |
| ] | |
| } | |