# sentinel-universal-tokenizer / deep_benchmark.py
# Author: 5dimension — "Add deep benchmark script" (commit d1551aa, verified)
"""
Deep benchmark: Sentinel Universal Tokenizer vs SOTA
Per-language comparison, edge cases, code/math handling, multimodal features
"""
import json
import math
import os
import numpy as np
from transformers import AutoTokenizer
INV_E = 1.0 / math.e
C1 = -0.007994021805952546
C2 = 0.00020005604296784437
# Load tokenizers
print("Loading tokenizers...")
sentinel = AutoTokenizer.from_pretrained("/app/sentinel_universal_tokenizer_v1")
gpt2 = AutoTokenizer.from_pretrained("gpt2")
gemma = AutoTokenizer.from_pretrained("google/gemma-2b")
qwen = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
tokenizers = {
"Sentinel-SUT (61K)": sentinel,
"GPT-2 (50K)": gpt2,
"Gemma (256K)": gemma,
"Qwen2 (152K)": qwen,
}
# Comprehensive test suite
TEST_SUITE = {
# ── European languages ──────────────────────────────
"English": "Machine learning algorithms optimize gradient-based objective functions through iterative parameter updates. The Sentinel Manifold provides a unified mathematical framework for understanding convergence behavior across diverse optimization landscapes.",
"French": "Les algorithmes d'apprentissage automatique optimisent les fonctions objectives basées sur le gradient grâce à des mises à jour itératives des paramètres. Le manifold Sentinel fournit un cadre mathématique unifié.",
"German": "Algorithmen des maschinellen Lernens optimieren gradientenbasierte Zielfunktionen durch iterative Parameteraktualisierungen. Die Sentinel-Mannigfaltigkeit bietet einen einheitlichen mathematischen Rahmen.",
"Spanish": "Los algoritmos de aprendizaje automático optimizan funciones objetivo basadas en gradientes mediante actualizaciones iterativas de parámetros. El colector Sentinel proporciona un marco matemático unificado.",
"Portuguese": "Os algoritmos de aprendizado de máquina otimizam funções objetivo baseadas em gradientes por meio de atualizações iterativas de parâmetros.",
"Italian": "Gli algoritmi di apprendimento automatico ottimizzano le funzioni obiettivo basate sul gradiente attraverso aggiornamenti iterativi dei parametri.",
"Dutch": "Machine learning-algoritmen optimaliseren op gradiënt gebaseerde doelfuncties door middel van iteratieve parameterupdates.",
"Polish": "Algorytmy uczenia maszynowego optymalizują funkcje celu oparte na gradiencie poprzez iteracyjne aktualizacje parametrów.",
"Swedish": "Maskininlärningsalgoritmer optimerar gradientbaserade målfunktioner genom iterativa parameteruppdateringar.",
"Turkish": "Makine öğrenimi algoritmaları, yinelemeli parametre güncellemeleri yoluyla gradyan tabanlı hedef fonksiyonlarını optimize eder.",
"Ukrainian": "Алгоритми машинного навчання оптимізують цільові функції на основі градієнтів шляхом ітеративного оновлення параметрів.",
# ── Asian languages ─────────────────────────────────
"Chinese": "机器学习算法通过迭代参数更新来优化基于梯度的目标函数。Sentinel流形为理解不同优化景观中的收敛行为提供了统一的数学框架。",
"Japanese": "機械学習アルゴリズムは、反復的なパラメータ更新を通じて勾配ベースの目的関数を最適化します。Sentinel多様体は、多様な最適化ランドスケープにおける収束挙動を理解するための統一的な数学的枠組みを提供します。",
"Korean": "머신러닝 알고리즘은 반복적인 매개변수 업데이트를 통해 그래디언트 기반 목적 함수를 최적화합니다. Sentinel 다양체는 다양한 최적화 환경에서의 수렴 동작을 이해하기 위한 통합 수학적 프레임워크를 제공합니다.",
"Vietnamese": "Các thuật toán học máy tối ưu hóa các hàm mục tiêu dựa trên gradient thông qua cập nhật tham số lặp đi lặp lại.",
"Thai": "อัลกอริทึมการเรียนรู้ของเครื่องเพิ่มประสิทธิภาพฟังก์ชันวัตถุประสงค์ที่อิงตามเกรเดียนต์ผ่านการอัปเดตพารามิเตอร์ซ้ำ",
"Hindi": "मशीन लर्निंग एल्गोरिदम पुनरावृत्तीय पैरामीटर अपडेट के माध्यम से ग्रेडिएंट-आधारित उद्देश्य फ़ंक्शन को अनुकूलित करते हैं।",
# ── Semitic / RTL languages ─────────────────────────
"Arabic": "تعمل خوارزميات التعلم الآلي على تحسين دوال الهدف القائمة على التدرج من خلال تحديثات المعلمات التكرارية. يوفر متنوع سنتينل إطارًا رياضيًا موحدًا.",
"Russian": "Алгоритмы машинного обучения оптимизируют целевые функции на основе градиента посредством итеративных обновлений параметров. Сентинельное многообразие обеспечивает унифицированную математическую структуру.",
# ── Code ─────────────────────────────────────────────
"Python": '''def sentinel_attention(Q, K, V, d_head):
"""Sech attention: bounded gradients, theorem-backed."""
scores = Q @ K.transpose(-2, -1) / math.sqrt(d_head)
attn = 1.0 / torch.cosh(scores) # sech(x) = 1/cosh(x)
attn = attn / (attn.sum(-1, keepdim=True) + 1e-8)
return attn @ V''',
"JavaScript": '''async function trainModel(config) {
const optimizer = new Adam({lr: 1/Math.E, beta1: 0.9});
for (let epoch = 0; epoch < config.epochs; epoch++) {
const loss = await model.trainStep(data);
console.log(`Epoch ${epoch}: loss=${loss.toFixed(6)}`);
if (loss < config.C2) break; // Escape threshold
}
}''',
"Rust": '''fn sentinel_sech(x: f64) -> f64 {
let inv_e: f64 = 1.0 / std::f64::consts::E;
1.0 / (x * inv_e).cosh()
}
fn quantize_sentinel(weight: f64, c1: f64) -> i8 {
let scale = weight.abs() * (1.0 / std::f64::consts::E);
((weight - c1) / scale).round().clamp(-128.0, 127.0) as i8
}''',
# ── Mathematics ──────────────────────────────────────
"LaTeX_Complex": r"\begin{align} F(z) &= \sum_{n=1}^{\infty} \frac{z^n}{n^n} \\ \lim_{z \to \infty} \frac{F'(z)}{F(z)} &= \frac{1}{e} \\ \nabla_\theta \mathcal{L} &= \mathbb{E}_{x \sim p(x)} \left[ \frac{\partial}{\partial \theta} \log p_\theta(x) \right] \end{align}",
"Unicode_Math": "∫₀¹ x⁻ˣ dx = Σ_{n=1}^∞ n⁻ⁿ ≈ 1.291, ∇·E = ρ/ε₀, ∂²u/∂t² = c²∇²u, det(A−λI) = 0",
"Mixed_Notation": "The loss function L(θ) = -1/N Σᵢ [yᵢ log(ŷᵢ) + (1-yᵢ) log(1-ŷᵢ)] converges with rate O(1/√T) when lr = η₀·(1/e)^(t/T).",
# ── Edge cases ───────────────────────────────────────
"Emoji_Heavy": "🦴🧠🔬💡🚀🌍🎯📊 The Sentinel Manifold 🦴 uses sech(x) = 1/cosh(x) for bounded gradients 📈↗️ across all modalities 🖼️🔊🎬",
"Numbers_Heavy": "C₁ = -0.007994021805952546, C₂ = 0.00020005604296784437, 1/e = 0.367879441171442, π = 3.14159265358979, τ = 6.28318530717959",
"URL_Path": "https://huggingface.co/5dimension/sentinel-universal-tokenizer/blob/main/tokenizer.json?download=true#section-3.2",
"Mixed_Script": "The word 'Привет' (Russian) means hello, '你好' (Chinese) also means hello, and 'مرحبا' (Arabic) is the same.",
"Repetition": "the the the the the machine learning machine learning machine learning optimization optimization optimization",
"Whitespace": " Hello World \n\n Multiple spaces and\t\ttabs \n",
"Empty_Adjacent": "word1 word2 word3 word4 word5",
}
print(f"\n{'='*100}")
print(f" DEEP BENCHMARK: SENTINEL UNIVERSAL TOKENIZER vs SOTA")
print(f"{'='*100}")
# Run benchmarks
results = {}
for name, tok in tokenizers.items():
tok_results = {}
for lang, text in TEST_SUITE.items():
try:
enc = tok.encode(text, add_special_tokens=False)
dec = tok.decode(enc, skip_special_tokens=False)
n_tokens = len(enc)
n_bytes = len(text.encode('utf-8'))
n_words = max(len(text.split()), 1)
# Check if decoded text contains the original (byte-level may add Ġ prefix)
clean_dec = dec.replace('Ġ', ' ').replace('▁', ' ').strip()
roundtrip = text.strip() in clean_dec or clean_dec in text.strip()
tok_results[lang] = {
"tokens": n_tokens,
"bytes": n_bytes,
"words": n_words,
"fertility": n_tokens / n_words,
"compression": n_bytes / max(n_tokens, 1),
"roundtrip": roundtrip,
}
except Exception as e:
tok_results[lang] = {"error": str(e)}
results[name] = tok_results
# ── Category Analysis ──
categories = {
"European": ["English", "French", "German", "Spanish", "Portuguese", "Italian", "Dutch", "Polish", "Swedish", "Turkish", "Ukrainian"],
"Asian": ["Chinese", "Japanese", "Korean", "Vietnamese", "Thai", "Hindi"],
"Semitic/RTL": ["Arabic", "Russian"],
"Code": ["Python", "JavaScript", "Rust"],
"Mathematics": ["LaTeX_Complex", "Unicode_Math", "Mixed_Notation"],
"Edge Cases": ["Emoji_Heavy", "Numbers_Heavy", "URL_Path", "Mixed_Script", "Repetition", "Whitespace", "Empty_Adjacent"],
}
print(f"\n {'='*96}")
print(f" CATEGORY ANALYSIS")
print(f" {'='*96}")
for category, langs in categories.items():
print(f"\n ── {category.upper()} ────────────────────────────────────")
# Header
print(f" {'Sample':<18}", end="")
for name in tokenizers:
short = name.split("(")[0].strip()[:12]
print(f" {short:>12}", end="")
print(f" {'Winner':<15}")
print(f" {'-'*18}", end="")
for _ in tokenizers:
print(f" {'-'*12}", end="")
print(f" {'-'*15}")
category_wins = {name: 0 for name in tokenizers}
for lang in langs:
print(f" {lang:<18}", end="")
compressions = {}
for name in tokenizers:
r = results[name].get(lang, {})
if "error" in r:
print(f" {'ERROR':>12}", end="")
else:
comp = r["compression"]
compressions[name] = comp
print(f" {comp:>12.3f}", end="")
if compressions:
winner = max(compressions, key=compressions.get)
category_wins[winner] += 1
short_winner = winner.split("(")[0].strip()[:12]
print(f" {short_winner:<15}")
else:
print()
# Category summary
print(f"\n Category wins: ", end="")
for name, wins in sorted(category_wins.items(), key=lambda x: -x[1]):
short = name.split("(")[0].strip()
if wins > 0:
print(f"{short}: {wins} ", end="")
print()
# ── Overall Summary ──
print(f"\n\n {'='*96}")
print(f" OVERALL SUMMARY")
print(f" {'='*96}")
overall = {}
for name in tokenizers:
fertilities = []
compressions = []
wins = 0
for lang in TEST_SUITE:
r = results[name].get(lang, {})
if "error" not in r:
fertilities.append(r["fertility"])
compressions.append(r["compression"])
# Count compression wins
others = {n: results[n].get(lang, {}).get("compression", 0) for n in tokenizers if n != name}
if r["compression"] > max(others.values(), default=0):
wins += 1
overall[name] = {
"avg_fertility": np.mean(fertilities),
"std_fertility": np.std(fertilities),
"median_fertility": np.median(fertilities),
"avg_compression": np.mean(compressions),
"median_compression": np.median(compressions),
"fairness": 1.0 / (1.0 + np.std(fertilities)),
"wins": wins,
"total_tests": len(fertilities),
}
# Table
print(f"\n {'Metric':<30}", end="")
for name in tokenizers:
short = name.split("(")[0].strip()[:14]
print(f" {short:>14}", end="")
print()
print(f" {'-'*30}", end="")
for _ in tokenizers:
print(f" {'-'*14}", end="")
print()
metrics_to_show = [
("Avg Compression ↑", "avg_compression"),
("Median Compression ↑", "median_compression"),
("Avg Fertility ↓", "avg_fertility"),
("Median Fertility ↓", "median_fertility"),
("Fertility σ ↓", "std_fertility"),
("Fairness ↑", "fairness"),
("Compression Wins", "wins"),
]
for metric_name, metric_key in metrics_to_show:
print(f" {metric_name:<30}", end="")
for name in tokenizers:
val = overall[name][metric_key]
if metric_key == "wins":
print(f" {int(val):>14}", end="")
else:
print(f" {val:>14.4f}", end="")
print()
# ── Efficiency: compression per vocab token ──
print(f"\n EFFICIENCY (compression per 1K vocab tokens):")
for name in tokenizers:
vocab = len(tokenizers[name])
comp = overall[name]["avg_compression"]
efficiency = comp / (vocab / 1000)
short = name.split("(")[0].strip()
print(f" {short:<20}: {efficiency:.6f} (vocab={vocab:,}, compress={comp:.3f})")
# ── Per-vocab-token analysis ──
print(f"\n NORMALIZED PERFORMANCE (accounting for vocabulary size):")
print(f" (Higher = better use of each vocabulary slot)")
for name in tokenizers:
vocab = len(tokenizers[name])
comp = overall[name]["avg_compression"]
# Compression per log2(vocab) — how efficiently each bit of vocab is used
bits = math.log2(vocab)
comp_per_bit = comp / bits
short = name.split("(")[0].strip()
print(f" {short:<20}: {comp_per_bit:.4f} compression per vocab bit (log₂(vocab)={bits:.1f})")
# ── Save full results ──
with open("/app/deep_benchmark_results.json", "w") as f:
json.dump({
"per_sample": {name: {lang: r for lang, r in res.items()} for name, res in results.items()},
"overall": {name: {k: float(v) for k, v in m.items()} for name, m in overall.items()},
"categories": {cat: list(langs) for cat, langs in categories.items()},
}, f, indent=2, default=str)
print(f"\n ✓ Full results saved to /app/deep_benchmark_results.json")
# ── Sentinel-specific features ──
print(f"\n\n {'='*96}")
print(f" SENTINEL-SPECIFIC FEATURES (no other tokenizer has these)")
print(f" {'='*96}")
print(f"\n 1. MULTIMODAL TOKEN ROUTING")
img_start = sentinel.convert_tokens_to_ids("<image_start>")
img_end = sentinel.convert_tokens_to_ids("<image_end>")
aud_start = sentinel.convert_tokens_to_ids("<audio_start>")
vid_start = sentinel.convert_tokens_to_ids("<video_start>")
print(f" <image_start>={img_start}, <image_end>={img_end}")
print(f" <audio_start>={aud_start}, <video_start>={vid_start}")
print(f" Image codebook: <img_0>={sentinel.convert_tokens_to_ids('<img_0>')} to <img_16383>={sentinel.convert_tokens_to_ids('<img_16383>')}")
print(f" Audio codebook: <aud_0>={sentinel.convert_tokens_to_ids('<aud_0>')} to <aud_8191>={sentinel.convert_tokens_to_ids('<aud_8191>')}")
print(f" Video codebook: <vid_0>={sentinel.convert_tokens_to_ids('<vid_0>')} to <vid_4095>={sentinel.convert_tokens_to_ids('<vid_4095>')}")
print(f"\n 2. SENTINEL MANIFOLD TOKENS")
print(f" <sentinel>={sentinel.convert_tokens_to_ids('<sentinel>')}")
print(f" <sentinel_c1>={sentinel.convert_tokens_to_ids('<sentinel_c1>')}")
print(f" <sentinel_c2>={sentinel.convert_tokens_to_ids('<sentinel_c2>')}")
print(f" <scale_1e>={sentinel.convert_tokens_to_ids('<scale_1e>')}")
print(f"\n 3. CHAT FORMAT")
print(f" <system>={sentinel.convert_tokens_to_ids('<system>')}")
print(f" <user>={sentinel.convert_tokens_to_ids('<user>')}")
print(f" <assistant>={sentinel.convert_tokens_to_ids('<assistant>')}")
print(f" <turn>={sentinel.convert_tokens_to_ids('<turn>')}")
print(f"\n 4. CODE/MATH BOUNDARIES")
print(f" <code_start>={sentinel.convert_tokens_to_ids('<code_start>')}")
print(f" <code_end>={sentinel.convert_tokens_to_ids('<code_end>')}")
print(f" <math_start>={sentinel.convert_tokens_to_ids('<math_start>')}")
print(f" <math_end>={sentinel.convert_tokens_to_ids('<math_end>')}")
print(f"\n 5. TASK TOKENS")
print(f" <translate>={sentinel.convert_tokens_to_ids('<translate>')}")
print(f" <summarize>={sentinel.convert_tokens_to_ids('<summarize>')}")
print(f" <generate>={sentinel.convert_tokens_to_ids('<generate>')}")
print(f" <understand>={sentinel.convert_tokens_to_ids('<understand>')}")
print(f" <caption>={sentinel.convert_tokens_to_ids('<caption>')}")
# Multimodal encoding demo
print(f"\n 6. MULTIMODAL ENCODING DEMO")
mm_text = "Describe: <image_start> <img_42> <img_1337> <img_8000> <image_end> A sunset over mountains"
mm_enc = sentinel.encode(mm_text, add_special_tokens=False)
print(f" Input: '{mm_text}'")
print(f" Tokens: {len(mm_enc)}")
print(f" IDs: {mm_enc}")
# Show which modality each token belongs to
print(f" Modality map:")
for tid in mm_enc:
if tid < 33:
mod = "SPECIAL"
elif tid < 32768:
mod = "TEXT"
token_str = sentinel.decode([tid])
elif tid < 49152:
mod = f"IMAGE[{tid-32768}]"
elif tid < 57344:
mod = f"AUDIO[{tid-49152}]"
elif tid < 61440:
mod = f"VIDEO[{tid-57344}]"
else:
mod = "UNKNOWN"
print(f" {tid:>6}: {mod}")
print(f"\n {'='*96}")
print(f" 🦴 DEEP BENCHMARK COMPLETE")
print(f" {'='*96}")