quickgrid committed on
Commit
fb8c813
·
verified ·
1 Parent(s): 36774ea

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +248 -18
index.html CHANGED
@@ -1,19 +1,249 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </html>
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Universal Tokenizer Visualizer</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ <link href="https://cdn.jsdelivr.net/npm/@fontsource/inter@5.0.16/index.min.css" rel="stylesheet">
9
+ <style>
10
+ body { font-family: 'Inter', sans-serif; background-color: #0f172a; }
11
+ .token-chip {
12
+ transition: all 0.15s cubic-bezier(0.4, 0, 0.2, 1);
13
+ cursor: default;
14
+ position: relative;
15
+ }
16
+ .token-chip:hover {
17
+ transform: translateY(-2px) scale(1.03);
18
+ z-index: 20;
19
+ box-shadow: 0 8px 16px -4px rgba(0,0,0,0.5);
20
+ }
21
+ .token-chip::after {
22
+ content: attr(data-tooltip);
23
+ position: absolute;
24
+ bottom: 110%;
25
+ left: 50%;
26
+ transform: translateX(-50%) translateY(5px);
27
+ background: #1e293b;
28
+ color: #e2e8f0;
29
+ padding: 6px 10px;
30
+ border-radius: 6px;
31
+ font-size: 0.75rem;
32
+ white-space: nowrap;
33
+ opacity: 0;
34
+ pointer-events: none;
35
+ transition: all 0.2s;
36
+ border: 1px solid #334155;
37
+ box-shadow: 0 4px 12px rgba(0,0,0,0.3);
38
+ z-index: 30;
39
+ }
40
+ .token-chip:hover::after { opacity: 1; transform: translateX(-50%) translateY(0); }
41
+ .spinner {
42
+ width: 20px; height: 20px;
43
+ border: 2px solid rgba(255,255,255,0.3);
44
+ border-radius: 50%;
45
+ border-top-color: #fff;
46
+ animation: spin 0.8s linear infinite;
47
+ }
48
+ @keyframes spin { to { transform: rotate(360deg); } }
49
+ ::-webkit-scrollbar { width: 8px; height: 8px; }
50
+ ::-webkit-scrollbar-track { background: #1e293b; border-radius: 4px; }
51
+ ::-webkit-scrollbar-thumb { background: #475569; border-radius: 4px; }
52
+ ::-webkit-scrollbar-thumb:hover { background: #64748b; }
53
+ .fade-in { animation: fadeIn 0.3s ease-out forwards; }
54
+ @keyframes fadeIn { from { opacity: 0; transform: translateY(5px); } to { opacity: 1; transform: translateY(0); } }
55
+ </style>
56
+ </head>
57
+ <body class="min-h-screen text-slate-100 flex flex-col">
58
+ <div class="flex-grow max-w-6xl w-full mx-auto p-4 md:p-8">
59
+ <header class="mb-8 flex flex-col md:flex-row md:items-end justify-between gap-4">
60
+ <div>
61
+ <h1 class="text-3xl md:text-4xl font-bold bg-gradient-to-r from-cyan-400 via-blue-500 to-purple-500 bg-clip-text text-transparent tracking-tight">
62
+ 🧩 Tokenizer Visualizer
63
+ </h1>
64
+ <p class="text-slate-400 mt-2 text-sm md:text-base">
65
+ Load any HuggingFace model tokenizer directly in your browser. Client-side only, no GPU required.
66
+ </p>
67
+ </div>
68
+ <div class="flex gap-2 text-xs text-slate-500">
69
+ <span class="px-2 py-1 bg-slate-800 rounded border border-slate-700">Browser-Native</span>
70
+ <span class="px-2 py-1 bg-slate-800 rounded border border-slate-700">Extensible</span>
71
+ </div>
72
+ </header>
73
+
74
+ <div class="grid grid-cols-1 lg:grid-cols-12 gap-6">
75
+ <!-- Controls panel: model ID, input text, the Tokenize button and a status line announced to assistive tech -->
76
+ <div class="lg:col-span-4 bg-slate-800/50 backdrop-blur border border-slate-700 rounded-xl p-5 flex flex-col gap-4 h-fit shadow-xl">
77
+ <div>
78
+ <label for="modelId" class="block text-sm font-medium text-slate-300 mb-1.5">Model ID / Path</label>
79
+ <input type="text" id="modelId" value="gpt2"
80
+ class="w-full bg-slate-900 border border-slate-600 rounded-lg px-3 py-2.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500 transition"
81
+ placeholder="e.g., gpt2, deepseek-ai/DeepSeek-V3, meta-llama/Llama-3-8B">
82
+ </div>
83
+
84
+ <div class="flex-grow">
85
+ <label for="inputText" class="block text-sm font-medium text-slate-300 mb-1.5">Input Text</label>
86
+ <textarea id="inputText" rows="7"
87
+ class="w-full bg-slate-900 border border-slate-600 rounded-lg px-3 py-2.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500 resize-y transition"
88
+ placeholder="Type or paste text to visualize tokenization..."></textarea>
89
+ </div>
90
+
91
+ <button id="tokenizeBtn" type="button"
92
+ class="w-full bg-blue-600 hover:bg-blue-500 active:bg-blue-700 text-white font-semibold py-2.5 px-4 rounded-lg transition flex items-center justify-center gap-2 disabled:opacity-50 disabled:cursor-not-allowed shadow-lg shadow-blue-900/30">
93
+ <span>Tokenize</span>
94
+ </button>
95
+
96
+ <div id="status" role="status" class="text-sm text-slate-400 h-5 flex items-center gap-2 truncate"></div>
97
+ </div>
98
+
99
+ <!-- Visualization Panel -->
100
+ <div class="lg:col-span-8 bg-slate-800/50 backdrop-blur border border-slate-700 rounded-xl p-5 flex flex-col shadow-xl min-h-[450px]">
101
+ <div class="flex flex-wrap justify-between items-center mb-4 pb-3 border-b border-slate-700 gap-3">
102
+ <h2 class="text-lg font-semibold text-slate-100">Token Breakdown</h2>
103
+ <div class="flex flex-wrap gap-3 text-sm">
104
+ <div class="flex items-center gap-1.5 bg-slate-900 px-3 py-1.5 rounded-md border border-slate-700">
105
+ <span class="w-2 h-2 rounded-full bg-blue-400"></span>
106
+ <span class="text-slate-300">Tokens: <b id="statTokens" class="text-white">0</b></span>
107
+ </div>
108
+ <div class="flex items-center gap-1.5 bg-slate-900 px-3 py-1.5 rounded-md border border-slate-700">
109
+ <span class="w-2 h-2 rounded-full bg-purple-400"></span>
110
+ <span class="text-slate-300">Chars: <b id="statChars" class="text-white">0</b></span>
111
+ </div>
112
+ <div class="flex items-center gap-1.5 bg-slate-900 px-3 py-1.5 rounded-md border border-slate-700">
113
+ <span class="w-2 h-2 rounded-full bg-pink-400"></span>
114
+ <span class="text-slate-300">Ratio: <b id="statRatio" class="text-white">0.00</b></span>
115
+ </div>
116
+ </div>
117
+ </div>
118
+
119
+ <div id="outputGrid" class="flex flex-wrap gap-2 overflow-y-auto p-2 content-start flex-grow min-h-[300px] bg-slate-900/40 rounded-lg border border-slate-800/50">
120
+ <div class="w-full text-center text-slate-500 py-16 select-none">
121
+ <svg class="w-12 h-12 mx-auto mb-3 opacity-40" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M13 10V3L4 14h7v7l9-11h-7z"></path></svg>
122
+ <p>Enter text and click <span class="text-blue-400 font-medium">Tokenize</span> to see the breakdown.</p>
123
+ <p class="text-xs mt-1 opacity-60">Supports BPE, ByteLevel, WordPiece, Unigram (via transformers.js)</p>
124
+ </div>
125
+ </div>
126
+ </div>
127
+ </div>
128
+
129
+ <footer class="mt-8 text-center text-slate-500 text-xs flex flex-col md:flex-row items-center justify-center gap-2">
130
+ <span>Powered by <code class="bg-slate-800 px-1.5 py-0.5 rounded text-blue-300">@huggingface/transformers</code> v3</span>
131
+ <span>•</span>
132
+ <span>Runs entirely client-side via WebAssembly/JS</span>
133
+ <span>•</span>
134
+ <span>No server/GPU required</span>
135
+ </footer>
136
+ </div>
137
+
138
+ <script type="module">
139
+ import { AutoTokenizer } from 'https://esm.sh/@huggingface/transformers@3.0.2';
140
+
141
+ const COLORS = [ // Hex palette; getColor() indexes into it and renderTokens appends alpha suffixes to these values.
142
+ '#3b82f6', '#8b5cf6', '#ec4899', '#f59e0b', '#10b981',
143
+ '#06b6d4', '#6366f1', '#f97316', '#84cc16', '#14b8a6'
144
+ ];
145
+
146
+ const els = { // DOM nodes looked up once by id and shared by the functions below.
147
+ modelId: document.getElementById('modelId'),
148
+ inputText: document.getElementById('inputText'),
149
+ btn: document.getElementById('tokenizeBtn'),
150
+ output: document.getElementById('outputGrid'),
151
+ status: document.getElementById('status'),
152
+ statTokens: document.getElementById('statTokens'),
153
+ statChars: document.getElementById('statChars'),
154
+ statRatio: document.getElementById('statRatio')
155
+ };
156
+
157
+ let tokenizerCache = null; // Last tokenizer returned by AutoTokenizer.from_pretrained; null until the first successful load.
158
+
159
+ function getColor(idx) { const slot = idx % COLORS.length; return COLORS[slot]; } // palette index wraps around
160
+
161
+ function setStatus(msg, type = 'info') {
162
+ els.status.textContent = msg;
163
+ els.status.className = `text-sm h-5 flex items-center gap-2 truncate ${type === 'error' ? 'text-red-400' : type === 'success' ? 'text-green-400' : 'text-slate-400'}`;
164
+ }
165
+
166
+ els.btn.addEventListener('click', async () => {
167
+ const modelId = els.modelId.value.trim();
168
+ const text = els.inputText.value;
169
+
170
+ if (!modelId) return setStatus('Please enter a Model ID.', 'error');
171
+ if (!text) return setStatus('Please enter text to tokenize.', 'error');
172
+
173
+ els.btn.disabled = true;
174
+ els.btn.innerHTML = '<div class="spinner"></div> <span class="ml-2">Loading...</span>';
175
+ els.output.innerHTML = '';
176
+ setStatus('Fetching tokenizer configuration...');
177
+
178
+ try {
179
+ if (!tokenizerCache || tokenizerCache.model_id !== modelId) {
180
+ setStatus('Downloading tokenizer (cached for session)...');
181
+ tokenizerCache = await AutoTokenizer.from_pretrained(modelId);
182
+ tokenizerCache.model_id = modelId;
183
+ setStatus('Tokenizer loaded successfully.', 'success');
184
+ }
185
+
186
+ els.btn.innerHTML = '<div class="spinner"></div> <span class="ml-2">Tokenizing...</span>';
187
+
188
+ // Use backend tokenizer for precise offset mapping
189
+ const backend = tokenizerCache.backend_tokenizer || tokenizerCache.tokenizer;
190
+ const result = backend.encode(text);
191
+
192
+ renderTokens(result.tokens, result.ids, result.offsets, text);
193
+ setStatus('Done.', 'success');
194
+ } catch (err) {
195
+ console.error(err);
196
+ setStatus(`Error: ${err.message || 'Failed to load tokenizer.'}`, 'error');
197
+ els.output.innerHTML = `
198
+ <div class="w-full text-center py-12">
199
+ <div class="text-red-400 font-medium mb-2">Failed to Initialize</div>
200
+ <div class="text-slate-500 text-sm max-w-md mx-auto">
201
+ Ensure the Model ID is correct and the repository contains <code>tokenizer.json</code>.
202
+ Some gated models require a HF Token. Try <code>gpt2</code> as a fallback.
203
+ </div>
204
+ </div>`;
205
+ } finally {
206
+ els.btn.disabled = false;
207
+ els.btn.innerHTML = '<span>Tokenize</span>';
208
+ }
209
+ });
210
+
211
+ function renderTokens(tokens, ids, offsets, originalText) {
212
+ els.output.innerHTML = '';
213
+ const count = ids.length;
214
+ els.statTokens.textContent = count;
215
+ els.statChars.textContent = originalText.length;
216
+ els.statRatio.textContent = originalText.length > 0 ? (count / originalText.length).toFixed(2) : '0.00';
217
+
218
+ tokens.forEach((tok, i) => {
219
+ const [start, end] = offsets[i];
220
+ const chunk = originalText.slice(start, end);
221
+ const color = getColor(i);
222
+ const id = ids[i];
223
+
224
+ const chip = document.createElement('div');
225
+ chip.className = 'token-chip rounded-md px-2 py-1 text-sm font-mono border select-none fade-in';
226
+ chip.style.animationDelay = `${i * 0.015}s`;
227
+ chip.style.backgroundColor = `${color}15`;
228
+ chip.style.borderColor = `${color}80`;
229
+ chip.style.color = color;
230
+ chip.dataset.tooltip = `ID: ${id} | [${start}, ${end}) | "${chunk.replace(/"/g, '\\"')}"`;
231
+
232
+ // Handle invisible characters (like spaces, newlines) visually
233
+ const displayText = chunk === '' ? '␣' : chunk === '\n' ? '↵' : chunk;
234
+ chip.innerHTML = `<span class="font-medium">${displayText}</span><span class="opacity-50 text-[10px] ml-1 align-top">${id}</span>`;
235
+
236
+ els.output.appendChild(chip);
237
+ });
238
+ }
239
+
240
+ // Quick demo on load
241
+ window.addEventListener('load', () => {
242
+ if (!els.inputText.value) {
243
+ els.inputText.value = "Tokenization converts text into numerical IDs. DeepSeek-V4 uses advanced BPE!";
244
+ }
245
+ });
246
+ </script>
247
+ </body>
248
  </html>
249
+