PeterPinetree committed on
Commit
afe3de1
·
verified ·
1 Parent(s): 5512a5e

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +44 -24
index.html CHANGED
@@ -205,6 +205,7 @@
205
  const MODELS = {
206
  qwen: {
207
  id: "qwen", // this is the folder name under assets/models/
 
208
  onnx_file: "onnx/model_q4f16.onnx",
209
  emb_coords: "assets/embeddings/qwen_pca_top5k_coords.json",
210
  emb_nbrs: "assets/embeddings/qwen_neighbors_top5k_k40.json",
@@ -274,28 +275,52 @@
274
  return { setSources, load, drawBase, highlight };
275
  })();
276
 
277
- // --- Local Qwen tokenizer loader (no Hub, no path rewrite) ---
278
  async function loadTokenizerQwenLocal() {
279
  resetProgress("Tokenizer");
280
  setStatus("Tokenizer: starting…");
281
 
282
- const baseHref = new URL(MODELS.qwen.base, window.location.href).href;
 
 
 
 
283
 
284
- const tjsonURL = new URL("tokenizer.json", baseHref).href;
285
- const tcfgURL = new URL("tokenizer_config.json", baseHref).href;
286
- const smapURL = new URL("special_tokens_map.json", baseHref).href; // optional
287
 
288
- const [tokJSON, tokCfgJSON, smapJSON] = await Promise.all([
289
- fetch(tjsonURL).then(r => { if (!r.ok) throw new Error("missing tokenizer.json"); return r.json(); }),
290
- fetch(tcfgURL).then(r => { if (!r.ok) throw new Error("missing tokenizer_config.json"); return r.json(); }),
291
- fetch(smapURL).then(r => r.ok ? r.json() : null),
292
- ]);
293
 
294
  const files = new Map();
295
- files.set("tokenizer.json", new Blob([JSON.stringify(tokJSON)], { type: "application/json" }));
296
- files.set("tokenizer_config.json", new Blob([JSON.stringify(tokCfgJSON)], { type: "application/json" }));
297
- if (smapJSON) {
298
- files.set("special_tokens_map.json", new Blob([JSON.stringify(smapJSON)], { type: "application/json" }));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  }
300
 
301
  tokenizer = await AutoTokenizer.from_pretrained("", {
@@ -304,10 +329,7 @@
304
  progress_callback: onProgress,
305
  });
306
  }
307
-
308
- // --- Core model state ---
309
- let tokenizer = null, model = null;
310
- let loadSeq = 0;
311
 
312
  // Core model state (module scope)
313
  let tokenizer = null, model = null;
@@ -325,13 +347,10 @@
325
  setStatus("Tokenizer: starting…");
326
  try {
327
  if (key === "qwen") {
328
- env.allowRemoteModels = false; // force local for Qwen
329
- tokenizer = await AutoTokenizer.from_pretrained(MODELS.qwen.id, {
330
- progress_callback: onProgress,
331
- // local load uses env.localModelPath + "qwen"/files
332
- });
333
  } else {
334
- env.allowRemoteModels = true; // hub is fine for distilgpt2
335
  tokenizer = await AutoTokenizer.from_pretrained(MODELS.distilgpt2.repo, {
336
  progress_callback: onProgress,
337
  });
@@ -341,6 +360,7 @@
341
  setErr("Tokenizer failed to load.");
342
  return;
343
  }
 
344
  if (mySeq !== loadSeq) return;
345
 
346
  // --- Model ---
 
205
  const MODELS = {
206
  qwen: {
207
  id: "qwen", // this is the folder name under assets/models/
208
+ base: new URL("assets/models/qwen/", window.location.href).href,
209
  onnx_file: "onnx/model_q4f16.onnx",
210
  emb_coords: "assets/embeddings/qwen_pca_top5k_coords.json",
211
  emb_nbrs: "assets/embeddings/qwen_neighbors_top5k_k40.json",
 
275
  return { setSources, load, drawBase, highlight };
276
  })();
277
 
278
+ // --- Local Qwen tokenizer loader (supports both JSON and BPE files) ---
279
  async function loadTokenizerQwenLocal() {
280
  resetProgress("Tokenizer");
281
  setStatus("Tokenizer: starting…");
282
 
283
+ const base = MODELS.qwen.base; // e.g. .../assets/models/qwen/
284
+ // New-format filenames
285
+ const tjson = new URL("tokenizer.json", base).href;
286
+ const tcfg = new URL("tokenizer_config.json", base).href;
287
+ const smap = new URL("special_tokens_map.json", base).href;
288
 
289
+ // Classic BPE filenames (present in many ONNX repos, incl. Qwen)
290
+ const vocab = new URL("vocab.json", base).href;
291
+ const merges = new URL("merges.txt", base).href;
292
 
293
+ // Try new-format first; if missing, fall back to vocab/merges
294
+ const tjsonHead = await fetch(tjson, { method: "HEAD" });
295
+ const useNewFormat = tjsonHead.ok;
 
 
296
 
297
  const files = new Map();
298
+
299
+ if (useNewFormat) {
300
+ const [tokJSON, tokCfgJSON, smapJSON] = await Promise.all([
301
+ fetch(tjson).then(r => { if (!r.ok) throw new Error("missing tokenizer.json"); return r.json(); }),
302
+ fetch(tcfg).then(r => { if (!r.ok) throw new Error("missing tokenizer_config.json"); return r.json(); }),
303
+ fetch(smap).then(r => r.ok ? r.json() : null),
304
+ ]);
305
+ files.set("tokenizer.json", new Blob([JSON.stringify(tokJSON)], { type: "application/json" }));
306
+ files.set("tokenizer_config.json", new Blob([JSON.stringify(tokCfgJSON)],{ type: "application/json" }));
307
+ if (smapJSON) files.set("special_tokens_map.json", new Blob([JSON.stringify(smapJSON)], { type: "application/json" }));
308
+ } else {
309
+ // Fall back to BPE pair; this avoids the "e.split is not a function" crash
310
+ const [vocabText, mergesText, tokCfgJSON, smapJSON] = await Promise.all([
311
+ fetch(vocab).then(r => { if (!r.ok) throw new Error("missing vocab.json"); return r.text(); }),
312
+ fetch(merges).then(r => { if (!r.ok) throw new Error("missing merges.txt"); return r.text(); }),
313
+ fetch(tcfg).then(r => r.ok ? r.json() : { model_max_length: 32768 }),
314
+ fetch(smap).then(r => r.ok ? r.json() : null),
315
+ ]);
316
+ files.set("vocab.json", new Blob([vocabText], { type: "application/json" }));
317
+ files.set("merges.txt", new Blob([mergesText], { type: "text/plain" }));
318
+ files.set("tokenizer_config.json",
319
+ new Blob([JSON.stringify(tokCfgJSON)], { type: "application/json" })
320
+ );
321
+ if (smapJSON) files.set("special_tokens_map.json",
322
+ new Blob([JSON.stringify(smapJSON)], { type: "application/json" })
323
+ );
324
  }
325
 
326
  tokenizer = await AutoTokenizer.from_pretrained("", {
 
329
  progress_callback: onProgress,
330
  });
331
  }
332
+
 
 
 
333
 
334
  // Core model state (module scope)
335
  let tokenizer = null, model = null;
 
347
  setStatus("Tokenizer: starting…");
348
  try {
349
  if (key === "qwen") {
350
+ env.allowRemoteModels = false; // force local for Qwen
351
+ await loadTokenizerQwenLocal(); // <-- use the local loader
 
 
 
352
  } else {
353
+ env.allowRemoteModels = true; // Hub is fine for distilgpt2
354
  tokenizer = await AutoTokenizer.from_pretrained(MODELS.distilgpt2.repo, {
355
  progress_callback: onProgress,
356
  });
 
360
  setErr("Tokenizer failed to load.");
361
  return;
362
  }
363
+
364
  if (mySeq !== loadSeq) return;
365
 
366
  // --- Model ---