import { env, AutoTokenizer, RawImage, Tensor } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
import { getModelJSON, getModelFile } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/src/utils/hub.js";
import * as ort from "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.20.0/dist/ort.webgpu.mjs";
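
// Browser demo of Qwen2-VL-2B image-text-to-text. The model is exported as
// five ONNX subgraphs (A: vision encoder, B: token embedding, C: initial
// position ids, D: vision/text merge, E: decoder with KV cache) and run with
// onnxruntime-web, mostly on WebGPU.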

const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";
const INPUT_IMAGE_SIZE = [960, 960];
const HEIGHT_FACTOR = 10;
const WIDTH_FACTOR = 10;
const IMAGE_EMBED_SIZE = WIDTH_FACTOR * HEIGHT_FACTOR;
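// (HEIGHT_FACTOR x WIDTH_FACTOR gives a fixed 10x10 grid, i.e. 100 image
// tokens per image; inputs are resized to 960x960 regardless of original size.)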
const MAX_SEQ_LENGTH = 1024;
const BASE_MODEL = "Qwen/Qwen2-VL-2B-Instruct";
const ONNX_MODEL = "pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16";
const QUANT = "q4f16";
const MAX_SINGLE_CHAT_LENGTH = 10; // max new tokens generated per reply

// UI Elements
const exampleButton = document.getElementById('example');
const promptInput = document.querySelector('input[type="text"]');
const status = document.getElementById('status');
const imageContainer = document.getElementById('container');
const uploadInput = document.getElementById('upload');

let ortSessionA, ortSessionB, ortSessionC, ortSessionD, ortSessionE;
let config;
let currentImage = '';
let currentQuery = '';

async function initializeSessions() {
  status.textContent = 'Loading model...';
  
  ortSessionA = await ort.InferenceSession.create(
    await getModelFile(ONNX_MODEL, `onnx/QwenVL_A_${QUANT}.onnx`),
    { executionProviders: ["webgpu"] }
  );

  ortSessionB = await ort.InferenceSession.create(
    await getModelFile(ONNX_MODEL, `onnx/QwenVL_B_${QUANT}.onnx`),
    { executionProviders: ["webgpu"] }
  );

  ortSessionC = await ort.InferenceSession.create(
    await getModelFile(ONNX_MODEL, `onnx/QwenVL_C_${QUANT}.onnx`),
    { executionProviders: ["webgpu"] }
  );
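
  // Sessions D (vision/text merge) and E (decoder) are larger and are
  // created on demand inside imageTextToText().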

  config = await getModelJSON(BASE_MODEL, "config.json");

  status.textContent = 'Ready';
}

export function int64ToFloat16(int64Value) {
  // Convert BigInt to Number (float64)
  const float64Value = Number(int64Value);

  // Handle special cases
  if (Number.isNaN(float64Value)) return 0x7e00; // NaN
  if (!isFinite(float64Value)) return float64Value > 0 ? 0x7c00 : 0xfc00; // +/- infinity
  if (float64Value === 0) return 0; // Zero is represented as 0

  // Get sign, exponent, and mantissa from float64
  const sign = float64Value < 0 ? 1 : 0;
  const absValue = Math.abs(float64Value);
  const exponent = Math.floor(Math.log2(absValue));
  const mantissa = absValue / Math.pow(2, exponent) - 1;

  // Convert exponent and mantissa to float16 format
  const float16Exponent = exponent + 15; // Offset exponent by 15 (float16 bias)
  const float16Mantissa = Math.round(mantissa * 1024); // 10-bit mantissa for float16

  // Handle overflow/underflow
  if (float16Exponent <= 0) {
    // Subnormal numbers (exponent <= 0)
    return (sign << 15) | (float16Mantissa >> 1);
  } else if (float16Exponent >= 31) {
    // Overflow, set to infinity
    return (sign << 15) | 0x7c00;
  } else {
    // Normalized numbers
    return (sign << 15) | (float16Exponent << 10) | (float16Mantissa & 0x3ff);
  }
}

export function float16ToInt64(float16Value) {
  // Extract components from float16
  const sign = (float16Value & 0x8000) >> 15;
  const exponent = (float16Value & 0x7c00) >> 10;
  const mantissa = float16Value & 0x03ff;

  // Handle special cases
  if (exponent === 0 && mantissa === 0) return BigInt(0); // Zero
  if (exponent === 0x1f) {
    // BigInt("Infinity") would throw a SyntaxError; fail with a clear error instead.
    throw new RangeError("Cannot represent float16 Infinity/NaN as int64");
  }

  // Convert back to number
  let value;
  if (exponent === 0) {
    // Subnormal numbers
    value = Math.pow(2, -14) * (mantissa / 1024);
  } else {
    // Normalized numbers
    value = Math.pow(2, exponent - 15) * (1 + mantissa / 1024);
  }

  // Apply sign
  value = sign ? -value : value;

  return BigInt(Math.round(value));
}
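
// Round-trip sanity check for the two helpers above:
//   int64ToFloat16(1n)     // => 0x3c00 (1.0 in IEEE 754 half precision)
//   float16ToInt64(0x3c00) // => 1n
// Integers up to 2048 survive the round trip exactly; larger values lose
// precision in the 10-bit mantissa.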

async function handleQuery(imageUrl, query) {
  if (!imageUrl || !query.trim()) {
    status.textContent = 'Please provide both an image and a prompt';
    return;
  }
  
  try {
    status.textContent = 'Analyzing...';

    await updatePreview(imageUrl);
    
    const result = await imageTextToText(imageUrl, query);
    status.textContent = result;
  } catch (err) {
    status.textContent = 'Error processing request';
    console.error(err);
  }
}


export async function imageTextToText(
  imagePath,
  query,
  vision = true
) {

  // Number of prompt tokens that precede the image embeddings (fixed at 5
  // for the chat template used below).
  const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);

  let position_ids;
  let num_decode = 0;
  let history_len = new Tensor("int64", new BigInt64Array([0n]), [1]);

  const pos_factor_v = BigInt(1 - IMAGE_EMBED_SIZE + WIDTH_FACTOR);

  // Preallocated fp16 KV cache (the Uint16Array holds raw half-precision
  // bits), shaped [layers, kv_heads, MAX_SEQ_LENGTH, head_dim].
  let past_key_states = new ort.Tensor(
    "float16",
    new Uint16Array(
      config.num_hidden_layers *
        config.num_key_value_heads *
        MAX_SEQ_LENGTH *
        (config.hidden_size / config.num_attention_heads)
    ).fill(0),
    [
      config.num_hidden_layers,
      config.num_key_value_heads,
      MAX_SEQ_LENGTH,
      config.hidden_size / config.num_attention_heads,
    ]
  );

  // Keys and values start from the same zero-filled buffer; session E
  // returns fresh tensors for both on every step.
  let past_value_states = past_key_states;

  // 0xfbff is -65504 in fp16 (the most negative finite half), used as an
  // additive attention mask during prefill.
  let attention_mask = new ort.Tensor(
    "float16",
    new Uint16Array([0xfbff]),
    [1]
  );

  let pos_factor = new Tensor("float16", new Uint16Array([0]), [1]);
  
  const tokenizer = await AutoTokenizer.from_pretrained(BASE_MODEL);
  const prompt = `\n<|im_start|>user\n<|vision_start|><|vision_end|>${query}<|im_end|>\n<|im_start|>assistant\n`;
  const token = await tokenizer(prompt, {
    return_tensors: "pt",
    add_generation_prompt: false,
    tokenize: true,
  }).input_ids;

  const seq_length = token.dims[1];
  let ids_len = new Tensor("int64", new BigInt64Array([BigInt(seq_length)]), [
    1,
  ]);

  let input_ids = new ort.Tensor(
    "int32",
    new Int32Array(MAX_SEQ_LENGTH).fill(0),
    [MAX_SEQ_LENGTH]
  );

  input_ids.data.set(Array.from(token.data.slice(0, seq_length), Number));

  const dummy = new ort.Tensor("int32", new Int32Array([0]), []);

  // Session B: embed the token ids into hidden states.
  let { hidden_states } = await ortSessionB.run({
    input_ids: input_ids,
    ids_len: ids_len,
  });

  // Session C: compute the initial position ids (takes only a dummy input).
  ({ position_ids } = await ortSessionC.run({
    dummy: dummy,
  }));

  // Process image
  if (vision) {
    let image = await RawImage.fromURL(imagePath);

    image = await image.resize(INPUT_IMAGE_SIZE[0], INPUT_IMAGE_SIZE[1]);

    image = image.rgb();

    image = image.toTensor("CHW");
    image = image.to("float32");
    image = image.div_(255.0);
    const pixel_values = image.unsqueeze(0);

    // Session A: the vision encoder maps pixel values to image embeddings.
    const { image_embed } = await ortSessionA.run({
      pixel_values: pixel_values,
    });

    ids_len = ids_len.add(BigInt(IMAGE_EMBED_SIZE));

    const split_factor = new Tensor(
      "int32",
      new Int32Array([
        MAX_SEQ_LENGTH - Number(ids_len.item()) - IMAGE_EMBED_SIZE,
      ]),
      [1]
    );

    const ids_len_minus = new Tensor(
      "int32",
      new Int32Array([Number(ids_len.item()) - Number(prompt_head_len.item())]),
      [1]
    );

    // Free the vision encoder before loading the merge graph.
    await ortSessionA.release();
    ortSessionA = null;

    // Session D: splices the image embeddings into the text hidden states
    // and recomputes the position ids accordingly.
    ortSessionD = await ort.InferenceSession.create(
      await getModelFile(ONNX_MODEL, `onnx/QwenVL_D_${QUANT}.onnx`),
      {
        executionProviders: ["webgpu"],
      }
    );

    ({ hidden_states, position_ids } = await ortSessionD.run({
      "hidden_states.1": hidden_states,
      image_embed,
      ids_len,
      ids_len_minus,
      split_factor,
    }));

    await ortSessionD.release();
    ortSessionD = null;
  }

  let output = '';

  while (
    num_decode < MAX_SINGLE_CHAT_LENGTH &&
    Number(history_len.data[0]) < MAX_SEQ_LENGTH
  ) {
    let token_id;

    if (!ortSessionE) {
      // Session E: the autoregressive decoder with KV cache (runs on wasm).
      ortSessionE = await ort.InferenceSession.create(
        await getModelFile(ONNX_MODEL, `onnx/QwenVL_E_${QUANT}.onnx`),
        {
          executionProviders: ["wasm"],
        },
      );
    }

    ({
      max_logit_ids: token_id,
      past_key_states: past_key_states,
      past_value_states: past_value_states,
    } = await ortSessionE.run({
      hidden_states,
      attention_mask,
      "past_key_states.1": past_key_states,
      "past_value_states.1": past_value_states,
      history_len,
      ids_len,
      position_ids,
      pos_factor,
    }));

    // token_id is a tensor, so compare its value, not the object.
    // Stop on <|endoftext|> (151643) or <|im_end|> (151645).
    const id = Number(token_id.data[0]);
    if (id === 151643 || id === 151645) {
      break;
    }

    num_decode++;
    // First step after prefill: fold the whole prompt into history_len,
    // then switch to one-token increments.
    if (num_decode < 2) {
      history_len = history_len.add(BigInt(ids_len.data[0]));

      ids_len = new ort.Tensor("int64", new BigInt64Array([1n]), [1]);

      attention_mask = new ort.Tensor("float16", new Uint16Array([0]), [1]);

      if (vision) {
        pos_factor = new Tensor(
          "float16",
          new Uint16Array([int64ToFloat16(pos_factor_v + ids_len.data[0])]),
          [1]
        );
      } else {
        pos_factor = new Tensor(
          "float16",
          new Uint16Array([int64ToFloat16(history_len.data[0] + BigInt(1))]),
          [1]
        );
      }

    } else {
      history_len = history_len.add(BigInt(1));
      pos_factor = pos_factor.map((v) =>
        int64ToFloat16(float16ToInt64(v) + BigInt(1))
      );
    }
    // Feed the sampled token back in for the next step.
    input_ids.data[0] = id;

    const result_B = await ortSessionB.run({
      input_ids: input_ids,
      ids_len: ids_len,
    });
    hidden_states = result_B.hidden_states;

    if (!Number.isInteger(id)) {
      throw new Error(`Token ID is not an integer`);
    }
    output += tokenizer.decode([id]);
  }

  return output;
}
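
// Example usage (mirrors handleQuery above; EXAMPLE_URL is defined at the top):
//   const answer = await imageTextToText(EXAMPLE_URL, "Describe this image.");
//   console.log(answer);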

async function updatePreview(url) {
  const image = await RawImage.fromURL(url);
  const ar = image.width / image.height;
  const [cw, ch] = (ar > 1) ? [640, 640 / ar] : [640 * ar, 640];
  imageContainer.style.width = `${cw}px`;
  imageContainer.style.height = `${ch}px`;
  imageContainer.style.backgroundImage = `url(${url})`;
}

await initializeSessions();

// UI Event Handlers
exampleButton.addEventListener('click', (e) => {
  e.preventDefault();
  e.stopPropagation();
  currentImage = EXAMPLE_URL;
  status.textContent = promptInput.value.trim() ? 'Press Enter to analyze' : 'Add a prompt and press Enter';
});

uploadInput.addEventListener('change', (e) => {
  const file = e.target.files[0];
  if (!file) return;

  const reader = new FileReader();
  reader.onload = (e2) => {
    currentImage = e2.target.result;
    status.textContent = promptInput.value.trim() ? 'Press Enter to analyze' : 'Add a prompt and press Enter';
  };
  reader.readAsDataURL(file);
});

promptInput.addEventListener('keydown', (e) => {
  currentQuery = e.target.value;
  if (e.key === 'Enter') {
    if (!currentImage) {
      status.textContent = 'Please select an image first';
      return;
    }
    handleQuery(currentImage, currentQuery);
  }
});

promptInput.addEventListener('input', () => {
  if (currentImage && !promptInput.value.trim()) {
    status.textContent = 'Add a prompt and press Enter';
  } else if (currentImage && promptInput.value.trim()) {
    status.textContent = 'Press Enter to analyze';
  }
});