File size: 10,760 Bytes
4c4928f 94a91a9 b2ca260 50f59bb 37f2943 4c42c50 37f2943 94a91a9 b2ca260 15ad92b 5ef5062 b2ca260 4492c02 0c484c0 350ff37 b2ca260 44a6e1e 0629524 e6643db 6eb05ef 94a91a9 1477d49 b2ca260 1477d49 94a91a9 1477d49 94a91a9 1477d49 b2ca260 0629524 b2ca260 fcbf7ba b2ca260 f9c460b fcbf7ba b2ca260 fcbf7ba b2ca260 5ef5062 31a2c22 5ef5062 94a91a9 5ef5062 abaea80 37f2943 abaea80 37f2943 abaea80 37f2943 abaea80 37f2943 abaea80 37f2943 abaea80 9a35308 37f2943 abaea80 37f2943 abaea80 41c8086 abaea80 37f2943 abaea80 37f2943 abaea80 fde6d8b abaea80 37f2943 abaea80 37f2943 abaea80 1477d49 abaea80 37f2943 abaea80 1477d49 abaea80 37f2943 abaea80 1477d49 37f2943 1477d49 37f2943 abaea80 37f2943 abaea80 37f2943 abaea80 37f2943 abaea80 37f2943 abaea80 37f2943 abaea80 37f2943 abaea80 37f2943 abaea80 8fa488a abaea80 37f2943 1477d49 37f2943 76ac858 4492c02 42c6033 76ac858 fcbf7ba e6643db fcbf7ba e6643db fcbf7ba e6643db fcbf7ba 7563c18 ab198d8 fcbf7ba 2cfb880 d1563c4 e6643db fcbf7ba 5ef5062 2012cbe 350ff37 5ef5062 350ff37 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 |
import { env, AutoTokenizer, RawImage, Tensor } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
import { getModelJSON, getModelFile } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/src/utils/hub.js";
import * as ort from "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.20.0/dist/ort.webgpu.mjs";
// ---- Model identifiers ----
// Repository that provides config.json and the tokenizer.
const BASE_MODEL = "Qwen/Qwen2-VL-2B-Instruct";
// Repository that hosts the ONNX graph files (onnx/QwenVL_<part>_<QUANT>.onnx).
const ONNX_MODEL = "pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16";
// Quantization suffix embedded in each ONNX file name.
const QUANT = "q4f16";

// ---- Image handling ----
// Image offered by the "example" button.
const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";
// Every image is resized to this size before it is fed to the vision session.
const INPUT_IMAGE_SIZE = [960, 960];
const WIDTH_FACTOR = 10;
const HEIGHT_FACTOR = 10;
// Amount added to ids_len once the image embedding has been computed.
const IMAGE_EMBED_SIZE = WIDTH_FACTOR * HEIGHT_FACTOR;

// ---- Generation limits ----
// Length of the fixed-size input_ids buffer and of the key/value cache axis.
const MAX_SEQ_LENGTH = 1024;
// Upper bound on the number of decode iterations per query.
const MAX_SINGLE_CHAT_LENGTH = 10;
// UI Elements
// Button whose click handler selects EXAMPLE_URL as the current image.
const exampleButton = document.getElementById('example');
// Text input for the prompt (the first input[type="text"] in the document).
const promptInput = document.querySelector('input[type="text"]');
// Element whose textContent shows progress messages, errors and the final answer.
const status = document.getElementById('status');
// NOTE(review): not referenced anywhere else in the visible code — confirm whether it is still needed.
const imageContainer = document.getElementById('container');
// Same element lookup as `exampleButton` above (both use id 'example').
const example = document.getElementById('example');
// Preview element; updatePreview() sizes it and sets its background image.
const thumb = document.getElementById('thumb');
// File input; its 'change' handler reads the chosen file as a data URL.
const uploadInput = document.getElementById('upload');
// Form whose 'submit' handler starts a query.
const form = document.getElementById('form');
// ONNX Runtime inference sessions for model parts A-E. A, B and C are created
// by initializeSessions(); D and E are created on demand inside imageTextToText().
let ortSessionA, ortSessionB, ortSessionC, ortSessionD, ortSessionE;
// Parsed config.json of BASE_MODEL, assigned by initializeSessions().
let config;
// URL (remote or data URL) of the image to analyze; empty string until one is chosen.
let currentImage = '';
// Prompt text checked by the form submit handler before a query is run.
let currentQuery = '';
/**
 * Creates the WebGPU inference sessions for model parts A, B and C (in that
 * order), then loads the base model's config.json into `config`.
 * The status element shows 'Loading model...' while this runs and 'Ready'
 * once everything has been loaded.
 *
 * @returns {Promise<void>}
 */
async function initializeSessions() {
  status.textContent = 'Loading model...';

  // Fetches one model part and builds a WebGPU-backed session for it.
  const openSession = async (part) => {
    const modelFile = await getModelFile(ONNX_MODEL, `onnx/QwenVL_${part}_${QUANT}.onnx`);
    return ort.InferenceSession.create(modelFile, { executionProviders: ["webgpu"] });
  };

  // Loaded one after another, exactly as before (no parallel downloads).
  ortSessionA = await openSession("A");
  ortSessionB = await openSession("B");
  ortSessionC = await openSession("C");

  const modelConfig = await getModelJSON(BASE_MODEL, "config.json");
  config = modelConfig;

  status.textContent = 'Ready';
}
/**
 * Encodes a numeric value as the 16-bit pattern of an IEEE 754 half-precision
 * (float16) number.
 *
 * The input is first converted with `Number()`, so both BigInt and Number
 * arguments are accepted. Values whose magnitude is too large for float16
 * (including values that round up past 65504) become +/- infinity; values
 * below the smallest normal float16 are encoded as subnormals.
 *
 * @param {bigint|number} int64Value - Value to encode.
 * @returns {number} Unsigned 16-bit integer holding the float16 bit pattern.
 */
export function int64ToFloat16(int64Value) {
  // Convert BigInt to Number (float64)
  const float64Value = Number(int64Value);

  // Handle special cases
  if (!Number.isFinite(float64Value)) return float64Value > 0 ? 0x7c00 : 0xfc00; // +/- infinity
  if (float64Value === 0) return 0; // Zero is represented as 0

  const signBits = (float64Value < 0 ? 1 : 0) << 15;
  const absValue = Math.abs(float64Value);

  // floor(log2(x)) can land one off right next to a power of two; correct it
  // so that 1 <= absValue / 2^exponent < 2 always holds.
  let exponent = Math.floor(Math.log2(absValue));
  if (absValue < Math.pow(2, exponent)) {
    exponent -= 1;
  } else if (absValue >= Math.pow(2, exponent + 1)) {
    exponent += 1;
  }

  let float16Exponent = exponent + 15; // Offset exponent by 15 (float16 bias)

  if (float16Exponent <= 0) {
    // Subnormal range: the stored value is mantissa * 2^-24. If rounding
    // yields 1024 the result is 0x0400, which is exactly the smallest normal.
    return signBits | Math.round(absValue * Math.pow(2, 24));
  }

  // 10-bit mantissa for float16
  let float16Mantissa = Math.round((absValue / Math.pow(2, exponent) - 1) * 1024);
  if (float16Mantissa === 1024) {
    // Rounding overflowed the mantissa: carry into the exponent instead of
    // silently wrapping to 0 (which would halve the encoded value).
    float16Mantissa = 0;
    float16Exponent += 1;
  }

  if (float16Exponent >= 31) {
    // Overflow, set to infinity
    return signBits | 0x7c00;
  }

  // Normalized numbers
  return signBits | (float16Exponent << 10) | float16Mantissa;
}
/**
 * Decodes an IEEE 754 half-precision (float16) bit pattern and returns the
 * value rounded to an integer as a BigInt.
 *
 * @param {number} float16Value - Unsigned 16-bit integer holding the float16 bits.
 * @returns {bigint} The decoded value, rounded with Math.round.
 * @throws {RangeError} If the bit pattern encodes +/-Infinity or NaN, which a
 *   BigInt cannot represent.
 */
export function float16ToInt64(float16Value) {
  // Extract components from float16
  const sign = (float16Value & 0x8000) >> 15;
  const exponent = (float16Value & 0x7c00) >> 10;
  const mantissa = float16Value & 0x03ff;

  // Handle special cases
  if (exponent === 0 && mantissa === 0) return 0n; // Zero (either sign)
  if (exponent === 0x1f) {
    // BigInt("Infinity") is a SyntaxError; fail with an explicit, accurate error.
    const label = mantissa === 0 ? (sign ? "-Infinity" : "Infinity") : "NaN";
    throw new RangeError(`Cannot convert float16 ${label} to BigInt`);
  }

  // Convert back to number
  let value;
  if (exponent === 0) {
    // Subnormal numbers
    value = Math.pow(2, -14) * (mantissa / 1024);
  } else {
    // Normalized numbers
    value = Math.pow(2, exponent - 15) * (1 + mantissa / 1024);
  }

  // Apply sign
  value = sign ? -value : value;
  return BigInt(Math.round(value));
}
/**
 * Runs one image + prompt query and reports progress and the result through
 * the status element.
 *
 * Shows a hint and returns early when either the image URL or the prompt is
 * missing/blank. Errors raised during inference are logged to the console and
 * replaced by a generic message in the status element; they are not rethrown.
 *
 * @param {string} imageUrl - URL (or data URL) of the image to analyze.
 * @param {string} query - Prompt text entered by the user.
 * @returns {Promise<void>}
 */
async function handleQuery(imageUrl, query) {
  // `?.` keeps a null/undefined prompt on the "missing input" path instead of throwing.
  if (!imageUrl || !query?.trim()) {
    status.textContent = 'Please provide both an image and a prompt';
    return;
  }
  console.log('Analyzing...');
  try {
    status.textContent = 'Analyzing...';
    const result = await imageTextToText(imageUrl, query);
    status.textContent = result;
  } catch (err) {
    status.textContent = 'Error processing request';
    console.error(err);
  }
}
/**
 * Answers a text prompt about an image using the split ONNX sessions
 * (A: image embedding, B: token embedding, C: initial position ids,
 * D: merge of image and text states, E: decoder step).
 *
 * Flow: tokenize the prompt, embed it with session B, fetch position ids from
 * session C, optionally embed the image with session A and merge it via
 * session D, then run session E in a loop, feeding each produced token back
 * through session B, until an end-of-sequence token appears or a length limit
 * is reached.
 *
 * Relies on module state set up by initializeSessions() (`config`,
 * `ortSessionB`, `ortSessionC`). Session A is released after use and
 * re-created on demand, so the function can be called repeatedly.
 *
 * @param {string} imagePath - URL (or data URL) of the image to load.
 * @param {string} query - User prompt inserted into the chat template.
 * @param {boolean} [vision=true] - Whether to run the image branch (sessions A and D).
 * @returns {Promise<string>} Concatenation of the decoded generated tokens.
 * @throws {Error} If the decoder returns a token id that is not a number/bigint.
 */
export async function imageTextToText(
  imagePath,
  query,
  vision = true
) {
  // Token ids that terminate generation.
  // NOTE(review): presumably <|endoftext|> and <|im_end|> of the Qwen2 tokenizer — confirm.
  const EOS_TOKEN_IDS = [151643, 151645];

  const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);
  let position_ids;
  let num_decode = 0;
  let history_len = new Tensor("int64", new BigInt64Array([0n]), [1]);
  const pos_factor_v = BigInt(1 - IMAGE_EMBED_SIZE + WIDTH_FACTOR);

  // Zero-initialised key cache: [layers, kv_heads, MAX_SEQ_LENGTH, head_dim].
  const head_dim = config.hidden_size / config.num_attention_heads;
  let past_key_states = new ort.Tensor(
    "float16",
    new Uint16Array(
      config.num_hidden_layers *
        config.num_key_value_heads *
        MAX_SEQ_LENGTH *
        head_dim
    ).fill(0),
    [
      config.num_hidden_layers,
      config.num_key_value_heads,
      MAX_SEQ_LENGTH,
      head_dim,
    ]
  );
  // The value cache starts out as the same all-zero tensor; both variables are
  // replaced by session E's outputs after the first decoder step.
  let past_value_states = past_key_states;

  // 0xfbff is the float16 bit pattern of -65504 (most negative finite value).
  let attention_mask = new ort.Tensor(
    "float16",
    new Uint16Array([0xfbff]),
    [1]
  );
  let pos_factor = new Tensor("float16", new Uint16Array([0]), [1]);

  const tokenizer = await AutoTokenizer.from_pretrained(BASE_MODEL);
  const prompt = `\n<|im_start|>user\n<|vision_start|><|vision_end|>${query}<|im_end|>\n<|im_start|>assistant\n`;
  const token = await tokenizer(prompt, {
    return_tensors: "pt",
    add_generation_prompt: false,
    tokenize: true,
  }).input_ids;
  const seq_length = token.dims[1];
  let ids_len = new Tensor("int64", new BigInt64Array([BigInt(seq_length)]), [
    1,
  ]);

  // Fixed-size token buffer; only the first `ids_len` entries are meaningful.
  let input_ids = new ort.Tensor(
    "int32",
    new Int32Array(MAX_SEQ_LENGTH).fill(0),
    [MAX_SEQ_LENGTH]
  );
  input_ids.data.set(Array.from(token.data.slice(0, seq_length), Number));

  const dummy = new ort.Tensor("int32", new Int32Array([0]), []);
  let { hidden_states } = await ortSessionB.run({
    input_ids: input_ids,
    ids_len: ids_len,
  });
  ({ position_ids } = await ortSessionC.run({
    dummy: dummy,
  }));

  // Process image
  if (vision) {
    let image = await RawImage.fromURL(imagePath);
    image = await image.resize(INPUT_IMAGE_SIZE[0], INPUT_IMAGE_SIZE[1]);
    image = image.rgb();
    image = image.toTensor("CHW");
    image = image.to("float32");
    image = image.div_(255.0);
    const pixel_values = image.unsqueeze(0);

    // Session A is released at the end of every vision run, so it has to be
    // re-created when this function is called again.
    if (!ortSessionA) {
      ortSessionA = await ort.InferenceSession.create(
        await getModelFile(ONNX_MODEL, `onnx/QwenVL_A_${QUANT}.onnx`),
        { executionProviders: ["webgpu"] }
      );
    }
    const { image_embed } = await ortSessionA.run({
      pixel_values: pixel_values,
    });

    ids_len = ids_len.add(BigInt(IMAGE_EMBED_SIZE));
    const split_factor = new Tensor(
      "int32",
      new Int32Array([
        MAX_SEQ_LENGTH - Number(ids_len.item()) - IMAGE_EMBED_SIZE,
      ]),
      [1]
    );
    const ids_len_minus = new Tensor(
      "int32",
      new Int32Array([Number(ids_len.item()) - Number(prompt_head_len.item())]),
      [1]
    );

    await ortSessionA.release();
    ortSessionA = null;

    ortSessionD = await ort.InferenceSession.create(
      await getModelFile(ONNX_MODEL, `onnx/QwenVL_D_${QUANT}.onnx`),
      {
        executionProviders: ["webgpu"],
      }
    );
    ({ hidden_states, position_ids } = await ortSessionD.run({
      "hidden_states.1": hidden_states,
      image_embed,
      ids_len,
      ids_len_minus,
      split_factor,
    }));
    await ortSessionD.release();
    ortSessionD = null;
  }

  let output = '';
  while (
    num_decode < MAX_SINGLE_CHAT_LENGTH &&
    Number(history_len.data[0]) < MAX_SEQ_LENGTH
  ) {
    let token_id;
    // The decoder session is created once and kept for later calls.
    if (!ortSessionE) {
      ortSessionE = await ort.InferenceSession.create(
        await getModelFile(ONNX_MODEL, `onnx/QwenVL_E_${QUANT}.onnx`),
        {
          executionProviders: ["wasm"],
        },
      );
    }
    ({
      max_logit_ids: token_id,
      past_key_states: past_key_states,
      past_value_states: past_value_states,
    } = await ortSessionE.run({
      hidden_states,
      attention_mask,
      "past_key_states.1": past_key_states,
      "past_value_states.1": past_value_states,
      history_len,
      ids_len,
      position_ids,
      pos_factor,
    }));

    // Validate the produced id before it is used anywhere.
    const raw_token_id = token_id.data[0];
    if (!["bigint", "number"].includes(typeof raw_token_id)) {
      throw new Error(`Token ID is not an integer`);
    }
    const next_token_id = Number(raw_token_id);

    // `token_id` is a tensor, so the comparison must use its scalar value.
    if (EOS_TOKEN_IDS.includes(next_token_id)) {
      break;
    }

    num_decode++;
    if (num_decode < 2) {
      // First generated token: the whole prompt moves into the history and
      // every following step processes exactly one token.
      history_len = history_len.add(BigInt(ids_len.data[0]));
      ids_len = new ort.Tensor("int64", new BigInt64Array([1n]), [1]);
      attention_mask = new ort.Tensor("float16", new Uint16Array([0]), [1]);
      if (vision) {
        pos_factor = new Tensor(
          "float16",
          new Uint16Array([int64ToFloat16(pos_factor_v + ids_len.data[0])]),
          [1]
        );
      } else {
        pos_factor = new Tensor(
          "float16",
          new Uint16Array([int64ToFloat16(history_len.data[0] + BigInt(1))]),
          [1]
        );
      }
    } else {
      history_len = history_len.add(BigInt(1));
      // pos_factor holds raw float16 bits: decode, increment, re-encode.
      pos_factor = pos_factor.map((v) =>
        int64ToFloat16(float16ToInt64(v) + BigInt(1))
      );
    }

    // Embed the new token so that it becomes the next decoder input.
    (input_ids.data)[0] = next_token_id;
    const result_B = await ortSessionB.run({
      input_ids: input_ids,
      ids_len: ids_len,
    });
    hidden_states = result_B.hidden_states;

    const decoded = tokenizer.decode([...token_id.data]);
    console.log({decoded});
    output += decoded;
  }

  return output;
}
/**
 * Shows the image at `url` in the thumbnail element.
 * The thumbnail is sized so that its longer side is 640px while the image's
 * aspect ratio is preserved, the image becomes its CSS background, and any
 * existing child markup is cleared.
 *
 * @param {string} url - Image URL or data URL.
 * @returns {Promise<void>}
 */
async function updatePreview(url) {
  const { width, height } = await RawImage.fromURL(url);
  const aspect = width / height;

  // Default to the portrait/square layout: height pinned to 640px.
  let boxWidth = 640 * aspect;
  let boxHeight = 640;
  if (aspect > 1) {
    // Landscape: width pinned to 640px instead.
    boxWidth = 640;
    boxHeight = 640 / aspect;
  }

  thumb.style.width = `${boxWidth}px`;
  thumb.style.height = `${boxHeight}px`;
  thumb.style.backgroundImage = `url(${url})`;
  thumb.innerHTML = '';
}
// Load the model sessions before any handler can trigger a query.
await initializeSessions();

// UI Event Handlers

// Example button: select the bundled example image and show it in the preview.
exampleButton.addEventListener('click', (e) => {
  e.preventDefault();
  e.stopPropagation();
  currentImage = EXAMPLE_URL;
  // Preview failures are logged only; they must not block running a query.
  updatePreview(currentImage).catch((err) => console.error(err));
});

// File input: read the chosen file as a data URL and use it as the current image.
uploadInput.addEventListener('change', (e) => {
  const file = e.target.files[0];
  if (!file) return;
  console.log('file change');
  const reader = new FileReader();
  reader.onload = (e2) => {
    currentImage = e2.target.result;
    updatePreview(currentImage).catch((err) => console.error(err));
  };
  reader.readAsDataURL(file);
});

// Form submit: take the prompt from the text input and run the query.
form.addEventListener('submit', (e) => {
  e.preventDefault();
  e.stopPropagation();
  // Without this assignment currentQuery stays '' and no query can ever run.
  currentQuery = promptInput.value.trim();
  if (!currentImage || !currentQuery) {
    status.textContent = 'Please select an image and type a prompt';
  } else {
    // handleQuery reports its own errors through the status element.
    handleQuery(currentImage, currentQuery);
  }
});