import { AutoTokenizer, RawImage, Tensor } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';
import { getModelJSON, getModelFile } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/src/utils/hub.js";
import * as ort from "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.20.0/dist/ort.webgpu.mjs";
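// Pipeline constants: input images are resized to 960x960 and encoded into a
// HEIGHT_FACTOR x WIDTH_FACTOR (10 x 10 = 100 token) grid of image embeddings.
// MAX_SINGLE_CHAT_LENGTH caps the number of decode steps per query.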
const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";
const INPUT_IMAGE_SIZE = [960, 960];
const HEIGHT_FACTOR = 10;
const WIDTH_FACTOR = 10;
const IMAGE_EMBED_SIZE = WIDTH_FACTOR * HEIGHT_FACTOR;
const MAX_SEQ_LENGTH = 1024;
const BASE_MODEL = "Qwen/Qwen2-VL-2B-Instruct";
const ONNX_MODEL = "pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16";
const QUANT = "q4f16";
const MAX_SINGLE_CHAT_LENGTH = 10;
// UI Elements
const exampleButton = document.getElementById('example');
const promptInput = document.querySelector('input[type="text"]');
const status = document.getElementById('status');
const imageContainer = document.getElementById('container');
const uploadInput = document.getElementById('upload');
let ortSessionA, ortSessionB, ortSessionC, ortSessionD, ortSessionE;
let config;
let currentImage = '';
let currentQuery = '';
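// The model is split into five ONNX graphs (A-E). Judging from their inputs
// and outputs: A encodes pixels to image embeddings, B embeds token ids,
// C produces the initial position ids, D splices the image embeddings into
// the text hidden states, and E is the decoder that yields the next token
// plus updated key/value caches.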
async function initializeSessions() {
status.textContent = 'Loading model...';
ortSessionA = await ort.InferenceSession.create(
await getModelFile(ONNX_MODEL, `onnx/QwenVL_A_${QUANT}.onnx`),
{ executionProviders: ["webgpu"] }
);
ortSessionB = await ort.InferenceSession.create(
await getModelFile(ONNX_MODEL, `onnx/QwenVL_B_${QUANT}.onnx`),
{ executionProviders: ["webgpu"] }
);
ortSessionC = await ort.InferenceSession.create(
await getModelFile(ONNX_MODEL, `onnx/QwenVL_C_${QUANT}.onnx`),
{ executionProviders: ["webgpu"] }
);
config = await getModelJSON(BASE_MODEL, "config.json");
status.textContent = 'Ready';
}
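// Half-precision values are stored as raw bit patterns in Uint16Arrays
// (no native Float16Array is used here), so the helpers below convert
// between integers and float16 bit patterns by hand.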
export function int64ToFloat16(int64Value) {
// Convert BigInt to Number (float64)
const float64Value = Number(int64Value);
// Handle special cases (NaN maps to the canonical float16 NaN)
if (Number.isNaN(float64Value)) return 0x7e00; // NaN
if (!isFinite(float64Value)) return float64Value > 0 ? 0x7c00 : 0xfc00; // +/- infinity
if (float64Value === 0) return 0; // Zero is represented as 0
// Get sign, exponent, and mantissa from float64
const sign = float64Value < 0 ? 1 : 0;
const absValue = Math.abs(float64Value);
const exponent = Math.floor(Math.log2(absValue));
const mantissa = absValue / Math.pow(2, exponent) - 1;
// Convert exponent and mantissa to float16 format
const float16Exponent = exponent + 15; // Offset exponent by 15 (float16 bias)
const float16Mantissa = Math.round(mantissa * 1024); // 10-bit mantissa for float16
// Handle overflow/underflow
if (float16Exponent <= 0) {
// Subnormal numbers (exponent <= 0)
return (sign << 15) | (float16Mantissa >> 1);
} else if (float16Exponent >= 31) {
// Overflow, set to infinity
return (sign << 15) | 0x7c00;
} else {
// Normalized numbers
return (sign << 15) | (float16Exponent << 10) | (float16Mantissa & 0x3ff);
}
}
export function float16ToInt64(float16Value) {
// Extract components from float16
const sign = (float16Value & 0x8000) >> 15;
const exponent = (float16Value & 0x7c00) >> 10;
const mantissa = float16Value & 0x03ff;
// Handle special cases
if (exponent === 0 && mantissa === 0) return BigInt(0); // Zero
if (exponent === 0x1f) throw new RangeError("float16 Infinity/NaN cannot be represented as BigInt"); // Infinity/NaN
// Convert back to number
let value;
if (exponent === 0) {
// Subnormal numbers
value = Math.pow(2, -14) * (mantissa / 1024);
} else {
// Normalized numbers
value = Math.pow(2, exponent - 15) * (1 + mantissa / 1024);
}
// Apply sign
value = sign ? -value : value;
return BigInt(Math.round(value));
}
async function handleQuery(imageUrl, query) {
if (!imageUrl || !query.trim()) {
status.textContent = 'Please provide both an image and a prompt';
return;
}
try {
status.textContent = 'Analyzing...';
updatePreview(imageUrl);
const result = await imageTextToText(imageUrl, query);
status.textContent = result;
} catch (err) {
status.textContent = 'Error processing request';
console.error(err);
}
}
export async function imageTextToText(
imagePath,
query,
vision = true
) {
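// Decode-state tensors. prompt_head_len (5) appears to count the prompt
// tokens ahead of the vision slot in the chat template below; pos_factor_v
// seeds the rotary position offset that accounts for the 100 image tokens.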
const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);
let position_ids;
let num_decode = 0;
let history_len = new Tensor("int64", new BigInt64Array([0n]), [1]);
let pos_factor_v = BigInt(1 - IMAGE_EMBED_SIZE + WIDTH_FACTOR);
let past_key_states = new ort.Tensor(
"float16",
new Uint16Array(
config.num_hidden_layers *
config.num_key_value_heads *
MAX_SEQ_LENGTH *
(config.hidden_size / config.num_attention_heads)
).fill(0),
[
config.num_hidden_layers,
config.num_key_value_heads,
MAX_SEQ_LENGTH,
config.hidden_size / config.num_attention_heads,
]
);
let past_value_states = past_key_states;
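// Note: past_value_states aliases the same zero-filled buffer as
// past_key_states; session E returns fresh tensors each step, so the shared
// reference only matters for step 0.
// 0xfbff is the float16 bit pattern for -65504, the most negative finite
// half-precision value, used below as the additive "-inf" attention mask.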
let attention_mask = new ort.Tensor(
"float16",
new Uint16Array([0xfbff]),
[1]
);
let pos_factor = new Tensor("float16", new Uint16Array([0]), [1]);
const tokenizer = await AutoTokenizer.from_pretrained(BASE_MODEL);
const prompt = `\n<|im_start|>user\n<|vision_start|><|vision_end|>${query}<|im_end|>\n<|im_start|>assistant\n`;
const token = await tokenizer(prompt, {
return_tensors: "pt",
add_generation_prompt: false,
tokenize: true,
}).input_ids;
const seq_length = token.dims[1];
let ids_len = new Tensor("int64", new BigInt64Array([BigInt(seq_length)]), [
1,
]);
let input_ids = new ort.Tensor(
"int32",
new Int32Array(MAX_SEQ_LENGTH).fill(0),
[MAX_SEQ_LENGTH]
);
input_ids.data.set(Array.from(token.data.slice(0, seq_length), Number));
const dummy = new ort.Tensor("int32", new Int32Array([0]), []);
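// Session B: embed the (zero-padded) token ids into hidden states.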
let { hidden_states } = await ortSessionB.run({
input_ids: input_ids,
ids_len: ids_len,
});
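// Session C: build the initial position ids (the dummy input is a placeholder).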
({ position_ids } = await ortSessionC.run({
dummy: dummy,
}));
// Process image
if (vision) {
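// Preprocess: resize to 960x960, force RGB, CHW layout, scale to [0, 1],
// and add a batch dimension.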
let image = await RawImage.fromURL(imagePath);
image = await image.resize(INPUT_IMAGE_SIZE[0], INPUT_IMAGE_SIZE[1]);
image = image.rgb();
image = image.toTensor("CHW");
image = image.to("float32");
image = image.div_(255.0);
const pixel_values = image.unsqueeze(0);
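// Session A (vision encoder): pixel values -> image-embedding tokens.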
if (!ortSessionA) {
// Session A is released after use (below) to free GPU memory, so it must be
// recreated when a second query comes in.
ortSessionA = await ort.InferenceSession.create(
await getModelFile(ONNX_MODEL, `onnx/QwenVL_A_${QUANT}.onnx`),
{ executionProviders: ["webgpu"] }
);
}
const { image_embed } = await ortSessionA.run({
pixel_values: pixel_values,
});
ids_len = ids_len.add(BigInt(IMAGE_EMBED_SIZE));
const split_factor = new Tensor(
"int32",
new Int32Array([
MAX_SEQ_LENGTH - Number(ids_len.item()) - IMAGE_EMBED_SIZE,
]),
[1]
);
const ids_len_minus = new Tensor(
"int32",
new Int32Array([Number(ids_len.item()) - Number(prompt_head_len.item())]),
[1]
);
await ortSessionA.release();
ortSessionA = null;
ortSessionD = await ort.InferenceSession.create(
await getModelFile(ONNX_MODEL, `onnx/QwenVL_D_${QUANT}.onnx`),
{
executionProviders: ["webgpu"],
}
);
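// Session D merges the image embeddings into the text hidden states at the
// vision slot and recomputes the position ids accordingly.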
({ hidden_states, position_ids } = await ortSessionD.run({
"hidden_states.1": hidden_states,
image_embed,
ids_len,
ids_len_minus,
split_factor,
}));
await ortSessionD.release();
ortSessionD = null;
}
let output = '';
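// Autoregressive decode loop: each step runs the decoder (session E) to get
// the next token id and updated KV caches, then re-embeds that token with
// session B for the following step.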
while (
num_decode < MAX_SINGLE_CHAT_LENGTH &&
Number(history_len.data[0]) < MAX_SEQ_LENGTH
) {
let token_id;
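// Session E (decoder) is created lazily on the first decode step; it runs on
// the wasm execution provider rather than webgpu.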
if (!ortSessionE) {
ortSessionE = await ort.InferenceSession.create(
await getModelFile(ONNX_MODEL, `onnx/QwenVL_E_${QUANT}.onnx`),
{
executionProviders: ["wasm"],
},
);
}
({
max_logit_ids: token_id,
past_key_states: past_key_states,
past_value_states: past_value_states,
} = await ortSessionE.run({
hidden_states,
attention_mask,
"past_key_states.1": past_key_states,
"past_value_states.1": past_value_states,
history_len,
ids_len,
position_ids,
pos_factor,
}));
// Stop on Qwen2 end-of-sequence tokens: 151643 (<|endoftext|>) and 151645 (<|im_end|>).
if (Number(token_id.data[0]) === 151643 || Number(token_id.data[0]) === 151645) {
break;
}
num_decode++;
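// First step after prefill: fold the full prompt length into history_len and
// switch to single-token inputs; afterwards advance positions by 1 per step.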
if (num_decode < 2) {
history_len = history_len.add(BigInt(ids_len.data[0]));
ids_len = new ort.Tensor("int64", new BigInt64Array([1n]), [1]);
attention_mask = new ort.Tensor("float16", new Uint16Array([0]), [1]);
if (vision) {
pos_factor = new Tensor(
"float16",
new Uint16Array([int64ToFloat16(pos_factor_v + ids_len.data[0])]),
[1]
);
} else {
pos_factor = new Tensor(
"float16",
new Uint16Array([int64ToFloat16(history_len.data[0] + BigInt(1))]),
[1]
);
}
} else {
history_len = history_len.add(BigInt(1));
pos_factor = pos_factor.map((v) =>
int64ToFloat16(float16ToInt64(v) + BigInt(1))
);
}
(input_ids.data)[0] = Number(token_id.data[0]);
const result_B = await ortSessionB.run({
input_ids: input_ids,
ids_len: ids_len,
});
hidden_states = result_B.hidden_states;
if (typeof token_id.data[0] !== "bigint" && !Number.isInteger(token_id.data[0])) {
throw new Error(`Token ID is not an integer`);
}
const decoded = tokenizer.decode(Array.from(token_id.data, Number));
output += decoded;
}
return output;
}
async function updatePreview(url) {
const image = await RawImage.fromURL(url);
const ar = image.width / image.height;
const [cw, ch] = (ar > 1) ? [640, 640 / ar] : [640 * ar, 640];
imageContainer.style.width = `${cw}px`;
imageContainer.style.height = `${ch}px`;
imageContainer.style.backgroundImage = `url(${url})`;
}
await initializeSessions();
// UI Event Handlers
exampleButton.addEventListener('click', (e) => {
e.preventDefault();
e.stopPropagation();
currentImage = EXAMPLE_URL;
status.textContent = promptInput.value.trim() ? 'Press Enter to analyze' : 'Add a prompt and press Enter';
});
uploadInput.addEventListener('change', (e) => {
const file = e.target.files[0];
if (!file) return;
const reader = new FileReader();
reader.onload = (e2) => {
currentImage = e2.target.result;
status.textContent = promptInput.value.trim() ? 'Press Enter to analyze' : 'Add a prompt and press Enter';
};
reader.readAsDataURL(file);
});
promptInput.addEventListener('keydown', (e) => {
currentQuery = e.target.value;
if (e.key === 'Enter') {
if (!currentImage) {
status.textContent = 'Please select an image first';
return;
}
handleQuery(currentImage, currentQuery);
}
});
promptInput.addEventListener('input', () => {
if (currentImage && !promptInput.value.trim()) {
status.textContent = 'Add a prompt and press Enter';
} else if (currentImage && promptInput.value.trim()) {
status.textContent = 'Press Enter to analyze';
}
});