pdufour committed on
Commit
37f2943
1 Parent(s): 879676c

Update index.js

Files changed (1)
  1. index.js +166 -30
index.js CHANGED
@@ -1,4 +1,7 @@
-import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
+import { pipeline, env, AutoTokenizer, RawImage } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
+import { getModelJSON } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers/utils/hub.js";
+import { Tensor } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers/utils/tensor.js";
+import * as ort from "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.20.0/dist/ort.webgpu.mjs";
 
 // Since we will download the model from the Hugging Face Hub, we can skip the local model check
 env.allowLocalModels = false;
@@ -9,11 +12,18 @@ const fileUpload = document.getElementById('upload');
 const imageContainer = document.getElementById('container');
 const example = document.getElementById('example');
 
-const EXAMPLE_URL = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/city-streets.jpg';
+const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";
+const INPUT_IMAGE_SIZE = [960, 960];
+const HEIGHT_FACTOR = 10;
+const WIDTH_FACTOR = 10;
+const IMAGE_EMBED_SIZE = WIDTH_FACTOR * HEIGHT_FACTOR;
+const MAX_SEQ_LENGTH = 1024;
+const BASE_URL = "http://localhost:3004/onnx";
+const BASE_MODEL = "Qwen/Qwen2-VL-2B-Instruct";
+const QUANTIZATION = "q4f16";
+const MAX_SINGLE_CHAT_LENGTH = 10;
 
-// Create a new object detection pipeline
 status.textContent = 'Loading model...';
-const detector = await pipeline('object-detection', 'Xenova/detr-resnet-50');
 status.textContent = 'Ready';
 
 example.addEventListener('click', (e) => {
@@ -50,30 +60,156 @@ async function detect(img) {
     output.forEach(renderBox);
 }
 
-// Render a bounding box and label on the image
-function renderBox({ box, label }) {
-    const { xmax, xmin, ymax, ymin } = box;
-
-    // Generate a random color for the box
-    const color = '#' + Math.floor(Math.random() * 0xFFFFFF).toString(16).padStart(6, 0);
-
-    // Draw the box
-    const boxElement = document.createElement('div');
-    boxElement.className = 'bounding-box';
-    Object.assign(boxElement.style, {
-        borderColor: color,
-        left: 100 * xmin + '%',
-        top: 100 * ymin + '%',
-        width: 100 * (xmax - xmin) + '%',
-        height: 100 * (ymax - ymin) + '%',
-    })
-
-    // Draw label
-    const labelElement = document.createElement('span');
-    labelElement.textContent = label;
-    labelElement.className = 'bounding-box-label';
-    labelElement.style.backgroundColor = color;
-
-    boxElement.appendChild(labelElement);
-    imageContainer.appendChild(boxElement);
+
+export async function simplifiedLLMVision(
+  imagePath,
+  query,
+  vision = true
+) {
+  const suffix = QUANTIZATION ? `_${QUANTIZATION}` : "";
+
+  const config = await getModelJSON(BASE_MODEL, "config.json");
+
+  const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);
+
+  let position_ids;
+  let num_decode = 0;
+  let history_len = new Tensor("int64", new BigInt64Array([0n]), [1]);
+
+  let past_key_states = new ort.Tensor(
+    "float16",
+    new Uint16Array(
+      config.num_hidden_layers *
+        config.num_key_value_heads *
+        MAX_SEQ_LENGTH *
+        (config.hidden_size / config.num_attention_heads)
+    ).fill(0),
+    [
+      config.num_hidden_layers,
+      config.num_key_value_heads,
+      MAX_SEQ_LENGTH,
+      config.hidden_size / config.num_attention_heads,
+    ]
+  );
+
+  let past_value_states = past_key_states;
+
+  let attention_mask = new ort.Tensor(
+    "float16",
+    new Uint16Array([0xfbff]), // -65504.0 in float16
+    [1]
+  );
+
+  let pos_factor = new Tensor("float16", new Uint16Array([0]), [1]);
+
+  const tokenizer = await AutoTokenizer.from_pretrained(BASE_MODEL);
+  const prompt = `\n<|im_start|>user\n<|vision_start|><|vision_end|>${query}<|im_end|>\n<|im_start|>assistant\n`;
+  const token = await tokenizer(prompt, {
+    return_tensors: "pt",
+    add_generation_prompt: false,
+    tokenize: true,
+  }).input_ids;
+
+  const seq_length = token.dims[1];
+  let ids_len = new Tensor("int64", new BigInt64Array([BigInt(seq_length)]), [
+    1,
+  ]);
+
+  let input_ids = new ort.Tensor(
+    "int32",
+    new Int32Array(MAX_SEQ_LENGTH).fill(0),
+    [MAX_SEQ_LENGTH]
+  );
+
+  input_ids.data.set(Array.from(token.data.slice(0, seq_length), Number));
+
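+  // Vision branch: QwenVL_A produces image_embed from pixel_values, and QwenVL_D
+  // combines it with the preallocated hidden-state buffer, returning updated
+  // hidden states and position_ids.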
+  if (vision) {
+    let image = await RawImage.fromURL(imagePath);
+    image = image.rgb().toTensor("CHW").to("float32").div_(255.0);
+    const pixel_values = image.unsqueeze(0);
+
+    const ortSessionA = await ort.InferenceSession.create(
+      `${BASE_URL}/QwenVL_A${suffix}.onnx`,
+      { executionProviders: ["webgpu"] }
+    );
+
+    const { image_embed } = await ortSessionA.run({ pixel_values });
+
+    ids_len = ids_len.add(BigInt(IMAGE_EMBED_SIZE));
+
+    const ortSessionD = await ort.InferenceSession.create(
+      `${BASE_URL}/QwenVL_D${suffix}.onnx`,
+      { executionProviders: ["webgpu"] }
+    );
+
+    ({ hidden_states: past_key_states, position_ids } =
+      await ortSessionD.run({
+        "hidden_states.1": past_key_states,
+        image_embed,
+        ids_len,
+        "ids_len_minus": new Tensor(
+          "int32",
+          new Int32Array([Number(ids_len.item()) - Number(prompt_head_len.item())]),
+          [1]
+        ),
+        "split_factor": new Tensor(
+          "int32",
+          new Int32Array([
+            MAX_SEQ_LENGTH - Number(ids_len.item()) - IMAGE_EMBED_SIZE,
+          ]),
+          [1]
+        ),
+      }));
+  }
+
+  const ortSessionB = await ort.InferenceSession.create(
+    `${BASE_URL}/QwenVL_B${suffix}.onnx`,
+    { executionProviders: ["webgpu"] }
+  );
+
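+  // Greedy decode loop: QwenVL_E predicts the next token id from the current
+  // hidden states and KV cache; QwenVL_B then embeds that token as the hidden
+  // state for the next step. Stops on an end-of-sequence token id or after
+  // MAX_SINGLE_CHAT_LENGTH steps.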
+  while (
+    num_decode < MAX_SINGLE_CHAT_LENGTH &&
+    Number(history_len.data[0]) < MAX_SEQ_LENGTH
+  ) {
+    const ortSessionE = await ort.InferenceSession.create(
+      `${BASE_URL}/QwenVL_E_q4f16.onnx`,
+      { executionProviders: ["wasm"] }
+    );
+
+    const result = await ortSessionE.run({
+      hidden_states: past_key_states,
+      attention_mask,
+      "past_key_states.1": past_key_states,
+      "past_value_states.1": past_value_states,
+      history_len,
+      ids_len,
+      position_ids,
+      pos_factor,
+    });
+
+    const token_id = Number(result.max_logit_ids.data[0]);
+    if (token_id === 151643 || token_id === 151645) break;
+
+    num_decode++;
+
+    history_len = history_len.add(BigInt(1));
+    pos_factor = new Tensor(
+      "float16",
+      new Uint16Array([Number(pos_factor.data[0]) + 1]),
+      [1]
+    );
+
+    past_key_states = result.past_key_states;
+    past_value_states = result.past_value_states;
+
+    input_ids.data[0] = token_id;
+    const { hidden_states } = await ortSessionB.run({
+      input_ids,
+      ids_len,
+    });
+
+    past_key_states = hidden_states;
+  }
 }
+
+
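
Note (not part of the commit): a minimal usage sketch for the new export, assuming index.js is loaded as an ES module on the demo page and the QwenVL_*.onnx files are served at BASE_URL (http://localhost:3004/onnx). As committed, simplifiedLLMVision() drives the vision-encode and decode passes but does not yet return or render the decoded text, so the call below only exercises the model.

// Hypothetical call site, e.g. wired into the existing example/upload handlers.
// EXAMPLE_URL and simplifiedLLMVision are the names defined in index.js above;
// the query string is illustrative.
await simplifiedLLMVision(EXAMPLE_URL, 'Describe this image.', true);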