pdufour committed on
Commit
37f2943
1 Parent(s): 879676c

Update index.js

Files changed (1)
  1. index.js +166 -30
index.js CHANGED
@@ -1,4 +1,7 @@
-import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
+import { pipeline, env, AutoTokenizer, RawImage } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers';
+import { getModelJSON } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers/utils/hub.js";
+import { Tensor } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers/utils/tensor.js";
+import * as ort from "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.20.0/dist/ort.webgpu.mjs";
 
 // Since we will download the model from the Hugging Face Hub, we can skip the local model check
 env.allowLocalModels = false;
@@ -9,11 +12,18 @@ const fileUpload = document.getElementById('upload');
 const imageContainer = document.getElementById('container');
 const example = document.getElementById('example');
 
-const EXAMPLE_URL = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/city-streets.jpg';
+const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";
+const INPUT_IMAGE_SIZE = [960, 960];
+const HEIGHT_FACTOR = 10;
+const WIDTH_FACTOR = 10;
+const IMAGE_EMBED_SIZE = WIDTH_FACTOR * HEIGHT_FACTOR;
+const MAX_SEQ_LENGTH = 1024;
+const BASE_URL = "http://localhost:3004/onnx";
+const BASE_MODEL = "Qwen/Qwen2-VL-2B-Instruct";
+const QUANTIZATION = "q4f16";
+const MAX_SINGLE_CHAT_LENGTH = 10;
 
-// Create a new object detection pipeline
 status.textContent = 'Loading model...';
-const detector = await pipeline('object-detection', 'Xenova/detr-resnet-50');
 status.textContent = 'Ready';
 
 example.addEventListener('click', (e) => {
@@ -50,30 +60,156 @@ async function detect(img) {
     output.forEach(renderBox);
 }
 
-// Render a bounding box and label on the image
-function renderBox({ box, label }) {
-    const { xmax, xmin, ymax, ymin } = box;
-
-    // Generate a random color for the box
-    const color = '#' + Math.floor(Math.random() * 0xFFFFFF).toString(16).padStart(6, 0);
-
-    // Draw the box
-    const boxElement = document.createElement('div');
-    boxElement.className = 'bounding-box';
-    Object.assign(boxElement.style, {
-        borderColor: color,
-        left: 100 * xmin + '%',
-        top: 100 * ymin + '%',
-        width: 100 * (xmax - xmin) + '%',
-        height: 100 * (ymax - ymin) + '%',
-    })
-
-    // Draw label
-    const labelElement = document.createElement('span');
-    labelElement.textContent = label;
-    labelElement.className = 'bounding-box-label';
-    labelElement.style.backgroundColor = color;
-
-    boxElement.appendChild(labelElement);
-    imageContainer.appendChild(boxElement);
+
+export async function simplifiedLLMVision(
+  imagePath,
+  query,
+  vision = true
+) {
+  const suffix = QUANTIZATION ? `_${QUANTIZATION}` : "";
+
+  const config = await getModelJSON(BASE_MODEL, "config.json");
+
+  const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);
+
+  let position_ids;
+  let num_decode = 0;
+  let history_len = new Tensor("int64", new BigInt64Array([0n]), [1]);
+
+  let past_key_states = new ort.Tensor(
+    "float16",
+    new Uint16Array(
+      config.num_hidden_layers *
+        config.num_key_value_heads *
+        MAX_SEQ_LENGTH *
+        (config.hidden_size / config.num_attention_heads)
+    ).fill(0),
+    [
+      config.num_hidden_layers,
+      config.num_key_value_heads,
+      MAX_SEQ_LENGTH,
+      config.hidden_size / config.num_attention_heads,
+    ]
+  );
+
+  let past_value_states = past_key_states;
+
+  let attention_mask = new ort.Tensor(
+    "float16",
+    new Uint16Array([0xfbff]), // -65504.0 in float16
+    [1]
+  );
+
+  let pos_factor = new Tensor("float16", new Uint16Array([0]), [1]);
+
+  const tokenizer = await AutoTokenizer.from_pretrained(BASE_MODEL);
+  const prompt = `\n<|im_start|>user\n<|vision_start|><|vision_end|>${query}<|im_end|>\n<|im_start|>assistant\n`;
+  const token = await tokenizer(prompt, {
+    return_tensors: "pt",
+    add_generation_prompt: false,
+    tokenize: true,
+  }).input_ids;
+
+  const seq_length = token.dims[1];
+  let ids_len = new Tensor("int64", new BigInt64Array([BigInt(seq_length)]), [
+    1,
+  ]);
+
+  let input_ids = new ort.Tensor(
+    "int32",
+    new Int32Array(MAX_SEQ_LENGTH).fill(0),
+    [MAX_SEQ_LENGTH]
+  );
+
+  input_ids.data.set(Array.from(token.data.slice(0, seq_length), Number));
+
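+  // Vision branch: QwenVL_A produces image_embed from pixel_values, and QwenVL_D
+  // combines it with the preallocated hidden-state buffer, returning updated
+  // hidden states and position_ids.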
+  if (vision) {
+    let image = await RawImage.fromURL(imagePath);
+    image = image.rgb().toTensor("CHW").to("float32").div_(255.0);
+    const pixel_values = image.unsqueeze(0);
+
+    const ortSessionA = await ort.InferenceSession.create(
+      `${BASE_URL}/QwenVL_A${suffix}.onnx`,
+      { executionProviders: ["webgpu"] }
+    );
+
+    const { image_embed } = await ortSessionA.run({ pixel_values });
+
+    ids_len = ids_len.add(BigInt(IMAGE_EMBED_SIZE));
+
+    const ortSessionD = await ort.InferenceSession.create(
+      `${BASE_URL}/QwenVL_D${suffix}.onnx`,
+      { executionProviders: ["webgpu"] }
+    );
+
+    ({ hidden_states: past_key_states, position_ids } =
+      await ortSessionD.run({
+        "hidden_states.1": past_key_states,
+        image_embed,
+        ids_len,
+        "ids_len_minus": new Tensor(
+          "int32",
+          new Int32Array([Number(ids_len.item()) - Number(prompt_head_len.item())]),
+          [1]
+        ),
+        "split_factor": new Tensor(
+          "int32",
+          new Int32Array([
+            MAX_SEQ_LENGTH - Number(ids_len.item()) - IMAGE_EMBED_SIZE,
+          ]),
+          [1]
+        ),
+      }));
+  }
+
+  const ortSessionB = await ort.InferenceSession.create(
+    `${BASE_URL}/QwenVL_B${suffix}.onnx`,
+    { executionProviders: ["webgpu"] }
+  );
+
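+  // Greedy decode loop: QwenVL_E predicts the next token id from the current
+  // hidden states and KV cache; QwenVL_B then embeds that token as the hidden
+  // state for the next step. Stops on an end-of-sequence token id or after
+  // MAX_SINGLE_CHAT_LENGTH steps.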
+  while (
+    num_decode < MAX_SINGLE_CHAT_LENGTH &&
+    Number(history_len.data[0]) < MAX_SEQ_LENGTH
+  ) {
+    const ortSessionE = await ort.InferenceSession.create(
+      `${BASE_URL}/QwenVL_E_q4f16.onnx`,
+      { executionProviders: ["wasm"] }
+    );
+
+    const result = await ortSessionE.run({
+      hidden_states: past_key_states,
+      attention_mask,
+      "past_key_states.1": past_key_states,
+      "past_value_states.1": past_value_states,
+      history_len,
+      ids_len,
+      position_ids,
+      pos_factor,
+    });
+
+    const token_id = Number(result.max_logit_ids.data[0]);
+    if (token_id === 151643 || token_id === 151645) break;
+
+    num_decode++;
+
+    history_len = history_len.add(BigInt(1));
+    pos_factor = new Tensor(
+      "float16",
+      new Uint16Array([Number(pos_factor.data[0]) + 1]),
+      [1]
+    );
+
+    past_key_states = result.past_key_states;
+    past_value_states = result.past_value_states;
+
+    input_ids.data[0] = token_id;
+    const { hidden_states } = await ortSessionB.run({
+      input_ids,
+      ids_len,
+    });
+
+    past_key_states = hidden_states;
+  }
 }
+
+
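
Note (not part of the commit): a minimal usage sketch for the new export, assuming index.js is loaded as an ES module on the demo page and the QwenVL_*.onnx files are served at BASE_URL (http://localhost:3004/onnx). As committed, simplifiedLLMVision() drives the vision-encode and decode passes but does not yet return or render the decoded text, so the call below only exercises the model.

// Hypothetical call site, e.g. wired into the existing example/upload handlers.
// EXAMPLE_URL and simplifiedLLMVision are the names defined in index.js above;
// the query string is illustrative.
await simplifiedLLMVision(EXAMPLE_URL, 'Describe this image.', true);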