Add loading indicators

#1
by tudi2d - opened
Files changed (1)
  1. index.html +288 -217
index.html CHANGED
@@ -1,260 +1,331 @@
 <!DOCTYPE html>
 <html lang="en">
-  <head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>Camera Interaction App</title>
     <style>
-      body {
-        font-family: sans-serif;
-        display: flex;
-        flex-direction: column;
-        align-items: center;
-        gap: 20px;
-        padding: 20px;
-        background-color: #f0f0f0;
-      }
-      .controls, .io-areas {
-        display: flex;
-        gap: 10px;
-        align-items: center;
-        background-color: #fff;
-        padding: 15px;
-        border-radius: 8px;
-        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
-      }
-      .io-areas {
-        flex-direction: column;
-        align-items: stretch;
-      }
-      textarea {
-        width: 300px;
-        height: 80px;
-        padding: 8px;
-        border: 1px solid #ccc;
-        border-radius: 4px;
-        font-size: 14px;
-      }
-      #videoFeed {
-        width: 480px;
-        height: 360px;
-        border: 2px solid #333;
-        background-color: #000;
-        border-radius: 8px;
-      }
-      #startButton {
-        padding: 10px 20px;
-        font-size: 16px;
-        cursor: pointer;
-        border: none;
-        border-radius: 4px;
-        color: white;
-      }
-      #startButton.start {
-        background-color: #28a745; /* Green */
-      }
-      #startButton.stop {
-        background-color: #dc3545; /* Red */
-      }
-      label {
-        font-weight: bold;
-      }
-      select {
-        padding: 8px;
-        border-radius: 4px;
-        border: 1px solid #ccc;
-      }
-      .hidden {
-        display: none;
-      }
+      body {
+        font-family: sans-serif;
+        display: flex;
+        flex-direction: column;
+        align-items: center;
+        gap: 20px;
+        padding: 20px;
+        background-color: #f0f0f0;
+      }
+      .controls,
+      .io-areas {
+        display: flex;
+        gap: 10px;
+        align-items: center;
+        background-color: #fff;
+        padding: 15px;
+        border-radius: 8px;
+        box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
+      }
+      .io-areas {
+        flex-direction: column;
+        align-items: stretch;
+      }
+      textarea {
+        width: 300px;
+        height: 80px;
+        padding: 8px;
+        border: 1px solid #ccc;
+        border-radius: 4px;
+        font-size: 14px;
+      }
+      #videoFeed {
+        display: block;
+        width: 100%;
+        height: 100%;
+        border-radius: 6px;
+        object-fit: cover;
+      }
+      #videoContainer {
+        position: relative;
+        width: 480px;
+        height: 360px;
+        border: 2px solid #333;
+        background-color: #000;
+        border-radius: 8px;
+        margin: 0 auto;
+      }
+      #loadingOverlay {
+        position: absolute;
+        top: 0;
+        left: 0;
+        width: 100%;
+        height: 100%;
+        display: none;
+        justify-content: center;
+        align-items: center;
+        background-color: rgba(0, 0, 0, 0.7);
+        z-index: 10;
+        border-radius: 6px;
+        color: #ffffff;
+        font-size: 1.5em;
+        font-weight: bold;
+      }
+      #startButton {
+        padding: 10px 20px;
+        font-size: 16px;
+        cursor: pointer;
+        border: none;
+        border-radius: 4px;
+        color: white;
+      }
+      #startButton.start {
+        background-color: #28a745; /* Green */
+      }
+      #startButton.stop {
+        background-color: #dc3545; /* Red */
+      }
+      label {
+        font-weight: bold;
+      }
+      select {
+        padding: 8px;
+        border-radius: 4px;
+        border: 1px solid #ccc;
+      }
+      .hidden {
+        display: none;
+      }
     </style>
-  </head>
-  <body>
-
+  </head>
+  <body>
     <h1>Camera Interaction App</h1>

-    <video id="videoFeed" autoplay playsinline></video>
-    <canvas id="canvas" class="hidden"></canvas> <!-- For capturing frames -->
+    <div id="videoContainer">
+      <video id="videoFeed" autoplay playsinline></video>
+      <div id="loadingOverlay">Loading...</div>
+    </div>
+    <canvas id="canvas" class="hidden"></canvas>
+    <!-- For capturing frames -->

     <div class="io-areas">
-      <div>
-        <label for="instructionText">Instruction:</label><br>
-        <textarea id="instructionText" style="height: 2em; width: 40em" name="Instruction"></textarea>
-      </div>
-      <div>
-        <label for="responseText">Response:</label><br>
-        <textarea id="responseText" style="height: 2em; width: 40em" name="Response" readonly placeholder="Server response will appear here..."></textarea>
-      </div>
+      <div>
+        <label for="instructionText">Instruction:</label><br />
+        <textarea
+          id="instructionText"
+          style="height: 2em; width: 40em"
+          name="Instruction"
+        ></textarea>
+      </div>
+      <div>
+        <label for="responseText">Response:</label><br />
+        <textarea
+          id="responseText"
+          style="height: 2em; width: 40em"
+          name="Response"
+          readonly
+          placeholder="Server response will appear here..."
+        ></textarea>
+      </div>
     </div>

     <div class="controls">
-      <label for="intervalSelect">Interval between 2 requests:</label>
-      <select id="intervalSelect" name="Interval between 2 requests">
-        <option value="0" selected>0ms</option>
-        <option value="100">100ms</option>
-        <option value="250">250ms</option>
-        <option value="500">500ms</option>
-        <option value="1000">1s</option>
-        <option value="2000">2s</option>
-      </select>
-      <button id="startButton" class="start">Start</button>
+      <label for="intervalSelect">Interval between 2 requests:</label>
+      <select id="intervalSelect" name="Interval between 2 requests">
+        <option value="0" selected>0ms</option>
+        <option value="100">100ms</option>
+        <option value="250">250ms</option>
+        <option value="500">500ms</option>
+        <option value="1000">1s</option>
+        <option value="2000">2s</option>
+      </select>
+      <button id="startButton" class="start">Start</button>
     </div>

     <script type="module">
-      import {
-        AutoProcessor,
-        AutoModelForVision2Seq,
-        RawImage
-      } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js';
+      import {
+        AutoProcessor,
+        AutoModelForVision2Seq,
+        RawImage,
+        env,
+      } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js";

-      const video = document.getElementById('videoFeed');
-      const canvas = document.getElementById('canvas');
-      const instructionText = document.getElementById('instructionText');
-      const responseText = document.getElementById('responseText');
-      const intervalSelect = document.getElementById('intervalSelect');
-      const startButton = document.getElementById('startButton');
+      const video = document.getElementById("videoFeed");
+      const canvas = document.getElementById("canvas");
+      const instructionText = document.getElementById("instructionText");
+      const responseText = document.getElementById("responseText");
+      const intervalSelect = document.getElementById("intervalSelect");
+      const startButton = document.getElementById("startButton");
+      const loadingOverlay = document.getElementById("loadingOverlay");

-      instructionText.value = "In one sentence, what do you see?"; // default instruction
+      instructionText.value = "In one sentence, what do you see?"; // default instruction

-      let stream;
-      let isProcessing = false;
+      let stream;
+      let isProcessing = false;

-      let processor, model;
+      let processor, model;

-      async function initModel() {
-        const modelId = 'HuggingFaceTB/SmolVLM-Instruct';
-        processor = await AutoProcessor.from_pretrained(modelId);
-        model = await AutoModelForVision2Seq.from_pretrained(modelId, {
-          dtype: {
-            embed_tokens: 'fp16',
-            vision_encoder: 'q4',
-            decoder_model_merged: 'q4'
-          },
-          device: "webgpu",
-        });
-      }
+      async function initModel() {
+        const modelId = "HuggingFaceTB/SmolVLM-Instruct";
+        loadingOverlay.style.display = "flex";
+        responseText.value = "Loading processor...";
+        processor = await AutoProcessor.from_pretrained(modelId);
+        responseText.value = "Processor loaded. Loading model...";
+        model = await AutoModelForVision2Seq.from_pretrained(modelId, {
+          dtype: {
+            embed_tokens: "fp16",
+            vision_encoder: "q4",
+            decoder_model_merged: "q4",
+          },
+          device: "webgpu",
+        });
+        responseText.value = "Model loaded. Initializing camera...";
+        loadingOverlay.style.display = "none";
+      }

-      async function initCamera() {
-        try {
-          stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: false });
-          video.srcObject = stream;
-          responseText.value = "Camera access granted. Ready to start.";
-        } catch (err) {
-          console.error("Error accessing camera:", err);
-          responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
-          alert(`Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`);
-        }
+      async function initCamera() {
+        try {
+          stream = await navigator.mediaDevices.getUserMedia({
+            video: true,
+            audio: false,
+          });
+          video.srcObject = stream;
+          responseText.value = "Camera access granted. Ready to start.";
+        } catch (err) {
+          console.error("Error accessing camera:", err);
+          responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
+          alert(
+            `Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`
+          );
         }
+      }

-      function captureImage() {
-        if (!stream || !video.videoWidth) {
-          console.warn("Video stream not ready for capture.");
-          return null;
-        }
-        canvas.width = video.videoWidth;
-        canvas.height = video.videoHeight;
-        const context = canvas.getContext('2d', { willReadFrequently: true });
-        context.drawImage(video, 0, 0, canvas.width, canvas.height);
-        const frame = context.getImageData(0, 0, canvas.width, canvas.height);
-        return new RawImage(frame.data, frame.width, frame.height, 4);
+      function captureImage() {
+        if (!stream || !video.videoWidth) {
+          console.warn("Video stream not ready for capture.");
+          return null;
         }
+        canvas.width = video.videoWidth;
+        canvas.height = video.videoHeight;
+        const context = canvas.getContext("2d", { willReadFrequently: true });
+        context.drawImage(video, 0, 0, canvas.width, canvas.height);
+        const frame = context.getImageData(0, 0, canvas.width, canvas.height);
+        return new RawImage(frame.data, frame.width, frame.height, 4);
+      }

-      async function runLocalVisionInference(imgElement, instruction) {
-        const messages = [{
-          role: 'user',
-          content: [
-            { type: 'image' },
-            { type: 'text', text: instruction }
-          ]
-        }];
-        const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
-        const inputs = await processor(text, [imgElement], { do_image_splitting: false });
-        const generatedIds = await model.generate({ ...inputs, max_new_tokens: 100 });
-        const output = processor.batch_decode(
-          generatedIds.slice(null, [inputs.input_ids.dims.at(-1), null]),
-          { skip_special_tokens: true }
-        );
-        return output[0].trim();
-      }
+      async function runLocalVisionInference(imgElement, instruction) {
+        const messages = [
+          {
+            role: "user",
+            content: [{ type: "image" }, { type: "text", text: instruction }],
+          },
+        ];
+        const text = processor.apply_chat_template(messages, {
+          add_generation_prompt: true,
+        });
+        const inputs = await processor(text, [imgElement], {
+          do_image_splitting: false,
+        });
+        const generatedIds = await model.generate({
+          ...inputs,
+          max_new_tokens: 100,
+        });
+        const output = processor.batch_decode(
+          generatedIds.slice(null, [inputs.input_ids.dims.at(-1), null]),
+          { skip_special_tokens: true }
+        );
+        return output[0].trim();
+      }

-      async function sendData() {
-        if (!isProcessing) return;
-        const instruction = instructionText.value;
-        const rawImg = captureImage();
-        if (!rawImg) {
-          responseText.value = 'Capture failed';
-          return;
-        }
-        try {
-          const reply = await runLocalVisionInference(rawImg, instruction);
-          responseText.value = reply;
-        } catch (e) {
-          console.error(e);
-          responseText.value = `Error: ${e.message}`;
-        }
+      async function sendData() {
+        if (!isProcessing) return;
+        const instruction = instructionText.value;
+        const rawImg = captureImage();
+        if (!rawImg) {
+          responseText.value = "Capture failed";
+          return;
+        }
+        try {
+          const reply = await runLocalVisionInference(rawImg, instruction);
+          responseText.value = reply;
+        } catch (e) {
+          console.error(e);
+          responseText.value = `Error: ${e.message}`;
         }
+      }
+
+      function sleep(ms) {
+        return new Promise((resolve) => setTimeout(resolve, ms));
+      }

-      function sleep(ms) {
-        return new Promise(resolve => setTimeout(resolve, ms));
+      async function processingLoop() {
+        const intervalMs = parseInt(intervalSelect.value, 10);
+        while (isProcessing) {
+          await sendData();
+          if (!isProcessing) break;
+          await sleep(intervalMs);
        }
+      }

-      async function processingLoop() {
-        const intervalMs = parseInt(intervalSelect.value, 10);
-        while (isProcessing) {
-          await sendData();
-          if (!isProcessing) break;
-          await sleep(intervalMs);
-        }
+      function handleStart() {
+        if (!stream) {
+          responseText.value = "Camera not available. Cannot start.";
+          alert("Camera not available. Please grant permission first.");
+          return;
        }
+        isProcessing = true;
+        startButton.textContent = "Stop";
+        startButton.classList.replace("start", "stop");

-      function handleStart() {
-        if (!stream) {
-          responseText.value = "Camera not available. Cannot start.";
-          alert("Camera not available. Please grant permission first.");
-          return;
-        }
-        isProcessing = true;
-        startButton.textContent = "Stop";
-        startButton.classList.replace('start', 'stop');
+        instructionText.disabled = true;
+        intervalSelect.disabled = true;

-        instructionText.disabled = true;
-        intervalSelect.disabled = true;
+        responseText.value = "Processing started...";

-        responseText.value = "Processing started...";
+        processingLoop();
+      }

-        processingLoop();
+      function handleStop() {
+        isProcessing = false;
+        startButton.textContent = "Start";
+        startButton.classList.replace("stop", "start");
+
+        instructionText.disabled = false;
+        intervalSelect.disabled = false;
+        if (responseText.value.startsWith("Processing started...")) {
+          responseText.value = "Processing stopped.";
        }
+      }

-      function handleStop() {
-        isProcessing = false;
-        startButton.textContent = "Start";
-        startButton.classList.replace('stop', 'start');
+      startButton.addEventListener("click", () => {
+        if (isProcessing) {
+          handleStop();
+        } else {
+          handleStart();
+        }
+      });

-        instructionText.disabled = false;
-        intervalSelect.disabled = false;
-        if (responseText.value.startsWith("Processing started...")) {
-          responseText.value = "Processing stopped.";
-        }
+      window.addEventListener("DOMContentLoaded", async () => {
+        // Check for WebGPU support
+        if (!navigator.gpu) {
+          const videoElement = document.getElementById("videoFeed");
+          const warningElement = document.createElement("p");
+          warningElement.textContent =
+            "WebGPU is not available in this browser.";
+          warningElement.style.color = "red";
+          warningElement.style.textAlign = "center";
+          videoElement.parentNode.insertBefore(
+            warningElement,
+            videoElement.nextSibling
+          );
        }

-      startButton.addEventListener('click', () => {
-        if (isProcessing) {
-          handleStop();
-        } else {
-          handleStart();
-        }
-      });
+        await initModel();
+        await initCamera();
+      });

-      window.addEventListener('DOMContentLoaded', async () => {
-        await initModel();
-        await initCamera();
-      });
-
-      window.addEventListener('beforeunload', () => {
-        if (stream) {
-          stream.getTracks().forEach(track => track.stop());
-        }
-      });
+      window.addEventListener("beforeunload", () => {
+        if (stream) {
+          stream.getTracks().forEach((track) => track.stop());
+        }
+      });
     </script>
-  </body>
+  </body>
 </html>
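
A possible follow-up: the overlay only ever shows the static text "Loading...". `from_pretrained` in transformers.js also accepts a `progress_callback` option, which could drive per-file download progress on the same overlay. A minimal sketch of the idea, slotting into `initModel()` above; the exact fields on the callback argument (`status`, `file`, `progress`) can vary between transformers.js versions, so treat them as assumptions rather than a confirmed contract:

      // Sketch: wire model-download progress into the existing overlay.
      // progress_callback fires per file; while status === "progress",
      // `progress` is a percentage (field names assumed from transformers.js docs).
      model = await AutoModelForVision2Seq.from_pretrained(modelId, {
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
        device: "webgpu",
        progress_callback: (p) => {
          if (p.status === "progress") {
            loadingOverlay.textContent = `Loading ${p.file}: ${Math.round(p.progress)}%`;
          }
        },
      });

This would reuse the `loadingOverlay` element added in this diff and degrade gracefully: if the callback never fires with a progress status, the overlay simply keeps its initial "Loading..." label.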