// Source: Hugging Face Space (running on CPU Upgrade) · file size: 1,514 bytes · revision ae8cf98
// Load the Transformers.js bundle into this worker via importScripts.
// NOTE(review): the destructuring below relies on the loaded script defining a
// global `transformers` object — confirm that the pinned package/version on the
// CDN actually ships such a build.
importScripts('https://cdn.jsdelivr.net/npm/@xenova/transformers@3.0.0');
const { AutoProcessor, AutoTokenizer, Moondream1ForConditionalGeneration, RawImage } = transformers;

// Module-level handles. They start out undefined, are assigned by
// initializeModel(), and are then reused by every subsequent caption request.
let processor;
let tokenizer;
let model;
// In-flight (or completed) load shared by every caller, so overlapping requests
// never start a second initialization of the same model.
let modelLoadPromise = null;

/**
 * Loads the processor, tokenizer and model for `Xenova/moondream2` and stores
 * them in the module-level `processor`, `tokenizer` and `model` variables.
 *
 * Concurrent and repeated calls share a single load. If the load fails, the
 * cached promise is cleared so that a later call can retry.
 *
 * @returns {Promise<void>} Resolves once all three handles have been assigned.
 * @throws Rejects with whatever error any of the `from_pretrained` calls raised.
 */
async function initializeModel() {
  if (!modelLoadPromise) {
    modelLoadPromise = (async () => {
      const model_id = 'Xenova/moondream2';
      // The three loads do not depend on each other, so run them in parallel.
      const [loadedProcessor, loadedTokenizer, loadedModel] = await Promise.all([
        AutoProcessor.from_pretrained(model_id),
        AutoTokenizer.from_pretrained(model_id),
        Moondream1ForConditionalGeneration.from_pretrained(model_id, {
          dtype: {
            embed_tokens: 'fp16',
            vision_encoder: 'fp16',
            decoder_model_merged: 'q4',
          },
          device: 'webgpu',
        }),
      ]);
      // Assign only after everything succeeded, so a partial failure never
      // leaves the module half-initialized.
      processor = loadedProcessor;
      tokenizer = loadedTokenizer;
      model = loadedModel;
    })().catch((error) => {
      // Forget the failed attempt so the next call starts a fresh load.
      modelLoadPromise = null;
      throw error;
    });
  }
  await modelLoadPromise;
}
/**
 * Generates a text answer about an image using the loaded model.
 *
 * The model, tokenizer and processor are initialized on first use via
 * `initializeModel()` when any of them is not yet set.
 *
 * @param {string} imageDataUrl - URL (e.g. a data URL) handed to `RawImage.fromURL`.
 * @param {object} [options]
 * @param {string} [options.prompt='Describe this image.'] - Question inserted into the prompt template.
 * @param {number} [options.maxNewTokens=64] - Passed to `model.generate` as `max_new_tokens`.
 * @returns {Promise<string>} The first decoded sequence, trimmed of surrounding whitespace.
 * @throws {TypeError} If `imageDataUrl` is null, undefined or an empty string.
 */
async function captionImage(
  imageDataUrl,
  { prompt = 'Describe this image.', maxNewTokens = 64 } = {},
) {
  // Fail fast with a clear message instead of first paying for the model load
  // and then failing somewhere inside image decoding.
  if (imageDataUrl == null || imageDataUrl === '') {
    throw new TypeError('captionImage: imageDataUrl must be a non-empty URL string');
  }

  if (!processor || !tokenizer || !model) {
    await initializeModel();
  }

  // Text side: wrap the question in the prompt template and tokenize it.
  const text = `<image>\n\nQuestion: ${prompt}\n\nAnswer:`;
  const text_inputs = tokenizer(text);

  // Vision side: decode the image and run it through the processor.
  const image = await RawImage.fromURL(imageDataUrl);
  const vision_inputs = await processor(image);

  const output = await model.generate({
    ...text_inputs,
    ...vision_inputs,
    do_sample: false,
    max_new_tokens: maxNewTokens,
  });

  // NOTE(review): the decoded text may still contain the echoed prompt if
  // generate() returns the input tokens as well — confirm what callers expect.
  const decoded = tokenizer.batch_decode(output, { skip_special_tokens: true });
  return decoded[0].trim();
}
/**
 * Worker entry point. Expects messages shaped like `{ imageDataUrl }` and
 * always answers with exactly one message: `{ caption }` on success or
 * `{ error }` (a string) on any failure.
 */
self.addEventListener('message', async (event) => {
  try {
    // Read the payload inside the try block so that a malformed message
    // (e.g. null data) is reported back instead of throwing unhandled.
    const { imageDataUrl } = event.data ?? {};
    const caption = await captionImage(imageDataUrl);
    self.postMessage({ caption });
  } catch (error) {
    // Non-Error values can be thrown too; make sure a string is always sent.
    const message = error instanceof Error ? error.message : String(error);
    self.postMessage({ error: message });
  }
});