Spaces:

jbilcke-hf
/

clapper

Running on CPU Upgrade

App Files Files Community

clapper / src /services /io /extractCaptionFromFrameMoondream.ts

jbilcke-hf HF staff

work on video import

86259b4 8 days ago

raw

history blame

No virus

2.37 kB

	import {
	AutoProcessor,
	AutoTokenizer,
	Moondream1ForConditionalGeneration,
	RawImage,
	} from '@xenova/transformers'

	/**
	 * Shared, lazily-created promise for the Moondream processor, tokenizer and
	 * model, so they are instantiated once instead of once per analyzed frame.
	 */
	let moondreamComponents: ReturnType<typeof loadMoondream> | undefined

	/**
	 * Instantiates the Moondream processor, tokenizer and model on WebGPU.
	 * The three loads are independent of each other, so they run in parallel.
	 */
	async function loadMoondream() {
	  const model_id = 'Xenova/moondream2'
	  const [processor, tokenizer, model] = await Promise.all([
	    AutoProcessor.from_pretrained(model_id),
	    AutoTokenizer.from_pretrained(model_id),
	    Moondream1ForConditionalGeneration.from_pretrained(model_id, {
	      dtype: {
	        embed_tokens: 'fp16', // or 'fp32'
	        vision_encoder: 'fp16', // or 'q8'
	        decoder_model_merged: 'q4', // or 'q4f16' or 'q8'
	      },
	      device: 'webgpu',
	    }),
	  ])
	  return { processor, tokenizer, model }
	}

	/**
	 * Returns the cached components, starting the load on first use.
	 * A failed load clears the cache so that a later call can retry.
	 */
	function getMoondream(): ReturnType<typeof loadMoondream> {
	  if (!moondreamComponents) {
	    moondreamComponents = loadMoondream().catch((error: unknown) => {
	      moondreamComponents = undefined
	      throw error
	    })
	  }
	  return moondreamComponents
	}

	/**
	 * Asks the Moondream vision-language model to describe a single image.
	 *
	 * @param imageInBase64DataUri - the image to analyze, passed to `RawImage.fromURL`
	 * @returns the first decoded sequence, or an empty string if there is none.
	 *   Decoding keeps special tokens, so the text contains the prompt as well
	 *   as the answer, e.g.
	 *   `<|endoftext|><image>\n\nQuestion: Describe this image.\n\nAnswer: ...<|endoftext|>`
	 * @throws Error with setup instructions when WebGPU is not available
	 */
	export async function extractCaptionFromFrameMoondream(
	  imageInBase64DataUri: string
	): Promise<string> {
	  // `navigator` does not exist in every runtime (e.g. server-side rendering),
	  // so test for it before reading `.gpu` to always surface the help message.
	  if (typeof navigator === 'undefined' || !(navigator as any).gpu) {
	    throw new Error(`Please enable WebGPU to analyze video frames:

	1. You need a modern browser such as Google Chrome 113+, Microsoft Edge 113+, Safari 18 (macOS 15), Firefox Nightly

	2. You need to enable WebGPU (depends on your browser, see below)

	2.1 For Chrome: Perform the following operations in the Chrome / Microsoft Edge address bar
	The chrome://flags/#enable-unsafe-webgpu flag must be enabled (not enable-webgpu-developer-features).
	Linux experimental support also requires launching the browser with --enable-features=Vulkan.

	2.2 For Safari 18 (macOS 15): WebGPU is enabled by default

	2.3 For Firefox Nightly: Type about:config in the address bar and set 'dom.webgpu.enabled' to true
	`)
	  }

	  // Load (or reuse) processor, tokenizer and model
	  const { processor, tokenizer, model } = await getMoondream()

	  // Prepare text inputs
	  const prompt = 'Describe this image.'
	  const text = `<image>\n\nQuestion: ${prompt}\n\nAnswer:`
	  const text_inputs = tokenizer(text)

	  // Prepare vision inputs
	  const image = await RawImage.fromURL(imageInBase64DataUri)
	  const vision_inputs = await processor(image)

	  // Generate response (greedy decoding, bounded length)
	  const output = await model.generate({
	    ...text_inputs,
	    ...vision_inputs,
	    do_sample: false,
	    max_new_tokens: 177,
	  })
	  const decoded = tokenizer.batch_decode(output, { skip_special_tokens: false })

	  return decoded[0] ?? ''
	}