Spaces:
Running
Running
import { | |
AutoProcessor, | |
AutoModelForImageTextToText, | |
RawImage, | |
TextStreamer, | |
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2'; | |
// --- Module-level state ---
let processor = null;       // AutoProcessor instance, loaded lazily by initializeModel()
let model = null;           // AutoModelForImageTextToText instance, loaded lazily by initializeModel()
let currentVideo = null;    // File object for the currently selected video (set by the change listener)
let frameDescriptions = []; // { frame, time, description } entries accumulated by processVideo()
// Check WebGPU support | |
/**
 * Update the WebGPU status indicator and report whether WebGPU is exposed.
 * When `navigator.gpu` is absent, falls back to (and locks) the WASM device option.
 * @returns {Promise<boolean>} true when the browser exposes `navigator.gpu`.
 */
async function checkWebGPUSupport() {
  const statusEl = document.getElementById('webgpuStatus');
  const hasWebGPU = 'gpu' in navigator;
  // NOTE(review): the 'β' glyphs below look like mojibake for ✓/✗ — confirm intended characters.
  if (!hasWebGPU) {
    statusEl.textContent = 'β Not Available';
    statusEl.style.color = '#ef4444';
    const deviceSelect = document.getElementById('deviceSelect');
    deviceSelect.value = 'wasm';
    deviceSelect.disabled = true;
    return false;
  }
  statusEl.textContent = 'β Available';
  statusEl.style.color = '#10b981';
  return true;
}
// Initialize the model | |
/**
 * Lazily load the FastVLM processor and model into module state.
 * Quantization: fp16 embeddings, q4 vision encoder and decoder; the
 * `device: 'webgpu'` option is added only when the user selected WebGPU.
 * @returns {Promise<boolean>} true on success; false after showing an error in the UI.
 */
async function initializeModel() {
  const selectedDevice = document.getElementById('deviceSelect').value;
  updateStatus('Loading AI model...');
  try {
    const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";
    processor = await AutoProcessor.from_pretrained(MODEL_ID);
    const options = {
      dtype: {
        embed_tokens: "fp16",
        vision_encoder: "q4",
        decoder_model_merged: "q4",
      },
      // Spread in the device option only for WebGPU; otherwise use the library default.
      ...(selectedDevice === 'webgpu' ? { device: 'webgpu' } : {}),
    };
    model = await AutoModelForImageTextToText.from_pretrained(MODEL_ID, options);
    updateStatus('Model loaded successfully!');
    return true;
  } catch (error) {
    console.error('Model initialization error:', error);
    showError('Failed to load AI model. Please try again.');
    return false;
  }
}
// Extract frames from video | |
/**
 * Decode `numFrames` evenly spaced frames from a video File.
 * Each frame is returned as { image: RawImage, preview: dataURL, time: seconds }.
 * Progress for this phase is reported in the 0–30% band.
 *
 * @param {File} videoFile - The user-selected video file.
 * @param {number} [numFrames=4] - How many frames to sample.
 * @returns {Promise<Array<{image: object, preview: string, time: number}>>}
 * @throws {Error} when the video cannot be loaded or a frame cannot be captured.
 */
async function extractFramesFromVideo(videoFile, numFrames = 4) {
  return new Promise((resolve, reject) => {
    const video = document.createElement('video');
    const canvas = document.createElement('canvas');
    const ctx = canvas.getContext('2d');
    video.src = URL.createObjectURL(videoFile);
    video.addEventListener('error', () => {
      URL.revokeObjectURL(video.src);
      reject(new Error('Failed to load video'));
    });
    video.addEventListener('loadedmetadata', async () => {
      try {
        const duration = video.duration;
        const frameInterval = duration / numFrames;
        // Preserve aspect ratio while capping the longest edge at 1024px.
        // (The previous code clamped width and height independently, which
        // distorted non-square videos larger than 1024 on one axis.)
        const scale = Math.min(1, 1024 / Math.max(video.videoWidth, video.videoHeight));
        canvas.width = Math.round(video.videoWidth * scale);
        canvas.height = Math.round(video.videoHeight * scale);
        const frames = [];
        for (let i = 0; i < numFrames; i++) {
          // Sample at interval midpoints. This avoids seeking to t=0 — the
          // video already sits at 0 after metadata load, so 'seeked' may never
          // fire and the promise would hang — and it also spreads the samples
          // across the whole clip instead of leaving the final segment unsampled.
          const currentTime = (i + 0.5) * frameInterval;
          // Attach the 'seeked' listener before mutating currentTime so the
          // event cannot be missed.
          const seeked = new Promise((r) => video.addEventListener('seeked', r, { once: true }));
          video.currentTime = currentTime;
          await seeked;
          ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
          const blob = await new Promise((r) => canvas.toBlob(r, 'image/png'));
          if (!blob) {
            throw new Error(`Failed to capture frame ${i + 1}`);
          }
          const image = await RawImage.fromBlob(blob);
          // Store a compressed JPEG preview for the UI alongside the model input.
          const previewUrl = canvas.toDataURL('image/jpeg', 0.8);
          frames.push({ image, preview: previewUrl, time: currentTime });
          updateProgress((i + 1) / numFrames * 30, `Extracting frame ${i + 1}/${numFrames}`);
        }
        URL.revokeObjectURL(video.src);
        resolve(frames);
      } catch (err) {
        // Previously, errors thrown inside this async handler became unhandled
        // rejections and the outer promise never settled.
        URL.revokeObjectURL(video.src);
        reject(err);
      }
    });
  });
}
// Process video | |
/**
 * End-to-end analysis pipeline: lazily load the model, extract frames from
 * the selected video, caption each frame with FastVLM (streaming tokens into
 * the UI as they arrive), then generate an overall summary from the per-frame
 * captions. Reads module state `currentVideo`/`model`/`processor` and
 * rebuilds `frameDescriptions`.
 */
async function processVideo() {
    const analyzeBtn = document.getElementById('analyzeBtn');
    const progressSection = document.getElementById('progressSection');
    const resultsSection = document.getElementById('resultsSection');
    // Put the UI into its "busy" state; restored in the finally block below.
    analyzeBtn.disabled = true;
    analyzeBtn.querySelector('.spinner').classList.remove('hidden');
    analyzeBtn.querySelector('.btn-text').textContent = 'Processing...';
    progressSection.classList.remove('hidden');
    resultsSection.classList.add('hidden');
    frameDescriptions = [];
    try {
        // Initialize model if not already loaded (first run only).
        if (!model || !processor) {
            if (!await initializeModel()) {
                throw new Error('Model initialization failed');
            }
        }
        // Extract frames; the extractor reports progress in the 0–30% band.
        const numFrames = parseInt(document.getElementById('frameCount').value);
        updateProgress(0, 'Extracting frames from video...');
        const frames = await extractFramesFromVideo(currentVideo, numFrames);
        // Display frame previews (clear any results from a prior run).
        const framesGrid = document.getElementById('framesGrid');
        framesGrid.innerHTML = '';
        // Process each frame sequentially; this phase covers the 30–80% band.
        for (let i = 0; i < frames.length; i++) {
            updateProgress(30 + (i / frames.length * 50), `Analyzing frame ${i + 1}/${frames.length}`);
            // Create frame card (preview image, timestamp, placeholder description).
            const frameCard = document.createElement('div');
            frameCard.className = 'frame-card';
            frameCard.innerHTML = `
                <img src="${frames[i].preview}" alt="Frame ${i + 1}">
                <div class="frame-info">
                    <h4>Frame ${i + 1}</h4>
                    <span class="frame-time">${formatTime(frames[i].time)}</span>
                </div>
                <div class="frame-description">
                    <div class="loading-dots">Analyzing...</div>
                </div>
            `;
            framesGrid.appendChild(frameCard);
            // Prepare prompt: the <image> placeholder is substituted with the
            // frame's image tokens by the processor's chat template.
            const messages = [
                {
                    role: "user",
                    content: `<image>Describe what's happening in this frame of the video in detail.`,
                },
            ];
            const prompt = processor.apply_chat_template(messages, {
                add_generation_prompt: true,
            });
            // Prepare inputs (special tokens already added by the template).
            const inputs = await processor(frames[i].image, prompt, {
                add_special_tokens: false,
            });
            // Generate output, streaming partial text into the card as it decodes.
            let generatedText = '';
            try {
                const outputs = await model.generate({
                    ...inputs,
                    max_new_tokens: 256,
                    do_sample: false, // greedy decoding for reproducible captions
                    streamer: new TextStreamer(processor.tokenizer, {
                        skip_prompt: true,
                        skip_special_tokens: false,
                        callback_function: (text) => {
                            generatedText += text;
                            // NOTE(review): model-generated text is injected via
                            // innerHTML — any markup the model emits will be parsed
                            // as HTML; consider textContent instead. TODO confirm.
                            frameCard.querySelector('.frame-description').innerHTML = `<p>${generatedText}</p>`;
                        },
                    }),
                });
                // Decode output: slice off the prompt tokens, keep only the new ones.
                const decoded = processor.batch_decode(
                    outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
                    { skip_special_tokens: true },
                );
                frameDescriptions.push({
                    frame: i + 1,
                    time: frames[i].time,
                    description: decoded[0] || generatedText
                });
            } catch (frameError) {
                // A single bad frame should not abort the whole run: record a
                // placeholder description and continue with the next frame.
                console.error(`Error processing frame ${i + 1}:`, frameError);
                frameDescriptions.push({
                    frame: i + 1,
                    time: frames[i].time,
                    description: 'Failed to analyze this frame'
                });
                frameCard.querySelector('.frame-description').innerHTML = `<p style="color: #ef4444;">Failed to analyze this frame</p>`;
            }
        }
        // Generate overall summary (80–100% band).
        updateProgress(80, 'Generating video summary...');
        const summaryCard = document.getElementById('summaryCard');
        const summaryContent = document.getElementById('summaryContent');
        // Create a summary based on the frame descriptions: the per-frame
        // captions are folded into a single prompt for a second generation pass.
        if (frameDescriptions.length > 0) {
            const summaryMessages = [
                {
                    role: "user",
                    content: `<image>Based on what you see in this video frame and knowing that the video contains the following sequence: ${frameDescriptions.map(f => `Frame ${f.frame}: ${f.description}`).join('; ')}. Provide a comprehensive summary of what the entire video is about.`,
                },
            ];
            const summaryPrompt = processor.apply_chat_template(summaryMessages, {
                add_generation_prompt: true,
            });
            // Use the last frame's image for context (the model requires one image input).
            const summaryInputs = await processor(frames[frames.length - 1].image, summaryPrompt, {
                add_special_tokens: false,
            });
            let summaryText = '';
            const summaryOutputs = await model.generate({
                ...summaryInputs,
                max_new_tokens: 512,
                do_sample: false,
                streamer: new TextStreamer(processor.tokenizer, {
                    skip_prompt: true,
                    skip_special_tokens: false,
                    callback_function: (text) => {
                        summaryText += text;
                        summaryContent.innerHTML = `<p>${summaryText}</p>`;
                        summaryCard.classList.remove('hidden');
                    },
                }),
            });
        }
        updateProgress(100, 'Analysis complete!');
        // Show results
        resultsSection.classList.remove('hidden');
        progressSection.classList.add('hidden');
    } catch (error) {
        console.error('Processing error:', error);
        showError(`Failed to process video: ${error.message}`);
    } finally {
        // Always restore the analyze button, whether the run succeeded or failed.
        analyzeBtn.disabled = false;
        analyzeBtn.querySelector('.spinner').classList.add('hidden');
        analyzeBtn.querySelector('.btn-text').textContent = 'Analyze Video';
    }
}
// Utility functions | |
/**
 * Format a duration in seconds as "M:SS" (e.g. 65 -> "1:05").
 * Fractional seconds are truncated.
 * @param {number} seconds - Duration in seconds.
 * @returns {string} Minutes and zero-padded seconds.
 */
function formatTime(seconds) {
  const wholeSeconds = Math.floor(seconds);
  const mins = Math.floor(wholeSeconds / 60);
  const secs = wholeSeconds % 60;
  return `${mins}:${String(secs).padStart(2, '0')}`;
}
/**
 * Update the progress bar width, the rounded percentage label,
 * and the current-status text line.
 * @param {number} percent - Completion percentage (0–100).
 * @param {string} status - Human-readable status message.
 */
function updateProgress(percent, status) {
  const fill = document.getElementById('progressFill');
  fill.style.width = `${percent}%`;
  const label = document.getElementById('progressText');
  label.textContent = `${Math.round(percent)}%`;
  document.getElementById('currentStatus').textContent = status;
}
/**
 * Replace the current-status line text.
 * @param {string} message - Status message to display.
 */
function updateStatus(message) {
  const statusEl = document.getElementById('currentStatus');
  statusEl.textContent = message;
}
/**
 * Display `message` in the error panel and hide the progress panel.
 * @param {string} message - Error text shown to the user.
 */
function showError(message) {
  const errorEl = document.getElementById('errorMessage');
  errorEl.textContent = message;
  const classListOf = (id) => document.getElementById(id).classList;
  classListOf('errorSection').remove('hidden');
  classListOf('progressSection').add('hidden');
}
/**
 * Serialize the analysis results (per-frame descriptions plus the summary
 * text) to a pretty-printed JSON file and trigger a client-side download.
 */
function downloadResults() {
  const results = {
    timestamp: new Date().toISOString(),
    // Optional chaining: previously this threw a TypeError when invoked
    // after resetApp() had cleared currentVideo.
    video: currentVideo?.name ?? null,
    frames: frameDescriptions,
    summary: document.getElementById('summaryContent').textContent,
  };
  const blob = new Blob([JSON.stringify(results, null, 2)], { type: 'application/json' });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = `video-analysis-${Date.now()}.json`;
  // Some browsers only honor programmatic click() on anchors attached to the DOM.
  document.body.appendChild(a);
  a.click();
  a.remove();
  URL.revokeObjectURL(url);
}
/**
 * Reset the UI to its initial state: clear the file input and info panel,
 * hide every dynamic section, and drop module-level analysis state.
 */
function resetApp() {
  document.getElementById('videoInput').value = '';
  document.getElementById('videoInfo').innerHTML = '';
  const sectionsToHide = [
    'videoPreview',
    'analyzeBtn',
    'progressSection',
    'resultsSection',
    'errorSection',
  ];
  for (const id of sectionsToHide) {
    document.getElementById(id).classList.add('hidden');
  }
  currentVideo = null;
  frameDescriptions = [];
}
// Event listeners | |
// File picker handler: validate the selection is a video, show its name and
// size, wire up the preview player, and reveal the analyze button.
document.getElementById('videoInput').addEventListener('change', (e) => {
  const file = e.target.files[0];
  if (!file || !file.type.startsWith('video/')) return;
  currentVideo = file;
  // Build the info card with DOM APIs and textContent rather than an
  // innerHTML template: file names are user-controlled and must not be
  // parsed as HTML (XSS hardening).
  const videoInfo = document.getElementById('videoInfo');
  videoInfo.innerHTML = '';
  const infoDiv = document.createElement('div');
  infoDiv.className = 'file-info';
  const nameSpan = document.createElement('span');
  nameSpan.className = 'file-name';
  nameSpan.textContent = file.name;
  const sizeSpan = document.createElement('span');
  sizeSpan.className = 'file-size';
  sizeSpan.textContent = `${(file.size / 1024 / 1024).toFixed(2)} MB`;
  infoDiv.append(nameSpan, sizeSpan);
  videoInfo.appendChild(infoDiv);
  // Show video preview, releasing the previous object URL so repeatedly
  // picking files does not leak blob memory.
  const videoElement = document.getElementById('videoElement');
  if (videoElement.src) URL.revokeObjectURL(videoElement.src);
  videoElement.src = URL.createObjectURL(file);
  document.getElementById('videoPreview').classList.remove('hidden');
  // Show analyze button
  const analyzeBtn = document.getElementById('analyzeBtn');
  analyzeBtn.classList.remove('hidden');
  analyzeBtn.disabled = false;
});
// Start the analysis pipeline when the user clicks "Analyze Video".
document.getElementById('analyzeBtn').addEventListener('click', processVideo);
// Initialize: probe WebGPU availability immediately on page load so the
// device selector reflects what this browser supports.
checkWebGPUSupport();