// vid-transcript / index.js
// Extracts frames from an uploaded video and describes each frame (plus an
// overall summary) with FastVLM running fully in the browser via transformers.js.
import {
AutoProcessor,
AutoModelForImageTextToText,
RawImage,
TextStreamer,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2';
let processor = null;
let model = null;
let currentVideo = null;
let frameDescriptions = [];
// Check WebGPU support
async function checkWebGPUSupport() {
const statusEl = document.getElementById('webgpuStatus');
if ('gpu' in navigator) {
statusEl.textContent = 'βœ… Available';
statusEl.style.color = '#10b981';
return true;
} else {
statusEl.textContent = '❌ Not Available';
statusEl.style.color = '#ef4444';
document.getElementById('deviceSelect').value = 'wasm';
document.getElementById('deviceSelect').disabled = true;
return false;
}
}
// Initialize the model
async function initializeModel() {
const device = document.getElementById('deviceSelect').value;
updateStatus('Loading AI model...');
try {
const model_id = "onnx-community/FastVLM-0.5B-ONNX";
processor = await AutoProcessor.from_pretrained(model_id);
const modelOptions = {
dtype: {
embed_tokens: "fp16",
vision_encoder: "q4",
decoder_model_merged: "q4",
}
};
if (device === 'webgpu') {
modelOptions.device = 'webgpu';
}
model = await AutoModelForImageTextToText.from_pretrained(model_id, modelOptions);
updateStatus('Model loaded successfully!');
return true;
} catch (error) {
console.error('Model initialization error:', error);
showError('Failed to load AI model. Please try again.');
return false;
}
}
// Extract frames from video
async function extractFramesFromVideo(videoFile, numFrames = 4) {
return new Promise((resolve, reject) => {
const video = document.createElement('video');
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
video.src = URL.createObjectURL(videoFile);
video.addEventListener('loadedmetadata', async () => {
const duration = video.duration;
const frameInterval = duration / numFrames;
const frames = [];
const frameTimes = [];
canvas.width = Math.min(video.videoWidth, 1024);
canvas.height = Math.min(video.videoHeight, 1024);
for (let i = 0; i < numFrames; i++) {
const currentTime = i * frameInterval;
video.currentTime = currentTime;
frameTimes.push(currentTime);
await new Promise(r => {
video.addEventListener('seeked', () => r(), { once: true });
});
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
const blob = await new Promise(r => canvas.toBlob(r, 'image/png'));
const image = await RawImage.fromBlob(blob);
// Store frame preview
const previewUrl = canvas.toDataURL('image/jpeg', 0.8);
frames.push({ image, preview: previewUrl, time: currentTime });
updateProgress((i + 1) / numFrames * 30, `Extracting frame ${i + 1}/${numFrames}`);
}
URL.revokeObjectURL(video.src);
resolve(frames);
});
video.addEventListener('error', () => {
URL.revokeObjectURL(video.src);
reject(new Error('Failed to load video'));
});
});
}
// Process video
async function processVideo() {
const analyzeBtn = document.getElementById('analyzeBtn');
const progressSection = document.getElementById('progressSection');
const resultsSection = document.getElementById('resultsSection');
analyzeBtn.disabled = true;
analyzeBtn.querySelector('.spinner').classList.remove('hidden');
analyzeBtn.querySelector('.btn-text').textContent = 'Processing...';
progressSection.classList.remove('hidden');
resultsSection.classList.add('hidden');
frameDescriptions = [];
try {
// Initialize model if not already loaded
if (!model || !processor) {
if (!await initializeModel()) {
throw new Error('Model initialization failed');
}
}
// Extract frames
const numFrames = parseInt(document.getElementById('frameCount').value);
updateProgress(0, 'Extracting frames from video...');
const frames = await extractFramesFromVideo(currentVideo, numFrames);
// Display frame previews
const framesGrid = document.getElementById('framesGrid');
framesGrid.innerHTML = '';
// Process each frame
for (let i = 0; i < frames.length; i++) {
updateProgress(30 + (i / frames.length * 50), `Analyzing frame ${i + 1}/${frames.length}`);
// Create frame card
const frameCard = document.createElement('div');
frameCard.className = 'frame-card';
frameCard.innerHTML = `
<img src="${frames[i].preview}" alt="Frame ${i + 1}">
<div class="frame-info">
<h4>Frame ${i + 1}</h4>
<span class="frame-time">${formatTime(frames[i].time)}</span>
</div>
<div class="frame-description">
<div class="loading-dots">Analyzing...</div>
</div>
`;
framesGrid.appendChild(frameCard);
// Prepare prompt
const messages = [
{
role: "user",
content: `<image>Describe what's happening in this frame of the video in detail.`,
},
];
const prompt = processor.apply_chat_template(messages, {
add_generation_prompt: true,
});
// Prepare inputs
const inputs = await processor(frames[i].image, prompt, {
add_special_tokens: false,
});
// Generate output
let generatedText = '';
try {
const outputs = await model.generate({
...inputs,
max_new_tokens: 256,
do_sample: false,
streamer: new TextStreamer(processor.tokenizer, {
skip_prompt: true,
skip_special_tokens: false,
callback_function: (text) => {
generatedText += text;
frameCard.querySelector('.frame-description').innerHTML = `<p>${generatedText}</p>`;
},
}),
});
// Decode output
const decoded = processor.batch_decode(
outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
{ skip_special_tokens: true },
);
frameDescriptions.push({
frame: i + 1,
time: frames[i].time,
description: decoded[0] || generatedText
});
} catch (frameError) {
console.error(`Error processing frame ${i + 1}:`, frameError);
frameDescriptions.push({
frame: i + 1,
time: frames[i].time,
description: 'Failed to analyze this frame'
});
frameCard.querySelector('.frame-description').innerHTML = `<p style="color: #ef4444;">Failed to analyze this frame</p>`;
}
}
// Generate overall summary
updateProgress(80, 'Generating video summary...');
const summaryCard = document.getElementById('summaryCard');
const summaryContent = document.getElementById('summaryContent');
// Create a summary based on the frame descriptions
if (frameDescriptions.length > 0) {
const summaryMessages = [
{
role: "user",
content: `<image>Based on what you see in this video frame and knowing that the video contains the following sequence: ${frameDescriptions.map(f => `Frame ${f.frame}: ${f.description}`).join('; ')}. Provide a comprehensive summary of what the entire video is about.`,
},
];
const summaryPrompt = processor.apply_chat_template(summaryMessages, {
add_generation_prompt: true,
});
// Use the last frame's image for context
const summaryInputs = await processor(frames[frames.length - 1].image, summaryPrompt, {
add_special_tokens: false,
});
let summaryText = '';
const summaryOutputs = await model.generate({
...summaryInputs,
max_new_tokens: 512,
do_sample: false,
streamer: new TextStreamer(processor.tokenizer, {
skip_prompt: true,
skip_special_tokens: false,
callback_function: (text) => {
summaryText += text;
summaryContent.innerHTML = `<p>${summaryText}</p>`;
summaryCard.classList.remove('hidden');
},
}),
});
}
updateProgress(100, 'Analysis complete!');
// Show results
resultsSection.classList.remove('hidden');
progressSection.classList.add('hidden');
} catch (error) {
console.error('Processing error:', error);
showError(`Failed to process video: ${error.message}`);
} finally {
analyzeBtn.disabled = false;
analyzeBtn.querySelector('.spinner').classList.add('hidden');
analyzeBtn.querySelector('.btn-text').textContent = 'Analyze Video';
}
}
// Utility functions
function formatTime(seconds) {
const mins = Math.floor(seconds / 60);
const secs = Math.floor(seconds % 60);
return `${mins}:${secs.toString().padStart(2, '0')}`;
}
function updateProgress(percent, status) {
document.getElementById('progressFill').style.width = `${percent}%`;
document.getElementById('progressText').textContent = `${Math.round(percent)}%`;
document.getElementById('currentStatus').textContent = status;
}
function updateStatus(message) {
document.getElementById('currentStatus').textContent = message;
}
function showError(message) {
document.getElementById('errorMessage').textContent = message;
document.getElementById('errorSection').classList.remove('hidden');
document.getElementById('progressSection').classList.add('hidden');
}
function downloadResults() {
const results = {
timestamp: new Date().toISOString(),
video: currentVideo.name,
frames: frameDescriptions,
summary: document.getElementById('summaryContent').textContent
};
const blob = new Blob([JSON.stringify(results, null, 2)], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `video-analysis-${Date.now()}.json`;
a.click();
URL.revokeObjectURL(url);
}
function resetApp() {
document.getElementById('videoInput').value = '';
document.getElementById('videoInfo').innerHTML = '';
document.getElementById('videoPreview').classList.add('hidden');
document.getElementById('analyzeBtn').classList.add('hidden');
document.getElementById('progressSection').classList.add('hidden');
document.getElementById('resultsSection').classList.add('hidden');
document.getElementById('errorSection').classList.add('hidden');
currentVideo = null;
frameDescriptions = [];
}
// Event listeners
document.getElementById('videoInput').addEventListener('change', (e) => {
const file = e.target.files[0];
if (file && file.type.startsWith('video/')) {
currentVideo = file;
// Display video info
const videoInfo = document.getElementById('videoInfo');
videoInfo.innerHTML = `
<div class="file-info">
<span class="file-name">${file.name}</span>
<span class="file-size">${(file.size / 1024 / 1024).toFixed(2)} MB</span>
</div>
`;
// Show video preview
const videoPreview = document.getElementById('videoPreview');
const videoElement = document.getElementById('videoElement');
videoElement.src = URL.createObjectURL(file);
videoPreview.classList.remove('hidden');
// Show analyze button
document.getElementById('analyzeBtn').classList.remove('hidden');
document.getElementById('analyzeBtn').disabled = false;
}
});
document.getElementById('analyzeBtn').addEventListener('click', processVideo);
// Initialize
checkWebGPUSupport();