Seym0n committed on
Commit cb2d036 · 1 Parent(s): ff324d9

feat: handle images instead of webcam

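At a glance, the commit swaps the webcam permission flow for a file-upload flow: AppState loses its webcam states, App.tsx keeps the uploaded File[] in state, and runInference accepts a File as well as an HTMLVideoElement. A minimal sketch of the new state flow, written as pure functions for illustration only (the real handlers are the useCallback hooks in src/App.tsx below):

// Illustration only, not part of the commit: the upload -> loading -> analyzing
// flow that App.tsx implements with useState/useCallback in the diff below.
type AppState = "upload" | "loading" | "analyzing";

interface FlowState {
  appState: AppState;
  uploadedImages: File[];
}

// Pure equivalents of the three App.tsx callbacks.
const onImagesUploaded = (_s: FlowState, files: File[]): FlowState =>
  ({ appState: "loading", uploadedImages: files }); // ImageUpload -> LoadingScreen

const onLoadingComplete = (s: FlowState): FlowState =>
  ({ ...s, appState: "analyzing" }); // model ready -> ImageAnalysisView

const onBackToUpload = (_s: FlowState): FlowState =>
  ({ appState: "upload", uploadedImages: [] }); // reset and start over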
.gitignore ADDED
@@ -0,0 +1 @@
+node_modules/
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
src/App.tsx CHANGED
@@ -1,109 +1,48 @@
-import { useState, useEffect, useRef, useCallback, useMemo } from "react";
+import { useState, useCallback } from "react";
 import LoadingScreen from "./components/LoadingScreen";
-import CaptioningView from "./components/CaptioningView";
-import WelcomeScreen from "./components/WelcomeScreen";
-import WebcamPermissionDialog from "./components/WebcamPermissionDialog";
+import ImageUpload from "./components/ImageUpload";
+import ImageAnalysisView from "./components/ImageAnalysisView";
 import type { AppState } from "./types";
 
 export default function App() {
-  const [appState, setAppState] = useState<AppState>("requesting-permission");
-  const [webcamStream, setWebcamStream] = useState<MediaStream | null>(null);
-  const [isVideoReady, setIsVideoReady] = useState(false);
-  const videoRef = useRef<HTMLVideoElement | null>(null);
+  const [appState, setAppState] = useState<AppState>("upload");
+  const [uploadedImages, setUploadedImages] = useState<File[]>([]);
 
-  const handlePermissionGranted = useCallback((stream: MediaStream) => {
-    setWebcamStream(stream);
-    setAppState("welcome");
-  }, []);
-
-  const handleStart = useCallback(() => {
+  const handleImagesUploaded = useCallback((files: File[]) => {
+    setUploadedImages(files);
     setAppState("loading");
   }, []);
 
   const handleLoadingComplete = useCallback(() => {
-    setAppState("captioning");
+    setAppState("analyzing");
   }, []);
 
-  const playVideo = useCallback(async (video: HTMLVideoElement) => {
-    try {
-      await video.play();
-    } catch (error) {
-      console.error("Failed to play video:", error);
-    }
+  const handleBackToUpload = useCallback(() => {
+    setUploadedImages([]);
+    setAppState("upload");
   }, []);
 
-  const setupVideo = useCallback(
-    (video: HTMLVideoElement, stream: MediaStream) => {
-      video.srcObject = stream;
-
-      const handleCanPlay = () => {
-        setIsVideoReady(true);
-        playVideo(video);
-      };
-
-      video.addEventListener("canplay", handleCanPlay, { once: true });
-
-      return () => {
-        video.removeEventListener("canplay", handleCanPlay);
-      };
-    },
-    [playVideo],
-  );
-
-  useEffect(() => {
-    if (webcamStream && videoRef.current) {
-      const video = videoRef.current;
-
-      video.srcObject = null;
-      video.load();
-
-      const cleanup = setupVideo(video, webcamStream);
-      return cleanup;
-    }
-  }, [webcamStream, setupVideo]);
-
-  const videoBlurState = useMemo(() => {
-    switch (appState) {
-      case "requesting-permission":
-        return "blur(20px) brightness(0.2) saturate(0.5)";
-      case "welcome":
-        return "blur(12px) brightness(0.3) saturate(0.7)";
-      case "loading":
-        return "blur(8px) brightness(0.4) saturate(0.8)";
-      case "captioning":
-        return "none";
-      default:
-        return "blur(20px) brightness(0.2) saturate(0.5)";
-    }
-  }, [appState]);
-
   return (
     <div className="App relative h-screen overflow-hidden">
-      <div className="absolute inset-0 bg-gray-900" />
+      <div className="absolute inset-0 bg-gradient-to-br from-gray-900 via-blue-900/20 to-purple-900/20" />
 
-      {webcamStream && (
-        <video
-          ref={videoRef}
-          autoPlay
-          muted
-          playsInline
-          className="absolute inset-0 w-full h-full object-cover transition-all duration-1000 ease-out"
-          style={{
-            filter: videoBlurState,
-            opacity: isVideoReady ? 1 : 0,
-          }}
+      {appState !== "analyzing" && <div className="absolute inset-0 bg-gray-900/80 backdrop-blur-sm" />}
+
+      {appState === "upload" && (
+        <ImageUpload
+          onImagesUploaded={handleImagesUploaded}
+          isAnalyzing={false}
         />
       )}
 
-      {appState !== "captioning" && <div className="absolute inset-0 bg-gray-900/80 backdrop-blur-sm" />}
-
-      {appState === "requesting-permission" && <WebcamPermissionDialog onPermissionGranted={handlePermissionGranted} />}
-
-      {appState === "welcome" && <WelcomeScreen onStart={handleStart} />}
-
       {appState === "loading" && <LoadingScreen onComplete={handleLoadingComplete} />}
 
-      {appState === "captioning" && <CaptioningView videoRef={videoRef} />}
+      {appState === "analyzing" && (
+        <ImageAnalysisView
+          images={uploadedImages}
+          onBackToUpload={handleBackToUpload}
+        />
+      )}
     </div>
   );
 }
src/components/ImageAnalysisView.tsx ADDED
@@ -0,0 +1,207 @@
+import { useState, useRef, useEffect, useCallback } from "react";
+import DraggableContainer from "./DraggableContainer";
+import PromptInput from "./PromptInput";
+import GlassButton from "./GlassButton";
+import GlassContainer from "./GlassContainer";
+import { useVLMContext } from "../context/useVLMContext";
+import { PROMPTS, GLASS_EFFECTS } from "../constants";
+import type { ImageAnalysisResult } from "../types";
+
+interface ImageAnalysisViewProps {
+  images: File[];
+  onBackToUpload: () => void;
+}
+
+export default function ImageAnalysisView({ images, onBackToUpload }: ImageAnalysisViewProps) {
+  const [results, setResults] = useState<ImageAnalysisResult[]>([]);
+  const [currentPrompt, setCurrentPrompt] = useState<string>(PROMPTS.default);
+  const [isAnalyzing, setIsAnalyzing] = useState<boolean>(false);
+  const [currentImageIndex, setCurrentImageIndex] = useState<number>(0);
+  const [selectedImageUrl, setSelectedImageUrl] = useState<string>("");
+
+  const { isLoaded, runInference } = useVLMContext();
+  const abortControllerRef = useRef<AbortController | null>(null);
+
+  // Create preview URL for selected image
+  useEffect(() => {
+    if (images[currentImageIndex]) {
+      const url = URL.createObjectURL(images[currentImageIndex]);
+      setSelectedImageUrl(url);
+      return () => URL.revokeObjectURL(url);
+    }
+  }, [images, currentImageIndex]);
+
+  const analyzeAllImages = useCallback(async () => {
+    if (!isLoaded || isAnalyzing) return;
+
+    setIsAnalyzing(true);
+    setResults([]);
+
+    abortControllerRef.current?.abort();
+    abortControllerRef.current = new AbortController();
+
+    const analysisResults: ImageAnalysisResult[] = [];
+
+    try {
+      for (let i = 0; i < images.length; i++) {
+        if (abortControllerRef.current.signal.aborted) break;
+
+        setCurrentImageIndex(i);
+        const file = images[i];
+
+        try {
+          const caption = await runInference(file, currentPrompt);
+          analysisResults.push({ file, caption });
+        } catch (error) {
+          const errorMsg = error instanceof Error ? error.message : String(error);
+          analysisResults.push({ file, caption: "", error: errorMsg });
+        }
+
+        setResults([...analysisResults]);
+      }
+    } catch (error) {
+      console.error("Analysis interrupted:", error);
+    } finally {
+      setIsAnalyzing(false);
+    }
+  }, [images, currentPrompt, isLoaded, runInference, isAnalyzing]);
+
+  const handlePromptChange = useCallback((prompt: string) => {
+    setCurrentPrompt(prompt);
+  }, []);
+
+  const handleImageSelect = useCallback((index: number) => {
+    setCurrentImageIndex(index);
+  }, []);
+
+  const stopAnalysis = useCallback(() => {
+    abortControllerRef.current?.abort();
+    setIsAnalyzing(false);
+  }, []);
+
+  useEffect(() => {
+    return () => {
+      abortControllerRef.current?.abort();
+    };
+  }, []);
+
+  return (
+    <div className="absolute inset-0 text-white">
+      {/* Main image display */}
+      <div className="relative w-full h-full flex">
+        {/* Image preview */}
+        <div className="flex-1 flex items-center justify-center p-8">
+          {selectedImageUrl && (
+            <img
+              src={selectedImageUrl}
+              alt={`Preview of ${images[currentImageIndex]?.name}`}
+              className="max-w-full max-h-full object-contain rounded-lg shadow-2xl"
+            />
+          )}
+        </div>
+
+        {/* Sidebar with image thumbnails and results */}
+        <div className="w-80 bg-black/20 backdrop-blur-sm border-l border-white/20 overflow-y-auto">
+          {/* Controls */}
+          <div className="p-4 border-b border-white/20">
+            <div className="flex gap-2 mb-4">
+              <GlassButton onClick={onBackToUpload} className="flex-1">
+                Back to Upload
+              </GlassButton>
+              {!isAnalyzing ? (
+                <GlassButton
+                  onClick={analyzeAllImages}
+                  disabled={!isLoaded}
+                  className="flex-1"
+                >
+                  Analyze All
+                </GlassButton>
+              ) : (
+                <GlassButton onClick={stopAnalysis} className="flex-1 bg-red-500/20">
+                  Stop
+                </GlassButton>
+              )}
+            </div>
+
+            {isAnalyzing && (
+              <div className="text-sm text-white/70 text-center">
+                Analyzing image {currentImageIndex + 1} of {images.length}...
+              </div>
+            )}
+          </div>
+
+          {/* Image list with results */}
+          <div className="p-4 space-y-4">
+            {images.map((file, index) => {
+              const result = results.find(r => r.file === file);
+              const isSelected = index === currentImageIndex;
+              const isProcessing = isAnalyzing && index === currentImageIndex;
+
+              return (
+                <div
+                  key={`${file.name}-${index}`}
+                  className={`cursor-pointer transition-all duration-200 ${
+                    isSelected ? 'ring-2 ring-blue-400' : ''
+                  }`}
+                  onClick={() => handleImageSelect(index)}
+                >
+                  <GlassContainer
+                    bgColor={isSelected ? GLASS_EFFECTS.COLORS.BUTTON_BG : GLASS_EFFECTS.COLORS.DEFAULT_BG}
+                    className="p-3 rounded-lg"
+                  >
+                    <div className="flex items-start gap-3">
+                      {/* Thumbnail */}
+                      <div className="w-16 h-16 bg-gray-700 rounded flex items-center justify-center text-xs flex-shrink-0">
+                        <img
+                          src={URL.createObjectURL(file)}
+                          alt={file.name}
+                          className="w-full h-full object-cover rounded"
+                          onLoad={(e) => URL.revokeObjectURL((e.target as HTMLImageElement).src)}
+                        />
+                      </div>
+
+                      {/* Content */}
+                      <div className="flex-1 min-w-0">
+                        <div className="text-sm font-medium truncate mb-1">
+                          {file.name}
+                        </div>
+
+                        {isProcessing && (
+                          <div className="text-xs text-blue-400">
+                            Processing...
+                          </div>
+                        )}
+
+                        {result && (
+                          <div className="text-xs">
+                            {result.error ? (
+                              <div className="text-red-400">
+                                Error: {result.error}
+                              </div>
+                            ) : (
+                              <div className="text-white/80">
+                                {result.caption}
+                              </div>
+                            )}
+                          </div>
+                        )}
+                      </div>
+                    </div>
+                  </GlassContainer>
+                </div>
+              );
+            })}
+          </div>
+        </div>
+      </div>
+
+      {/* Draggable Prompt Input - Bottom Left */}
+      <DraggableContainer initialPosition="bottom-left">
+        <PromptInput
+          onPromptChange={handlePromptChange}
+          disabled={isAnalyzing}
+        />
+      </DraggableContainer>
+    </div>
+  );
+}
src/components/ImageUpload.tsx ADDED
@@ -0,0 +1,129 @@
+import { useState, useCallback, useRef } from "react";
+import GlassButton from "./GlassButton";
+import GlassContainer from "./GlassContainer";
+import { GLASS_EFFECTS } from "../constants";
+
+interface ImageUploadProps {
+  onImagesUploaded: (files: File[]) => void;
+  isAnalyzing: boolean;
+}
+
+export default function ImageUpload({ onImagesUploaded, isAnalyzing }: ImageUploadProps) {
+  const [dragActive, setDragActive] = useState(false);
+  const fileInputRef = useRef<HTMLInputElement>(null);
+
+  const handleFiles = useCallback(
+    (files: FileList | null) => {
+      if (!files) return;
+
+      const imageFiles = Array.from(files).filter(file =>
+        file.type.startsWith("image/")
+      );
+
+      if (imageFiles.length > 0) {
+        onImagesUploaded(imageFiles);
+      }
+    },
+    [onImagesUploaded]
+  );
+
+  const handleDrag = useCallback((e: React.DragEvent) => {
+    e.preventDefault();
+    e.stopPropagation();
+  }, []);
+
+  const handleDragIn = useCallback((e: React.DragEvent) => {
+    e.preventDefault();
+    e.stopPropagation();
+    if (e.dataTransfer?.items && e.dataTransfer.items.length > 0) {
+      setDragActive(true);
+    }
+  }, []);
+
+  const handleDragOut = useCallback((e: React.DragEvent) => {
+    e.preventDefault();
+    e.stopPropagation();
+    setDragActive(false);
+  }, []);
+
+  const handleDrop = useCallback(
+    (e: React.DragEvent) => {
+      e.preventDefault();
+      e.stopPropagation();
+      setDragActive(false);
+
+      if (e.dataTransfer?.files && e.dataTransfer.files.length > 0) {
+        handleFiles(e.dataTransfer.files);
+      }
+    },
+    [handleFiles]
+  );
+
+  const handleFileInputChange = useCallback(
+    (e: React.ChangeEvent<HTMLInputElement>) => {
+      handleFiles(e.target.files);
+    },
+    [handleFiles]
+  );
+
+  const handleClick = useCallback(() => {
+    if (!isAnalyzing) {
+      fileInputRef.current?.click();
+    }
+  }, [isAnalyzing]);
+
+  return (
+    <div className="absolute inset-0 flex items-center justify-center">
+      <GlassContainer
+        bgColor={dragActive ? GLASS_EFFECTS.COLORS.BUTTON_BG : GLASS_EFFECTS.COLORS.DEFAULT_BG}
+        className={`p-8 rounded-2xl border-2 border-dashed transition-all duration-300 cursor-pointer max-w-md mx-4 ${
+          dragActive ? "border-blue-400 scale-105" : "border-white/30"
+        } ${isAnalyzing ? "opacity-50 pointer-events-none" : "hover:border-white/50"}`}
+        onDragEnter={handleDragIn}
+        onDragLeave={handleDragOut}
+        onDragOver={handleDrag}
+        onDrop={handleDrop}
+        onClick={handleClick}
+      >
+        <div className="text-center text-white">
+          <div className="mb-4">
+            <svg
+              className="mx-auto w-16 h-16 text-white/60"
+              fill="none"
+              stroke="currentColor"
+              viewBox="0 0 24 24"
+            >
+              <path
+                strokeLinecap="round"
+                strokeLinejoin="round"
+                strokeWidth={1.5}
+                d="M4 16l4.586-4.586a2 2 0 012.828 0L16 16m-2-2l1.586-1.586a2 2 0 012.828 0L20 14m-6-6h.01M6 20h12a2 2 0 002-2V6a2 2 0 00-2-2H6a2 2 0 00-2 2v12a2 2 0 002 2z"
+              />
+            </svg>
+          </div>
+
+          <h3 className="text-xl font-semibold mb-2">Upload Images</h3>
+          <p className="text-white/80 mb-4">
+            Drag and drop images here, or click to select files
+          </p>
+          <p className="text-sm text-white/60 mb-6">
+            Supports JPG, PNG, GIF, WebP formats. Multiple files allowed.
+          </p>
+
+          <GlassButton disabled={isAnalyzing}>
+            {isAnalyzing ? "Analyzing..." : "Choose Files"}
+          </GlassButton>
+        </div>
+
+        <input
+          ref={fileInputRef}
+          type="file"
+          multiple
+          accept="image/*"
+          onChange={handleFileInputChange}
+          className="hidden"
+        />
+      </GlassContainer>
+    </div>
+  );
+}
src/components/PromptInput.tsx CHANGED
@@ -5,9 +5,10 @@ import GlassContainer from "./GlassContainer";
 interface PromptInputProps {
   onPromptChange: (prompt: string) => void;
   defaultPrompt?: string;
+  disabled?: boolean;
 }
 
-export default function PromptInput({ onPromptChange, defaultPrompt = PROMPTS.default }: PromptInputProps) {
+export default function PromptInput({ onPromptChange, defaultPrompt = PROMPTS.default, disabled = false }: PromptInputProps) {
   const [prompt, setPrompt] = useState(defaultPrompt);
   const [showSuggestions, setShowSuggestions] = useState(false);
   const inputRef = useRef<HTMLTextAreaElement>(null);
@@ -116,10 +117,13 @@ export default function PromptInput({ onPromptChange, defaultPrompt = PROMPTS.de
           ref={inputRef}
           value={prompt}
           onChange={handleInputChange}
-          onFocus={handleInputFocus}
-          onBlur={handleInputBlur}
-          onClick={handleInputClick}
-          className="search-input w-full py-3 pl-4 pr-8 rounded-xl text-white text-base transition-all duration-400 border resize-none focus:outline-none focus:-translate-y-0.5 focus:shadow-lg"
+          onFocus={disabled ? undefined : handleInputFocus}
+          onBlur={disabled ? undefined : handleInputBlur}
+          onClick={disabled ? undefined : handleInputClick}
+          disabled={disabled}
+          className={`search-input w-full py-3 pl-4 pr-8 rounded-xl text-white text-base transition-all duration-400 border resize-none focus:outline-none focus:-translate-y-0.5 focus:shadow-lg ${
+            disabled ? 'opacity-50 cursor-not-allowed' : ''
+          }`}
           style={{
             background: "var(--input-bg)",
             borderColor: "var(--input-border)",
@@ -132,7 +136,7 @@ export default function PromptInput({ onPromptChange, defaultPrompt = PROMPTS.de
           placeholder={PROMPTS.placeholder}
           rows={1}
         />
-        {prompt && (
+        {prompt && !disabled && (
          <button
            type="button"
            onClick={clearInput}
src/context/VLMContext.tsx CHANGED
@@ -67,9 +67,9 @@ export const VLMProvider: React.FC<React.PropsWithChildren> = ({ children }) =>
   );
 
   const runInference = useCallback(
-    async (video: HTMLVideoElement, instruction: string, onTextUpdate?: (text: string) => void): Promise<string> => {
+    async (imageSource: HTMLVideoElement | File, instruction: string, onTextUpdate?: (text: string) => void): Promise<string> => {
       if (inferenceLock.current) {
-        console.log("Inference already running, skipping frame");
+        console.log("Inference already running, skipping");
        return ""; // Return empty string to signal a skip
       }
       inferenceLock.current = true;
@@ -78,21 +78,35 @@ export const VLMProvider: React.FC<React.PropsWithChildren> = ({ children }) =>
        throw new Error("Model/processor not loaded");
       }
 
-      if (!canvasRef.current) {
-        canvasRef.current = document.createElement("canvas");
-      }
-      const canvas = canvasRef.current;
+      let rawImg: RawImage;
 
-      canvas.width = video.videoWidth;
-      canvas.height = video.videoHeight;
+      if (imageSource instanceof File) {
+        // Handle uploaded image file
+        const url = URL.createObjectURL(imageSource);
+        try {
+          rawImg = await RawImage.fromURL(url);
+        } finally {
+          URL.revokeObjectURL(url);
+        }
+      } else {
+        // Handle video frame (original logic)
+        if (!canvasRef.current) {
+          canvasRef.current = document.createElement("canvas");
+        }
+        const canvas = canvasRef.current;
+        const video = imageSource;
+
+        canvas.width = video.videoWidth;
+        canvas.height = video.videoHeight;
 
-      const ctx = canvas.getContext("2d", { willReadFrequently: true });
-      if (!ctx) throw new Error("Could not get canvas context");
+        const ctx = canvas.getContext("2d", { willReadFrequently: true });
+        if (!ctx) throw new Error("Could not get canvas context");
 
-      ctx.drawImage(video, 0, 0);
+        ctx.drawImage(video, 0, 0);
 
-      const frame = ctx.getImageData(0, 0, canvas.width, canvas.height);
-      const rawImg = new RawImage(frame.data, frame.width, frame.height, 4);
+        const frame = ctx.getImageData(0, 0, canvas.width, canvas.height);
+        rawImg = new RawImage(frame.data, frame.width, frame.height, 4);
+      }
       const messages = [
         {
           role: "system",
src/types/index.ts CHANGED
@@ -1,4 +1,4 @@
-export type AppState = "requesting-permission" | "welcome" | "loading" | "captioning";
+export type AppState = "upload" | "loading" | "analyzing";
 
 export interface GlassEffectProps {
   baseFrequency?: number;
@@ -25,3 +25,9 @@ export interface Dimensions {
 }
 
 export type InitialPosition = "bottom-left" | "bottom-right" | Position;
+
+export interface ImageAnalysisResult {
+  file: File;
+  caption: string;
+  error?: string;
+}
src/types/vlm.ts CHANGED
@@ -4,7 +4,7 @@ export type VLMContextValue = {
   error: string | null;
   loadModel: (onProgress?: (msg: string) => void) => Promise<void>;
   runInference: (
-    video: HTMLVideoElement,
+    imageSource: HTMLVideoElement | File,
     instruction: string,
     onTextUpdate?: (text: string) => void,
   ) => Promise<string>;