profplate committed on
Commit
7bb5bc1
·
verified ·
1 Parent(s): f9a128a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +370 -0
app.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+ import time
3
+
4
+ import gradio as gr
5
+ import torch
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
7
+
8
+
9
# Dropdown label -> Hugging Face Hub model id. Insertion order is the order
# in which the labels are offered in the "Model" dropdowns.
MODEL_OPTIONS = {
    "SmolLM2 360M Instruct (best default)": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "SmolLM2 135M Instruct (fast)": "HuggingFaceTB/SmolLM2-135M-Instruct",
    "distilgpt2 (baseline)": "distilgpt2",
}

# Label preselected in both model dropdowns.
DEFAULT_MODEL = "SmolLM2 360M Instruct (best default)"

# Labels that build_prompt() gives the long instruction-style prompt; any
# label not listed here gets the short completion-style prompt instead.
INSTRUCT_MODEL_LABELS = {
    "SmolLM2 135M Instruct (fast)",
    "SmolLM2 360M Instruct (best default)",
}

# Viewpoint name -> guidance sentence(s) inserted into the instruction prompt.
VIEWPOINT_GUIDES = {
    "close-up": (
        "Focus on nearby detail, texture, facial expression, small objects, "
        "and what is cropped out or hidden by the tight framing."
    ),
    "wide shot": (
        "Focus on layout, background, scale, distance between objects, "
        "and how the whole scene is arranged."
    ),
    "bird's-eye view": (
        "Describe the scene from above. "
        "Focus on map-like layout, paths, shapes, and what becomes visible "
        "only from overhead."
    ),
    "low angle": (
        "Describe the scene from below. "
        "Focus on height, scale, foreground, dominance, sky or ceiling, "
        "and what is hidden behind tall objects."
    ),
    "over-the-shoulder": (
        "Describe what is visible from behind one character or object. "
        "Focus on foreground shoulder/frame, partial visibility, "
        "and what the viewer can infer but not fully see."
    ),
}

# Output-mode name -> guidance sentence inserted into the instruction prompt.
MODE_GUIDES = {
    "cinematic shot description": (
        "Write like a film shot description, "
        "emphasizing framing, movement, and what the viewer sees first."
    ),
    "photography caption": (
        "Write like a precise photography caption, "
        "emphasizing composition and visible details."
    ),
    "storyboard note": (
        "Write like a storyboard note for an artist, "
        "naming visual beats and spatial relationships."
    ),
    "image prompt helper": (
        "Write a detailed image-generation prompt "
        "that makes the viewpoint and composition explicit."
    ),
    "visual analysis paragraph": (
        "Write an analytical paragraph explaining "
        "how the viewpoint changes what is visible and what is hidden."
    ),
}

# Viewpoints (and their order) exercised by the five-viewpoint test. Kept as
# an explicit list so editing VIEWPOINT_GUIDES does not silently change it.
FIVE_VIEWPOINTS = [
    "close-up",
    "wide shot",
    "bird's-eye view",
    "low angle",
    "over-the-shoulder",
]


# Best-effort request for two PyTorch CPU threads. Any failure is ignored on
# purpose so the app still starts with PyTorch's own default.
try:
    torch.set_num_threads(2)
except Exception:
    pass
81
+
82
+
83
@lru_cache(maxsize=3)
def load_generator(model_label):
    """Build (and memoize) the text-generation pipeline for a model label.

    Args:
        model_label: A key of ``MODEL_OPTIONS``.

    Returns:
        A transformers ``text-generation`` pipeline created with
        ``device=-1`` and float32 weights.

    Raises:
        KeyError: If ``model_label`` is not a key of ``MODEL_OPTIONS``.
    """
    repo_id = MODEL_OPTIONS[model_label]

    tok = AutoTokenizer.from_pretrained(repo_id)
    # Tokenizers without a pad token reuse their end-of-sequence token.
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.float32)
    lm.eval()

    return pipeline("text-generation", model=lm, tokenizer=tok, device=-1)
97
+
98
+
99
def build_prompt(model_label, scene, viewpoint, output_mode):
    """Assemble the prompt text for one scene / viewpoint / output-mode choice.

    Labels listed in ``INSTRUCT_MODEL_LABELS`` get a multi-line instruction
    prompt that includes the viewpoint and output-mode guidance. Every other
    label gets a short three-line completion prompt ending in
    ``Description:``.

    Args:
        model_label: Model label selected in the UI.
        scene: Scene description; surrounding whitespace is stripped.
        viewpoint: A key of ``VIEWPOINT_GUIDES``.
        output_mode: A key of ``MODE_GUIDES``.

    Returns:
        The prompt string.

    Raises:
        KeyError: If ``viewpoint`` or ``output_mode`` is not a known key.
    """
    scene_text = scene.strip()
    # Both lookups happen before branching, so an unknown viewpoint or mode
    # raises KeyError even on the short-prompt path that never uses them.
    viewpoint_hint = VIEWPOINT_GUIDES[viewpoint]
    mode_hint = MODE_GUIDES[output_mode]

    if model_label in INSTRUCT_MODEL_LABELS:
        prompt_lines = [
            "You are a careful visual scene description assistant for a "
            "student research project.",
            "Describe the same scene from a selected viewpoint. "
            "The important question is not just camera vocabulary; "
            "explain what becomes visible, hidden, larger, smaller, "
            "foregrounded, or backgrounded because of the viewpoint.",
            "",
            f"Viewpoint: {viewpoint}",
            f"Viewpoint guidance: {viewpoint_hint}",
            f"Output mode: {output_mode}",
            f"Output guidance: {mode_hint}",
            f"Scene: {scene_text}",
            "",
            "Write the response now:",
        ]
        return "\n".join(prompt_lines)

    # Short completion-style prompt for models outside INSTRUCT_MODEL_LABELS.
    return f"{viewpoint.title()} {output_mode}.\nScene: {scene_text}\nDescription:"
124
+
125
+
126
def call_model(model_label, final_prompt, temperature, top_p, max_new_tokens):
    """Sample one completion for ``final_prompt`` from the selected model.

    Args:
        model_label: Label passed to ``load_generator``.
        final_prompt: Prompt text to complete.
        temperature: Sampling temperature; values below 0.05 are raised to 0.05.
        top_p: Nucleus-sampling probability mass.
        max_new_tokens: Upper bound on generated tokens (converted to ``int``).

    Returns:
        The stripped generated text without the prompt, or a fixed
        placeholder message when the stripped text is empty.
    """
    generator = load_generator(model_label)
    tokenizer = generator.tokenizer

    generation_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        # Floor the temperature so sampling never receives a value near zero.
        "temperature": max(float(temperature), 0.05),
        "top_p": float(top_p),
        "do_sample": True,
        "repetition_penalty": 1.08,
        "return_full_text": False,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    completions = generator(final_prompt, **generation_kwargs)

    text = completions[0]["generated_text"].strip()
    if not text:
        return "(The model returned an empty response. Try more tokens.)"
    return text
142
+
143
+
144
def generate_viewpoint(
    model_label,
    scene,
    viewpoint,
    output_mode,
    temperature,
    top_p,
    max_new_tokens,
):
    """Handle the "Generate" button: describe one scene from one viewpoint.

    Returns:
        A 3-tuple ``(output, prompt, note)``:

        * blank scene: ``("Please enter a scene.", "", "")``;
        * generation error: the error message, the prompt that was built,
          and a short suggestion;
        * success: the generated text, the prompt, and a note with the
          model id and elapsed seconds.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", "", ""

    final_prompt = build_prompt(model_label, scene, viewpoint, output_mode)

    # Only the model call is timed and guarded; prompt building is not.
    t0 = time.perf_counter()
    try:
        output = call_model(model_label, final_prompt, temperature, top_p, max_new_tokens)
    except Exception as exc:
        error_text = f"Error while running the model: {exc}"
        return error_text, final_prompt, "Try the fast model first, or reduce max tokens."
    elapsed = time.perf_counter() - t0

    note_lines = [
        f"Model: {MODEL_OPTIONS[model_label]}",
        f"Elapsed: {elapsed:.1f} seconds",
        "First use can be slower because the model has to download and load.",
    ]
    return output, final_prompt, "\n".join(note_lines)
180
+
181
+
182
def make_paper_notes(scene, outputs_text):
    """Return the fixed findings checklist, headed by the scene under test.

    Args:
        scene: Scene description; when empty, ``None`` or whitespace-only the
            heading falls back to "the tested scene".
        outputs_text: Accepted for the callers' convenience; it is not read.

    Returns:
        A multi-line string: heading, six numbered checks, and a
        fill-in-the-blank sentence for the paper.
    """
    trimmed = scene.strip() if scene else ""
    scene_line = trimmed or "the tested scene"

    checklist = [
        f"Paper notes for: {scene_line}",
        "",
        "Use these checks while reading the outputs:",
        "",
        "1. Visibility: Which objects become visible or hidden in each viewpoint?",
        "2. Occlusion: Does the model notice when one object blocks another?",
        "3. Scale: Does low angle or close-up change perceived size or importance?",
        "4. Layout: Does bird's-eye or wide shot explain spatial relationships?",
        "5. Specificity: Does the model describe this scene, "
        "or could the paragraph fit almost any scene?",
        "6. Finding sentence: Write one cautious sentence about whether "
        "the model understands viewpoint consequences "
        "or only uses camera-angle words.",
        "",
        "Useful wording for the paper:",
        "In this small test, the model was strongest when ____. "
        "It was weakest when ____. The clearest limitation was ____.",
    ]
    return "\n".join(checklist)
199
+
200
+
201
def run_five_viewpoints(model_label, scene, output_mode, temperature, top_p, max_new_tokens):
    """Handle the "Run Five Viewpoints" button.

    Generates one description per entry of ``FIVE_VIEWPOINTS`` for the same
    scene and joins them into a single Markdown document.

    The run stops at the first viewpoint that fails, but sections that were
    already generated are kept: each one costs a full model call on CPU, so
    throwing them away because a later call failed would waste the whole run.
    The failure is reported under the heading of the viewpoint that raised.

    Args:
        model_label: Model label selected in the UI.
        scene: Scene shared by all viewpoints.
        output_mode: A key of ``MODE_GUIDES``.
        temperature: Sampling temperature forwarded to ``call_model``.
        top_p: Nucleus-sampling value forwarded to ``call_model``.
        max_new_tokens: Token budget forwarded to ``call_model``.

    Returns:
        A 2-tuple ``(markdown, notes)``:

        * blank scene: ``("Please enter a scene.", "")``;
        * failure before any section completed: the error message and a
          short suggestion (same as before this change);
        * failure after some sections completed: those sections followed by
          an error section for the failing viewpoint, and the suggestion;
        * success: all sections, and the paper-notes checklist followed by
          the elapsed time.
    """
    if not scene or not scene.strip():
        return "Please enter a scene.", ""

    advice = "Try the fast model first, or reduce max tokens."
    started = time.perf_counter()
    sections = []

    for viewpoint in FIVE_VIEWPOINTS:
        try:
            final_prompt = build_prompt(model_label, scene, viewpoint, output_mode)
            output = call_model(
                model_label,
                final_prompt,
                temperature,
                top_p,
                max_new_tokens,
            )
        except Exception as exc:
            error_text = f"Error while running the five-viewpoint test: {exc}"
            if not sections:
                # Nothing to salvage: report exactly as a total failure.
                return error_text, advice
            # Keep finished sections and show the error where it happened.
            sections.append(f"## {viewpoint.title()}\n\n{error_text}")
            return "\n\n---\n\n".join(sections), advice
        sections.append(f"## {viewpoint.title()}\n\n{output}")

    elapsed = time.perf_counter() - started
    outputs_text = "\n\n---\n\n".join(sections)
    notes = make_paper_notes(scene, outputs_text) + f"\n\nElapsed: {elapsed:.1f} seconds."
    return outputs_text, notes
228
+
229
+
230
def notes_from_pasted_outputs(scene, pasted_outputs):
    """Handle the "Make Paper Notes" button.

    Args:
        scene: Scene the pasted outputs were generated for.
        pasted_outputs: Text pasted by the user.

    Returns:
        The paper-notes checklist for ``scene``, or a reminder message when
        ``pasted_outputs`` is empty, ``None`` or whitespace-only.
    """
    has_content = bool(pasted_outputs) and bool(pasted_outputs.strip())
    if has_content:
        return make_paper_notes(scene, pasted_outputs)
    return "Paste your generated outputs first."
234
+
235
+
236
# Gradio UI: three tabs wired to the handler functions defined above.
with gr.Blocks(title="Camera Angle Model Lab", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Camera Angle Model Lab\n"
        "CPU-only viewpoint lab for testing how small language models "
        "describe the same scene from different visual perspectives. "
        "No API tokens or paid compute required. "
        "The first run may take a minute while the model loads."
    )

    # Tab 1: one scene, one viewpoint, one generation.
    with gr.Tab("Single Viewpoint Writer"):
        with gr.Row():
            model_one = gr.Dropdown(
                choices=list(MODEL_OPTIONS),
                value=DEFAULT_MODEL,
                label="Model",
            )
            viewpoint_one = gr.Dropdown(
                choices=list(VIEWPOINT_GUIDES),
                value="close-up",
                label="Viewpoint",
            )
            mode_one = gr.Dropdown(
                choices=list(MODE_GUIDES),
                value="visual analysis paragraph",
                label="Output mode",
            )

        scene_one = gr.Textbox(
            label="Scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )

        with gr.Row():
            temperature_one = gr.Slider(
                minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"
            )
            top_p_one = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p"
            )
            max_tokens_one = gr.Slider(
                minimum=40, maximum=170, value=100, step=10, label="Max new tokens"
            )

        run_one = gr.Button("Generate", variant="primary")
        output_one = gr.Textbox(label="Generated output", lines=10)
        prompt_sent_one = gr.Textbox(label="Prompt sent to model", lines=8)
        note_one = gr.Textbox(label="Run note", lines=3)

        # Input order must match generate_viewpoint's parameter order.
        single_inputs = [
            model_one,
            scene_one,
            viewpoint_one,
            mode_one,
            temperature_one,
            top_p_one,
            max_tokens_one,
        ]
        run_one.click(
            fn=generate_viewpoint,
            inputs=single_inputs,
            outputs=[output_one, prompt_sent_one, note_one],
        )

        # Each example row fills scene, viewpoint and output mode.
        gr.Examples(
            examples=[
                [
                    "A dog hides under a kitchen table while a child looks for it.",
                    "close-up",
                    "visual analysis paragraph",
                ],
                [
                    "A crowded city street after rain reflects neon signs in puddles.",
                    "bird's-eye view",
                    "cinematic shot description",
                ],
                [
                    "A soccer player prepares to take a penalty kick while the goalkeeper waits.",
                    "low angle",
                    "storyboard note",
                ],
                [
                    "A person stands at the edge of a forest path holding a lantern.",
                    "over-the-shoulder",
                    "image prompt helper",
                ],
                [
                    "A museum gallery contains one bright painting at the far end of the room.",
                    "wide shot",
                    "photography caption",
                ],
            ],
            inputs=[scene_one, viewpoint_one, mode_one],
        )

    # Tab 2: the same scene through every entry of FIVE_VIEWPOINTS.
    with gr.Tab("Five-Viewpoint Test"):
        model_grid = gr.Dropdown(
            choices=list(MODEL_OPTIONS),
            value=DEFAULT_MODEL,
            label="Model",
        )
        scene_grid = gr.Textbox(
            label="Shared scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        mode_grid = gr.Dropdown(
            choices=list(MODE_GUIDES),
            value="visual analysis paragraph",
            label="Output mode",
        )
        with gr.Row():
            temperature_grid = gr.Slider(
                minimum=0.1, maximum=1.5, value=0.6, step=0.1, label="Temperature"
            )
            top_p_grid = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p"
            )
            max_tokens_grid = gr.Slider(
                minimum=40, maximum=140, value=80, step=10, label="Max new tokens"
            )

        run_grid = gr.Button("Run Five Viewpoints", variant="primary")
        grid_output = gr.Markdown(label="Five-viewpoint output")
        grid_notes = gr.Textbox(label="Paper notes", lines=14)

        # Input order must match run_five_viewpoints' parameter order.
        grid_inputs = [
            model_grid,
            scene_grid,
            mode_grid,
            temperature_grid,
            top_p_grid,
            max_tokens_grid,
        ]
        run_grid.click(
            fn=run_five_viewpoints,
            inputs=grid_inputs,
            outputs=[grid_output, grid_notes],
        )

    # Tab 3: checklist generator for outputs produced elsewhere.
    with gr.Tab("Paper Notes Helper"):
        scene_notes = gr.Textbox(
            label="Scene being tested",
            lines=3,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        pasted_outputs = gr.Textbox(
            label="Paste generated outputs here",
            lines=12,
            placeholder="Paste close-up, wide shot, bird's-eye, low angle, and over-the-shoulder outputs here.",
        )
        run_notes = gr.Button("Make Paper Notes", variant="primary")
        paper_notes = gr.Textbox(label="Checklist for findings section", lines=14)

        run_notes.click(
            fn=notes_from_pasted_outputs,
            inputs=[scene_notes, pasted_outputs],
            outputs=paper_notes,
        )

    # NOTE(review): the original indentation was lost in this copy; this
    # footer is assumed to sit at Blocks level (below the tabs) - confirm.
    gr.Markdown(
        "### Duplication note\n"
        "This Space uses only local CPU models. "
        "No tokens, API keys, or paid hardware are required. "
        "Students can duplicate it and edit the viewpoints, output modes, "
        "examples, or model list."
    )


if __name__ == "__main__":
    demo.launch()