Spaces:

profplate
/

camera-angle-model-lab

Running

App Files Files Community

profplate committed 12 days ago

Commit

7bb5bc1

verified ·

1 Parent(s): f9a128a

Create app.py

Browse files

Files changed (1) hide show

app.py +370 -0

app.py ADDED Viewed

	@@ -0,0 +1,370 @@

+from functools import lru_cache
+import time
+import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Dropdown labels, each spelled exactly once so the registry, the default
# selection, and the instruct set below always agree character-for-character.
_LABEL_SMOLLM2_360M = "SmolLM2 360M Instruct (best default)"
_LABEL_SMOLLM2_135M = "SmolLM2 135M Instruct (fast)"
_LABEL_DISTILGPT2 = "distilgpt2 (baseline)"

# Dropdown label -> model id handed to ``from_pretrained`` in load_generator.
MODEL_OPTIONS = {
    _LABEL_SMOLLM2_360M: "HuggingFaceTB/SmolLM2-360M-Instruct",
    _LABEL_SMOLLM2_135M: "HuggingFaceTB/SmolLM2-135M-Instruct",
    _LABEL_DISTILGPT2: "distilgpt2",
}

# Label pre-selected in both model dropdowns.
DEFAULT_MODEL = _LABEL_SMOLLM2_360M

# Labels that receive the long instruction-style prompt from build_prompt;
# any other label gets the short completion-style prompt.
INSTRUCT_MODEL_LABELS = {
    _LABEL_SMOLLM2_360M,
    _LABEL_SMOLLM2_135M,
}
# Viewpoint name -> guidance sentence inserted into the instruct prompt.
# The key order is also the order shown in the viewpoint dropdown.
VIEWPOINT_GUIDES = {
    "close-up": (
        "Focus on nearby detail, texture, facial expression, small objects, "
        "and what is cropped out or hidden by the tight framing."
    ),
    "wide shot": (
        "Focus on layout, background, scale, distance between objects, "
        "and how the whole scene is arranged."
    ),
    "bird's-eye view": (
        "Describe the scene from above. "
        "Focus on map-like layout, paths, shapes, and what becomes visible only "
        "from overhead."
    ),
    "low angle": (
        "Describe the scene from below. "
        "Focus on height, scale, foreground, dominance, sky or ceiling, and "
        "what is hidden behind tall objects."
    ),
    "over-the-shoulder": (
        "Describe what is visible from behind one character or object. "
        "Focus on foreground shoulder/frame, partial visibility, and what the "
        "viewer can infer but not fully see."
    ),
}

# Output-mode name -> guidance sentence inserted into the instruct prompt.
MODE_GUIDES = {
    "cinematic shot description": (
        "Write like a film shot description, emphasizing framing, movement, "
        "and what the viewer sees first."
    ),
    "photography caption": (
        "Write like a precise photography caption, emphasizing composition "
        "and visible details."
    ),
    "storyboard note": (
        "Write like a storyboard note for an artist, naming visual beats "
        "and spatial relationships."
    ),
    "image prompt helper": (
        "Write a detailed image-generation prompt that makes the viewpoint "
        "and composition explicit."
    ),
    "visual analysis paragraph": (
        "Write an analytical paragraph explaining how the viewpoint "
        "changes what is visible and what is hidden."
    ),
}

# Viewpoints run, in order, by the five-viewpoint test. Taken from the guide
# table's keys (dicts keep insertion order), which are exactly these five.
FIVE_VIEWPOINTS = list(VIEWPOINT_GUIDES)
# Cap torch at two threads. This stays best-effort (a failure must not stop
# the app from starting), but the reason is printed instead of being
# silently discarded so a missing limit can be diagnosed.
try:
    torch.set_num_threads(2)
except Exception as exc:
    print(f"Could not limit torch to 2 threads: {exc}")
@lru_cache(maxsize=3)
def load_generator(model_label):
    """Return a text-generation pipeline for a dropdown label, memoized.

    Args:
        model_label: A key of ``MODEL_OPTIONS``.

    Returns:
        The ``pipeline("text-generation", ...)`` object built from the
        label's tokenizer and float32 causal LM, created with ``device=-1``.
        Repeated calls with the same label return the cached object.

    Raises:
        KeyError: If ``model_label`` is not in ``MODEL_OPTIONS``.
    """
    hub_id = MODEL_OPTIONS[model_label]

    tok = AutoTokenizer.from_pretrained(hub_id)
    # Reuse EOS as the padding token when the tokenizer defines none.
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(hub_id, torch_dtype=torch.float32)
    lm.eval()

    text_pipeline = pipeline("text-generation", model=lm, tokenizer=tok, device=-1)
    return text_pipeline
def build_prompt(model_label, scene, viewpoint, output_mode):
    """Compose the text prompt for one scene / viewpoint / output-mode combination.

    Labels in ``INSTRUCT_MODEL_LABELS`` get a long instruction prompt that
    includes both guidance sentences; every other label gets a short
    completion-style prompt ending in ``Description:``.

    Args:
        model_label: Dropdown label of the model the prompt is meant for.
        scene: Scene text; surrounding whitespace is stripped.
        viewpoint: A key of ``VIEWPOINT_GUIDES``.
        output_mode: A key of ``MODE_GUIDES``.

    Returns:
        The prompt string.

    Raises:
        KeyError: If ``viewpoint`` or ``output_mode`` is unknown.
    """
    scene_text = scene.strip()
    # Both lookups happen before branching, so an unknown viewpoint or mode
    # raises KeyError even on the short-prompt path that does not use them.
    view_hint = VIEWPOINT_GUIDES[viewpoint]
    mode_hint = MODE_GUIDES[output_mode]

    if model_label in INSTRUCT_MODEL_LABELS:
        preamble = (
            "You are a careful visual scene description assistant for a student "
            "research project.\n"
            "Describe the same scene from a selected viewpoint. The important question "
            "is not just camera vocabulary; explain what becomes visible, hidden, "
            "larger, smaller, foregrounded, or backgrounded because of the viewpoint."
        )
        fields = "\n".join(
            [
                f"Viewpoint: {viewpoint}",
                f"Viewpoint guidance: {view_hint}",
                f"Output mode: {output_mode}",
                f"Output guidance: {mode_hint}",
                f"Scene: {scene_text}",
            ]
        )
        return f"{preamble}\n\n{fields}\n\nWrite the response now:"

    heading = f"{viewpoint.title()} {output_mode}."
    return f"{heading}\nScene: {scene_text}\nDescription:"
def call_model(model_label, final_prompt, temperature, top_p, max_new_tokens):
    """Sample one completion for ``final_prompt`` from the selected model.

    Args:
        model_label: Dropdown label passed to ``load_generator``.
        final_prompt: Prompt text sent to the pipeline.
        temperature: Sampling temperature; values below 0.05 are raised to 0.05.
        top_p: Nucleus-sampling threshold, converted with ``float``.
        max_new_tokens: Generation length cap, converted with ``int``.

    Returns:
        The stripped generated text (prompt excluded), or a fixed placeholder
        message when the stripped text is empty.
    """
    pipe = load_generator(model_label)
    eos_id = pipe.tokenizer.eos_token_id

    sampling = {
        "max_new_tokens": int(max_new_tokens),
        # Floor the temperature so sampling never runs at or near zero.
        "temperature": max(float(temperature), 0.05),
        "top_p": float(top_p),
        "do_sample": True,
        "repetition_penalty": 1.08,
        "return_full_text": False,
        # EOS doubles as the padding id.
        "pad_token_id": eos_id,
        "eos_token_id": eos_id,
    }
    generations = pipe(final_prompt, **sampling)

    completion = generations[0]["generated_text"].strip()
    if not completion:
        return "(The model returned an empty response. Try more tokens.)"
    return completion
def generate_viewpoint(
    model_label,
    scene,
    viewpoint,
    output_mode,
    temperature,
    top_p,
    max_new_tokens,
):
    """Handle the "Generate" button: build a prompt, run the model, report timing.

    Returns:
        A 3-tuple ``(output, prompt, note)`` of strings:
        - blank scene: ``("Please enter a scene.", "", "")``;
        - model failure: the error message, the prompt that was built, and a
          retry hint;
        - success: the generated text, the prompt, and a note naming the
          model id and the elapsed seconds.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", "", ""

    prompt_text = build_prompt(model_label, scene, viewpoint, output_mode)

    t0 = time.perf_counter()
    try:
        generated = call_model(
            model_label, prompt_text, temperature, top_p, max_new_tokens
        )
    except Exception as exc:
        # Report the failure in the output box instead of raising.
        failure = f"Error while running the model: {exc}"
        return failure, prompt_text, "Try the fast model first, or reduce max tokens."
    seconds = time.perf_counter() - t0

    run_note = "\n".join(
        (
            f"Model: {MODEL_OPTIONS[model_label]}",
            f"Elapsed: {seconds:.1f} seconds",
            "First use can be slower because the model has to download and load.",
        )
    )
    return generated, prompt_text, run_note
def make_paper_notes(scene, outputs_text):
    """Return the fixed findings checklist, headed by the scene under test.

    Args:
        scene: Scene description; when ``None``, empty, or whitespace-only
            the heading falls back to "the tested scene".
        outputs_text: Accepted for the callers' convenience but not read;
            the checklist does not depend on the generated outputs.

    Returns:
        A multi-line string: heading, six numbered checks, and a
        fill-in-the-blank sentence template.
    """
    trimmed = scene.strip() if scene else ""
    scene_line = trimmed or "the tested scene"

    checks = (
        "1. Visibility: Which objects become visible or hidden in each viewpoint?",
        "2. Occlusion: Does the model notice when one object blocks another?",
        "3. Scale: Does low angle or close-up change perceived size or importance?",
        "4. Layout: Does bird's-eye or wide shot explain spatial relationships?",
        (
            "5. Specificity: Does the model describe this scene, or could the "
            "paragraph fit almost any scene?"
        ),
        (
            "6. Finding sentence: Write one cautious sentence about whether the "
            "model understands viewpoint consequences or only uses camera-angle "
            "words."
        ),
    )
    wording = (
        "Useful wording for the paper:\n"
        "In this small test, the model was strongest when ____. "
        "It was weakest when ____. The clearest limitation was ____."
    )

    # Sections are separated by one blank line.
    return "\n\n".join(
        (
            f"Paper notes for: {scene_line}",
            "Use these checks while reading the outputs:",
            "\n".join(checks),
            wording,
        )
    )
def run_five_viewpoints(model_label, scene, output_mode, temperature, top_p, max_new_tokens):
    """Handle "Run Five Viewpoints": generate one section per viewpoint.

    Each viewpoint in ``FIVE_VIEWPOINTS`` is prompted and generated in turn,
    and the results are joined into Markdown sections separated by rules.

    Args:
        model_label: Dropdown label of the model to run.
        scene: Scene text shared by every viewpoint.
        output_mode: A key of ``MODE_GUIDES``.
        temperature, top_p, max_new_tokens: Sampling settings forwarded to
            ``call_model``.

    Returns:
        A 2-tuple ``(outputs_markdown, notes)`` of strings:
        - blank scene: ``("Please enter a scene.", "")``;
        - failure: the sections finished before the failure followed by the
          error message, plus a retry hint (only the error message when no
          section had finished);
        - success: all sections, plus the paper-notes checklist with the
          elapsed time appended.
    """
    if not scene or not scene.strip():
        return "Please enter a scene.", ""

    separator = "\n\n---\n\n"
    started = time.perf_counter()
    sections = []
    try:
        for viewpoint in FIVE_VIEWPOINTS:
            final_prompt = build_prompt(model_label, scene, viewpoint, output_mode)
            output = call_model(
                model_label,
                final_prompt,
                temperature,
                top_p,
                max_new_tokens,
            )
            sections.append(f"## {viewpoint.title()}\n\n{output}")
    except Exception as exc:
        error_text = f"Error while running the five-viewpoint test: {exc}"
        # Keep whatever already finished: each section is a separate model
        # run, so a late failure should not throw the earlier ones away.
        return (
            separator.join([*sections, error_text]),
            "Try the fast model first, or reduce max tokens.",
        )

    elapsed = time.perf_counter() - started
    outputs_text = separator.join(sections)
    notes = make_paper_notes(scene, outputs_text) + f"\n\nElapsed: {elapsed:.1f} seconds."
    return outputs_text, notes
def notes_from_pasted_outputs(scene, pasted_outputs):
    """Handle "Make Paper Notes": return the checklist once outputs are pasted.

    Returns a reminder message when ``pasted_outputs`` is ``None``, empty, or
    whitespace-only; otherwise the result of ``make_paper_notes``.
    """
    has_content = bool(pasted_outputs) and bool(pasted_outputs.strip())
    if has_content:
        return make_paper_notes(scene, pasted_outputs)
    return "Paste your generated outputs first."
# Scene pre-filled in all three tabs so they start from the same example.
_DEFAULT_SCENE = "A dog hides under a kitchen table while a child looks for it."

# One-click example rows: (scene, viewpoint, output mode).
_SINGLE_VIEW_EXAMPLES = [
    [_DEFAULT_SCENE, "close-up", "visual analysis paragraph"],
    [
        "A crowded city street after rain reflects neon signs in puddles.",
        "bird's-eye view",
        "cinematic shot description",
    ],
    [
        "A soccer player prepares to take a penalty kick while the goalkeeper waits.",
        "low angle",
        "storyboard note",
    ],
    [
        "A person stands at the edge of a forest path holding a lantern.",
        "over-the-shoulder",
        "image prompt helper",
    ],
    [
        "A museum gallery contains one bright painting at the far end of the room.",
        "wide shot",
        "photography caption",
    ],
]

_INTRO_MARKDOWN = (
    "# Camera Angle Model Lab\n"
    "CPU-only viewpoint lab for testing how small language models describe "
    "the same scene from different visual perspectives. No API tokens or paid "
    "compute required. The first run may take a minute while the model loads."
)

_DUPLICATION_MARKDOWN = (
    "### Duplication note\n"
    "This Space uses only local CPU models. No tokens, API keys, or paid "
    "hardware are required. Students can duplicate it and edit the viewpoints, "
    "output modes, examples, or model list."
)

# Components are created in display order; do not reorder the statements below.
with gr.Blocks(title="Camera Angle Model Lab", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Tab 1: one scene, one viewpoint, one output mode.
    with gr.Tab("Single Viewpoint Writer"):
        with gr.Row():
            model_one = gr.Dropdown(
                choices=list(MODEL_OPTIONS),
                value=DEFAULT_MODEL,
                label="Model",
            )
            viewpoint_one = gr.Dropdown(
                choices=list(VIEWPOINT_GUIDES),
                value="close-up",
                label="Viewpoint",
            )
            mode_one = gr.Dropdown(
                choices=list(MODE_GUIDES),
                value="visual analysis paragraph",
                label="Output mode",
            )
        scene_one = gr.Textbox(label="Scene", lines=4, value=_DEFAULT_SCENE)
        with gr.Row():
            temperature_one = gr.Slider(
                minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"
            )
            top_p_one = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p"
            )
            max_tokens_one = gr.Slider(
                minimum=40, maximum=170, value=100, step=10, label="Max new tokens"
            )
        run_one = gr.Button("Generate", variant="primary")
        output_one = gr.Textbox(label="Generated output", lines=10)
        prompt_sent_one = gr.Textbox(label="Prompt sent to model", lines=8)
        note_one = gr.Textbox(label="Run note", lines=3)

        # Input order must match generate_viewpoint's parameter order.
        single_inputs = [
            model_one,
            scene_one,
            viewpoint_one,
            mode_one,
            temperature_one,
            top_p_one,
            max_tokens_one,
        ]
        run_one.click(
            fn=generate_viewpoint,
            inputs=single_inputs,
            outputs=[output_one, prompt_sent_one, note_one],
        )
        gr.Examples(
            examples=_SINGLE_VIEW_EXAMPLES,
            inputs=[scene_one, viewpoint_one, mode_one],
        )

    # Tab 2: the same scene through every viewpoint in FIVE_VIEWPOINTS.
    with gr.Tab("Five-Viewpoint Test"):
        model_grid = gr.Dropdown(
            choices=list(MODEL_OPTIONS),
            value=DEFAULT_MODEL,
            label="Model",
        )
        scene_grid = gr.Textbox(label="Shared scene", lines=4, value=_DEFAULT_SCENE)
        mode_grid = gr.Dropdown(
            choices=list(MODE_GUIDES),
            value="visual analysis paragraph",
            label="Output mode",
        )
        with gr.Row():
            temperature_grid = gr.Slider(
                minimum=0.1, maximum=1.5, value=0.6, step=0.1, label="Temperature"
            )
            top_p_grid = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p"
            )
            max_tokens_grid = gr.Slider(
                minimum=40, maximum=140, value=80, step=10, label="Max new tokens"
            )
        run_grid = gr.Button("Run Five Viewpoints", variant="primary")
        grid_output = gr.Markdown(label="Five-viewpoint output")
        grid_notes = gr.Textbox(label="Paper notes", lines=14)

        # Input order must match run_five_viewpoints' parameter order.
        grid_inputs = [
            model_grid,
            scene_grid,
            mode_grid,
            temperature_grid,
            top_p_grid,
            max_tokens_grid,
        ]
        run_grid.click(
            fn=run_five_viewpoints,
            inputs=grid_inputs,
            outputs=[grid_output, grid_notes],
        )

    # Tab 3: checklist generator for outputs pasted in by hand.
    with gr.Tab("Paper Notes Helper"):
        scene_notes = gr.Textbox(
            label="Scene being tested",
            lines=3,
            value=_DEFAULT_SCENE,
        )
        pasted_outputs = gr.Textbox(
            label="Paste generated outputs here",
            lines=12,
            placeholder="Paste close-up, wide shot, bird's-eye, low angle, and over-the-shoulder outputs here.",
        )
        run_notes = gr.Button("Make Paper Notes", variant="primary")
        paper_notes = gr.Textbox(label="Checklist for findings section", lines=14)
        run_notes.click(
            fn=notes_from_pasted_outputs,
            inputs=[scene_notes, pasted_outputs],
            outputs=paper_notes,
        )

    gr.Markdown(_DUPLICATION_MARKDOWN)

# Start the Gradio server only when executed as a script.
if __name__ == "__main__":
    demo.launch()