update models (use gpt and gemini)

app/main.py  CHANGED  (+209 -84)
@@ -15,18 +15,24 @@ from huggingface_hub import HfApi, create_repo, CommitOperationAdd
 from dotenv import load_dotenv
 load_dotenv()
 
-# -------- Gemini
+# -------- Gemini + GPT client setup --------
 from google import genai
 from google.genai import types
 
+try:
+    from openai import OpenAI
+except ImportError:
+    OpenAI = None
+
+# We keep the GEMINI_* env vars for compatibility.
 API_KEY = os.getenv("GEMINI_API_KEY", "")
+MODEL = os.getenv("GEMINI_MODEL", "gemini-2.0-pro")
+SMALL_MODEL = os.getenv("GEMINI_SMALL_MODEL")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or API_KEY
 PORT = int(os.getenv("PORT", "7860"))
 
+gemini_client = genai.Client(api_key=API_KEY) if API_KEY else None
+gpt_client = OpenAI(api_key=OPENAI_API_KEY) if (OPENAI_API_KEY and OpenAI) else None
 
 # -------- FastAPI app --------
 app = FastAPI(title="Manim Render API (error + visual refine)")
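Note: the configuration block above is driven entirely by environment variables. A minimal sketch of a matching setup, expressed in Python for reference; the variable names come from the diff, the values are placeholders:

import os

# Placeholders only; names match the diff above, values are illustrative.
os.environ.setdefault("GEMINI_API_KEY", "<gemini-key>")        # enables gemini_client
os.environ.setdefault("GEMINI_MODEL", "gemini-2.0-pro")        # primary model default
os.environ.setdefault("GEMINI_SMALL_MODEL", "gpt-4o-mini")     # optional storyboard model override
os.environ.setdefault("OPENAI_API_KEY", "<openai-key>")        # enables gpt_client; falls back to GEMINI_API_KEY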
@@ -83,12 +89,114 @@ class RateLimiter:
 limiter = RateLimiter(10)
 storyboard_limiter = RateLimiter(30)
 
+def _to_chat_content_item(item: Any) -> Any:
+    if isinstance(item, str):
+        return {"type": "text", "text": item}
+    if isinstance(item, dict):
+        return item
+    return {"type": "text", "text": str(item)}
+
+
+def _to_response_content_item(item: Any) -> Dict[str, Any]:
+    if isinstance(item, str):
+        return {"type": "input_text", "text": item}
+    if isinstance(item, dict):
+        itype = item.get("type")
+        if itype == "text":
+            return {"type": "input_text", "text": item.get("text", "")}
+        if itype == "image_url":
+            image_url = item.get("image_url", {})
+            if isinstance(image_url, dict):
+                return {"type": "input_image", "image_url": image_url}
+            return {"type": "input_image", "image_url": {"url": str(image_url)}}
+        if itype in {"input_text", "input_image", "input_file"}:
+            return item
+    return {"type": "input_text", "text": str(item)}
+
+
+def _build_openai_content(contents: Any, *, for_chat: bool) -> Any:
+    """
+    Normalize content payloads for chat (strings or multimodal lists) and responses API (typed blocks).
+    """
+    if isinstance(contents, str):
+        return contents if for_chat else [_to_response_content_item(contents)]
+    if isinstance(contents, (list, tuple)):
+        if for_chat:
+            return [_to_chat_content_item(item) for item in contents]
+        return [_to_response_content_item(item) for item in contents]
+    return contents if for_chat else [_to_response_content_item(contents)]
+
+
+def _build_chat_messages(system: str, contents: Any) -> List[Dict[str, Any]]:
+    return [
+        {"role": "system", "content": system},
+        {"role": "user", "content": _build_openai_content(contents, for_chat=True)},
+    ]
+
+
+def _build_responses_input(system: str, contents: Any) -> List[Dict[str, Any]]:
+    return [
+        {"role": "system", "content": _build_openai_content(system, for_chat=False)},
+        {"role": "user", "content": _build_openai_content(contents, for_chat=False)},
+    ]
+
+
+def _extract_chat_content(resp: Any) -> str:
+    content = resp.choices[0].message.content
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        text_parts = []
+        for chunk in content:
+            if isinstance(chunk, dict) and chunk.get("type") == "text":
+                text_parts.append(chunk.get("text", ""))
+            else:
+                text_parts.append(str(chunk))
+        return "\n".join(filter(None, text_parts))
+    return str(content)
+
+
+def _extract_responses_content(resp: Any) -> str:
+    text = getattr(resp, "output_text", None)
+    if text:
+        return text
+    output = getattr(resp, "output", None)
+    if output:
+        chunks = []
+        for item in output:
+            for elem in getattr(item, "content", []) or []:
+                chunk_text = getattr(elem, "text", None) or getattr(elem, "content", None)
+                if chunk_text:
+                    chunks.append(chunk_text)
+        if chunks:
+            return "\n".join(map(str, chunks))
+    return str(resp)
+
+
+def _invoke_gpt_model(model: str, system: str, contents: Any) -> str:
+    if not gpt_client:
+        raise RuntimeError("GPT client is not configured")
+    messages = _build_chat_messages(system, contents)
+    try:
+        resp = gpt_client.chat.completions.create(model=model, messages=messages)
+        return _extract_chat_content(resp)
+    except Exception as err:
+        message = str(err)
+        if "only supported in v1/responses" not in message:
+            raise
+        resp = gpt_client.responses.create(
+            model=model,
+            input=_build_responses_input(system, contents),
+        )
+        return _extract_responses_content(resp)
+
+
 def gemini_call(*, system: str, contents):
     """Wrapper to: enforce RPM and standardize text extraction."""
-    if not
+    if not gemini_client:
         raise RuntimeError("Gemini client is not configured")
     limiter.acquire()
-    resp =
+    resp = gemini_client.models.generate_content(
         model=MODEL,
         config=types.GenerateContentConfig(system_instruction=system),
         contents=contents,
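Note: _invoke_gpt_model tries the Chat Completions API first and falls back to the Responses API only when the error message says the model is responses-only. A hypothetical call showing the payload shapes the helpers above produce (not part of the diff):

# Hypothetical usage; the image item follows the shapes handled by the helpers above.
text = _invoke_gpt_model(
    "gpt-4o-mini",
    system="You are a storyboard assistant.",
    contents=[
        "Critique this frame.",
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    ],
)
# On the chat path, _build_chat_messages yields:
#   [{"role": "system", "content": "You are a storyboard assistant."},
#    {"role": "user", "content": [{"type": "text", ...}, {"type": "image_url", ...}]}]
# On fallback, _build_responses_input converts the same items into
#   {"type": "input_text", ...} / {"type": "input_image", ...} blocks.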
@@ -98,16 +206,9 @@ def gemini_call(*, system: str, contents):
 
 def gemini_small_call(*, system: str, contents: str) -> str:
     """Lightweight wrapper for the storyboard assistant (smaller model)."""
-
-        raise RuntimeError("Gemini client is not configured")
-    target_model = SMALL_MODEL or MODEL
+    target_model = SMALL_MODEL or "gpt-4o-mini"
     storyboard_limiter.acquire()
-
-        model=target_model,
-        config=types.GenerateContentConfig(system_instruction=system),
-        contents=contents,
-    )
-    return getattr(resp, "text", str(resp))
+    return _invoke_gpt_model(target_model, system, contents)
 
 # ---------------- prompts ----------------
 SYSTEM_PROMPT = """You are a Manim CE (0.19.x) code generator/refiner.
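Note: despite its name, gemini_small_call now routes the storyboard model through the GPT client while keeping its own rate limiter. A hypothetical call (the prompt text is illustrative):

# Hypothetical usage of the rewritten wrapper.
reply = gemini_small_call(
    system="You are a storyboard assistant.",
    contents="Tighten scene 2 to under 8 seconds.",
)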
@@ -128,7 +229,8 @@ Forbidden: os, subprocess, sys, requests, pathlib, socket, shutil, psutil, any f
 # Common Manim CE 0.19 API constraints (must follow)
 - Do NOT use `vertex=` with RightAngle(...). Choose the corner by line ordering or set quadrant=(±1, ±1).
 - Do NOT call `.to_center()` (not a valid method). Use `.center()` or `.move_to(ORIGIN)`.
-- Prefer `.move_to()`, `.align_to()`, `.to_edge()`, `.scale()`, `.next_to()` for layout/placement.
+- Prefer `.move_to()`, `.align_to()`, `.to_edge()`, `.scale()`, `.next_to()` for layout/placement, keeping generous spacing (buff ≥ 0.6) so nothing overlaps.
+- Only introduce objects that directly support the user's request. Avoid decorative or redundant elements that clutter the scene.
 """
 
 DEFAULT_SCENE = """from manim import *
@@ -697,14 +799,14 @@ def _run_manim(scene_code: str, run_id: Optional[str] = None, quality: str = "me
     return mp4.read_bytes(), png_path
 
 def _upload_image_to_gemini(png_path: Path):
-    """
-    if not
+    """Prepare an inline data URI that the OpenAI vision API accepts."""
+    if not gemini_client or not png_path or not png_path.exists():
         return None
     limiter.acquire()
     with open(png_path, "rb") as f:
-        file_ref =
-            file=f,
-            config={"mime_type": "image/png"}
+        file_ref = gemini_client.files.upload(
+            file=f,
+            config={"mime_type": "image/png"},
         )
     return file_ref
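Note: the new docstring mentions an OpenAI data URI, but the body still uploads through the Gemini Files API. In the google-genai SDK the returned file reference can be passed directly in a generate_content call; a sketch, not part of the diff (the path and prompt are hypothetical):

from pathlib import Path

# Hypothetical downstream use of the returned file_ref.
file_ref = _upload_image_to_gemini(Path("media/iter_err_1/capture.png"))
if file_ref is not None:
    resp = gemini_client.models.generate_content(
        model=MODEL,
        contents=[file_ref, "Point out overlapping or cramped elements."],
    )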
@@ -715,7 +817,7 @@ def llm_generate_manim_code(
     previous_code: Optional[str] = None,
 ) -> str:
     """First-pass generation (capture-aware)."""
-    if not
+    if not gemini_client:
         return DEFAULT_SCENE
     try:
         contents = f"Create AutoScene for: {prompt}\nRemember the CAPTURE POLICY and Common API constraints."
@@ -733,6 +835,8 @@
         resolution = settings.get("resolution")
         if resolution:
             contents += f"\n- Design visuals that read clearly at {resolution}."
+        contents += "\nLayout requirement: ensure every element has clear separation—absolutely no overlaps at the capture point."
+        contents += "\nKeep the composition minimal: only include elements explicitly needed for the prompt."
         response_text = gemini_call(system=SYSTEM_PROMPT, contents=contents)
         code = _clean_code(response_text)
         if "class AutoScene" not in code:
@@ -749,8 +853,8 @@ def llm_refine_from_error(
     original_user_prompt: str,
     settings: Optional[Dict[str, Any]] = None,
 ) -> str:
-    """When Manim fails; send the *real* CLI log/trace to
-    if not
+    """When Manim fails; send the *real* CLI log/trace to the LLM."""
+    if not gemini_client:
         return previous_code or DEFAULT_SCENE
     try:
         trimmed = error_message[-4000:] if error_message else ""
@@ -769,6 +873,8 @@ Requirements:
 - Fix the bug while preserving the math logic and planned animations.
 - Keep exactly one class AutoScene(Scene).
 - Keep the CAPTURE POLICY and ensure # CAPTURE_POINT is at the final steady layout.
+- Eliminate any overlapping elements; maintain clear spacing at the capture point.
+- Remove any objects that are not necessary for the prompt or storyboard; keep the scene concise.
 - Scan for nonexistent methods (e.g., `.to_center`) or invalid kwargs (e.g., `vertex=` on RightAngle) and replace with valid Manim CE 0.19 API.
 - Prefer `.center()`/`.move_to(ORIGIN)`, and `.move_to()`, `.align_to()`, `.to_edge()`, `.next_to()` for layout.
 - Apply the smallest change necessary to resolve the failure; do not overhaul structure, pacing, or stylistic choices the user made.
@@ -811,7 +917,7 @@ def llm_visual_refine_from_image(
     Use the screenshot to request layout/legibility/placement fixes.
     Includes the original prompt and current code, and asks for minimal edits.
     """
-    if not
+    if not gemini_client or not png_path or not png_path.exists():
         return previous_code
     try:
         file_ref = _upload_image_to_gemini(png_path)
@@ -829,6 +935,8 @@ Tasks (optimize for readability and visual quality without changing the math mea
 - Fix layout issues (overlaps, cramped margins, alignment, consistent scaling).
 - Improve text legibility (minimum size ~32 px at 854x480, adequate contrast).
 - Ensure all intended elements are visible at the capture point.
+- Remove any overlapping elements; keep generous spacing between visuals.
+- Remove decorative or redundant elements that are not required by the user's prompt or storyboard.
 - Keep animation semantics as-is unless they're obviously broken.
 - Keep exactly one class AutoScene(Scene).
 - Preserve the CAPTURE POLICY and place `# CAPTURE_POINT` at the final steady layout with self.wait(0.75) and NO outro after that.
@@ -862,11 +970,54 @@ Return ONLY the revised Python code (no backticks).
         traceback.print_exc()
         return previous_code
 
+
+def _attempt_render_with_refine(
+    base_code: str,
+    *,
+    user_prompt: str,
+    settings: Optional[Dict[str, Any]],
+    quality: str,
+    run_prefix: str,
+    max_refines: int,
+) -> Tuple[Optional[str], Optional[bytes], Optional[Path], str]:
+    """
+    Try to render `base_code`, refining up to `max_refines` times using Gemini on failure.
+    Returns tuple: (final_code, video_bytes, png_path, last_error_log).
+    If rendering still fails, code/video/png are None and last_error_log carries the last trace.
+    """
+    attempts = 0
+    current_code = base_code
+    last_log = ""
+
+    while True:
+        try:
+            mp4_bytes, png_path = _run_manim(
+                current_code,
+                run_id=f"{run_prefix}_try{attempts}",
+                quality=quality,
+            )
+            return current_code, mp4_bytes, png_path, ""
+        except RenderError as err:
+            last_log = err.log or last_log
+        except Exception:
+            last_log = traceback.format_exc()
+
+        if attempts >= max_refines:
+            return None, None, None, last_log
+
+        attempts += 1
+        current_code = llm_refine_from_error(
+            previous_code=current_code,
+            error_message=last_log,
+            original_user_prompt=user_prompt,
+            settings=settings,
+        )
+
 def refine_loop(
     user_prompt: str,
     settings: Optional[Dict[str, Any]] = None,
     max_error_refines: int = 3,
-    do_visual_refine: bool =
+    do_visual_refine: bool = False,
 ) -> bytes:
     """
     Generate → render; on error, refine up to N times from Manim traceback → re-render.
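Note: _attempt_render_with_refine catches a RenderError whose .log carries the Manim CLI output. That class is defined elsewhere in app/main.py and is not shown in this diff; presumably it looks roughly like:

# Assumed shape of RenderError (not part of this diff).
class RenderError(Exception):
    def __init__(self, log: str = ""):
        super().__init__(log)
        self.log = log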
@@ -874,58 +1025,32 @@ def refine_loop(
     using the saved steady-state PNG, then re-render. Fallback to the best successful MP4.
     """
     # 1) initial generation (capture-aware)
+    initial_code = llm_generate_manim_code(user_prompt, settings=settings)
     quality = _quality_from_settings(settings)
 
-            if attempts >= max_error_refines:
-                raise
-        except Exception:
-            last_err = traceback.format_exc()
-            if attempts >= max_error_refines:
-                raise
-    except Exception:
-        print("Unexpected error path; refining from Python traceback...", file=sys.stderr)
-        attempts = 0
-        last_err = traceback.format_exc()
-        while attempts < max_error_refines:
-            attempts += 1
-            refined = llm_refine_from_error(
-                previous_code=code,
-                error_message=last_err,
-                original_user_prompt=user_prompt,
-                settings=settings,
-            )
-            try:
-                mp4_bytes, png_path = _run_manim(refined, run_id=f"iter_err_{attempts}", quality=quality)
-                code = refined
-                break
-            except Exception:
-                last_err = traceback.format_exc()
-                if attempts >= max_error_refines:
-                    raise
+    code, mp4_bytes, png_path, last_log = _attempt_render_with_refine(
+        initial_code,
+        user_prompt=user_prompt,
+        settings=settings,
+        quality=quality,
+        run_prefix="primary",
+        max_refines=max_error_refines,
+    )
+
+    if code is None:
+        print("Primary render failed after refinements; generating fallback code...", file=sys.stderr)
+        fallback_code = llm_generate_manim_code(user_prompt, settings=settings)
+        code, mp4_bytes, png_path, last_log = _attempt_render_with_refine(
+            fallback_code,
+            user_prompt=user_prompt,
+            settings=settings,
+            quality=quality,
+            run_prefix="fallback",
+            max_refines=2,
+        )
+        if code is None:
+            error_message = last_log or "Render failed after fallback attempts."
+            raise RenderError(error_message)
 
     # 3) optional visual refinement loop
     if do_visual_refine and png_path and png_path.exists():
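Note: with the defaults above, the primary pass makes up to 1 + 3 render attempts and the fallback up to 1 + 2, so a fully failing prompt costs at most 7 renders and 5 error refinements. A hypothetical direct invocation without the HTTP layer (prompt and filename are illustrative):

from pathlib import Path

# Hypothetical: render a one-off scene directly.
mp4_bytes = refine_loop(
    "Visualize bubble sort on six bars",
    settings={"resolution": "854x480"},
    max_error_refines=3,
    do_visual_refine=False,
)
Path("out.mp4").write_bytes(mp4_bytes)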
@@ -954,7 +1079,7 @@ def _auto_fix_render(
     max_attempts: int = 3,
 ) -> Tuple[Optional[str], Optional[bytes], str]:
     """Attempt to auto-fix user code via LLM refinement if available."""
-    if not
+    if not gemini_client:
         return None, None, initial_log
     quality = _quality_from_settings(settings)
     attempt_code = code
@@ -983,8 +1108,8 @@
 # ---------------- API ----------------
 @app.post("/storyboard/chat")
 def storyboard_chat(inp: StoryboardChatIn):
-    if not
-        raise HTTPException(500, "
+    if not gpt_client:
+        raise HTTPException(500, "Storyboard model is not configured")
     if not inp.message.strip() and not inp.plan:
         raise HTTPException(400, "Message or plan updates are required.")
@@ -1022,8 +1147,8 @@ def storyboard_chat(inp: StoryboardChatIn):
 
 @app.post("/storyboard/confirm")
 def storyboard_confirm(inp: StoryboardConfirmIn):
-    if not
-        raise HTTPException(500, "
+    if not gpt_client:
+        raise HTTPException(500, "Storyboard model is not configured")
 
     session = _get_or_create_session(inp.session_id, inp.settings or {})
     if inp.settings:
@@ -1122,7 +1247,7 @@ def generate_code(inp: GenerateCodeIn):
 @app.post("/generate-and-render")
 def generate_and_render(inp: PromptIn):
     try:
-        mp4 = refine_loop(inp.prompt, settings=inp.settings, max_error_refines=3, do_visual_refine=
+        mp4 = refine_loop(inp.prompt, settings=inp.settings, max_error_refines=3, do_visual_refine=False)
     except Exception:
         raise HTTPException(500, "Failed to produce video after refinement")
     return Response(
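Note: with do_visual_refine now hard-coded to False, /generate-and-render returns the first MP4 that survives error refinement. A hypothetical client call against the endpoint (URL, prompt, and timeout are illustrative):

import requests  # client-side example only, not part of the service

r = requests.post(
    "http://localhost:7860/generate-and-render",
    json={"prompt": "Animate the unit circle", "settings": {"resolution": "854x480"}},
    timeout=600,
)
r.raise_for_status()
with open("scene.mp4", "wb") as f:
    f.write(r.content)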