ruslanmv committed
Commit 74942a4 · 1 Parent(s): 6fdbc47

First commit

Files changed (2)
  1. app.py +150 -131
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,5 +1,5 @@
  # -------------------------------
- # AI Fast Image Server (Production)
  # -------------------------------

  from __future__ import annotations
@@ -7,11 +7,11 @@ import os
  import sys
  import logging
  import subprocess
- from typing import Optional

- # ---------- Early, safe env defaults ----------
- os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # faster model downloads
- os.environ.setdefault("DEEPSPEED_DISABLE_NVML", "1") # silence NVML in headless envs
  os.environ.setdefault("BITSANDBYTES_NOWELCOME", "1")

  # ---------- Logging ----------
@@ -23,7 +23,7 @@ logging.basicConfig(
  log = logging.getLogger("ai-fast-image-server")

  # ---------- Config via ENV ----------
- # MODEL_BACKEND: sdxl_lcm_unet (heavy), sdxl_lcm_lora (light), ssd1b_lcm_lora (light)
  MODEL_BACKEND = os.getenv("MODEL_BACKEND", "sdxl_lcm_lora").lower()
  MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
  DEFAULT_SIZE = int(os.getenv("DEFAULT_SIZE", "1024"))
@@ -31,9 +31,10 @@ SECRET_TOKEN = os.getenv("SECRET_TOKEN", "default_secret")
  PORT = int(os.getenv("PORT", "7860"))
  CONCURRENCY = int(os.getenv("CONCURRENCY", "2"))
  QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", "32"))
- ENABLE_SSR = os.getenv("ENABLE_SSR", "false").lower() == "true" # SSR can be flaky; default off

- # ---------- Imports that require deps ----------
  import warnings
  warnings.filterwarnings("ignore", message="Can't initialize NVML")

@@ -48,6 +49,18 @@ from diffusers import (
      AutoPipelineForText2Image,
  )

  # ---------- Version guard: Torch 2.1 + NumPy 2.x is incompatible ----------
  try:
      _np_major = int(np.__version__.split(".")[0])
@@ -79,161 +92,163 @@ def print_nvidia_smi() -> None:

  print_nvidia_smi()

- IS_GPU = torch.cuda.is_available()
- DEVICE = torch.device("cuda") if IS_GPU else torch.device("cpu")
- DTYPE = torch.float16 if IS_GPU else torch.float32
- log.info(f"CUDA available: {IS_GPU} | device={DEVICE} | dtype={DTYPE}")
-
- # ---------- Torch perf knobs ----------
- try:
-     if IS_GPU:
-         torch.backends.cuda.matmul.allow_tf32 = True # safe perf on Ampere+
-         torch.set_float32_matmul_precision("high")
- except Exception:
-     pass
-
- # ---------- Helpers ----------
- def _variant_kwargs() -> dict:
-     # use fp16 repo variants only on GPU
-     return {"variant": "fp16"} if IS_GPU else {}
-
- def _cpu_safety_settings(pipe: DiffusionPipeline) -> None:
-     # reduce RAM usage and avoid giant VAE allocations on CPU
-     try:
-         pipe.enable_vae_tiling()
-     except Exception:
-         pass

- def _gpu_memory_efficiency(pipe: DiffusionPipeline) -> None:
-     # enable memory-efficient attention when available
      enabled = False
      try:
-         pipe.enable_xformers_memory_efficient_attention()
          enabled = True
      except Exception:
          try:
-             pipe.enable_attention_slicing("max")
              enabled = True
          except Exception:
              pass
      if enabled:
          try:
-             pipe.enable_vae_tiling()
          except Exception:
              pass

- # ---------- Model loading ----------
- pipe: Optional[DiffusionPipeline] = None

- def load_pipeline() -> DiffusionPipeline:
      """
-     Load the selected backend with sensible defaults.
-     - sdxl_lcm_unet: SDXL base + full LCM UNet (heavy, high VRAM)
-     - sdxl_lcm_lora: SDXL base + LCM-LoRA (light, recommended)
-     - ssd1b_lcm_lora: SSD-1B + LCM-LoRA (light)
      """
      log.info(f"Loading model backend: {MODEL_BACKEND}")
-
      if MODEL_BACKEND == "sdxl_lcm_unet":
-         # Heavy: downloads ~10 GB UNet; best quality/speed on big GPUs
          unet = UNet2DConditionModel.from_pretrained(
              "latent-consistency/lcm-sdxl",
-             torch_dtype=DTYPE,
              cache_dir=CACHE_DIR,
-             **_variant_kwargs(),
          )
-         _pipe = DiffusionPipeline.from_pretrained(
              "stabilityai/stable-diffusion-xl-base-1.0",
              unet=unet,
-             torch_dtype=DTYPE,
              cache_dir=CACHE_DIR,
-             **_variant_kwargs(),
          )
      elif MODEL_BACKEND == "ssd1b_lcm_lora":
-         _pipe = AutoPipelineForText2Image.from_pretrained(
              "segmind/SSD-1B",
-             torch_dtype=DTYPE,
              cache_dir=CACHE_DIR,
-             **_variant_kwargs(),
          )
-         _pipe.load_lora_weights("latent-consistency/lcm-lora-ssd-1b")
-         _pipe.fuse_lora()
      else:
-         # Default & recommended: SDXL + LCM-LoRA (smaller downloads, good quality)
-         _pipe = DiffusionPipeline.from_pretrained(
              "stabilityai/stable-diffusion-xl-base-1.0",
-             torch_dtype=DTYPE,
              cache_dir=CACHE_DIR,
-             **_variant_kwargs(),
          )
-         _pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
-         _pipe.fuse_lora()

      # Use LCM scheduler
-     _pipe.scheduler = LCMScheduler.from_config(_pipe.scheduler.config)

-     # Device & memory efficiency
-     _pipe.to(DEVICE)
-     if IS_GPU:
-         _gpu_memory_efficiency(_pipe)
-     else:
-         _cpu_safety_settings(_pipe)

-     log.info("Pipeline loaded.")
-     return _pipe

- # warmup lazily
  def ensure_pipe() -> DiffusionPipeline:
      global pipe
      if pipe is None:
-         pipe = load_pipeline()
      return pipe

- # ---------- HF Spaces GPU decorator (fixes “No @spaces.GPU function detected”) ----------
- try:
-     import spaces # type: ignore
-     GPU_DECORATOR = spaces.GPU
-     log.info("`spaces` package detected. GPU-decorating inference function.")
- except Exception:
-     GPU_DECORATOR = lambda f: f # no-op
-
- # ---------- Inference ----------
- @gpu_dec := GPU_DECORATOR
- def generate_image_internal(
      prompt: str,
-     negative_prompt: str = "",
-     seed: Optional[int] = 0,
-     width: int = DEFAULT_SIZE,
-     height: int = DEFAULT_SIZE,
-     guidance_scale: float = 0.0,
-     num_inference_steps: int = 4,
  ) -> Image.Image:
-     _pipe = ensure_pipe()
-
-     # Clamp to safe bounds
-     width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
-     height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
-     num_inference_steps = int(np.clip(num_inference_steps, 1, 12))
-     guidance_scale = float(np.clip(guidance_scale, 0.0, 2.0))

-     # Deterministic generator
-     generator = torch.Generator(device=DEVICE)
-     if seed is not None:
-         generator = generator.manual_seed(int(seed))

-     result = _pipe(
-         prompt=prompt,
-         negative_prompt=negative_prompt,
-         width=width,
-         height=height,
-         guidance_scale=guidance_scale, # LCM prefers low/no guidance
-         num_inference_steps=num_inference_steps,
-         generator=generator,
-         output_type="pil",
-     )
-     return result.images[0]

- # thin wrapper that enforces the token (kept out of the GPU-decorated function)
  def generate(
      prompt: str,
      negative_prompt: str = "",
@@ -246,39 +261,45 @@ def generate(
  ) -> Image.Image:
      if secret_token != SECRET_TOKEN:
          raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")
-     return generate_image_internal(
          prompt=prompt,
          negative_prompt=negative_prompt,
          seed=seed,
          width=width,
          height=height,
          guidance_scale=guidance_scale,
-         num_inference_steps=num_inference_steps,
      )

- # ---------- Optional warmup at startup ----------
  def warmup():
      try:
          ensure_pipe()
-         _ = generate_image_internal(
-             prompt="A quick warmup prompt, minimal style", seed=42, width=512, height=512, num_inference_steps=2
-         )
-         log.info("Warmup complete.")
      except Exception as e:
          log.warning(f"Warmup skipped or failed: {e}")

- if os.getenv("WARMUP", "true").lower() == "true":
-     # Don't block too long on CPU
-     if IS_GPU:
-         warmup()

  # ---------- Gradio UI (v5) ----------
  def build_ui() -> gr.Blocks:
      with gr.Blocks(theme=gr.themes.Soft()) as demo:
-         gr.Markdown("## Image Generator (LCM) — SDXL / SSD-1B")

          with gr.Row():
-             prompt = gr.Textbox(label="Prompt", lines=3, placeholder="Describe the image...")
              negative = gr.Textbox(label="Negative Prompt", lines=2, placeholder="(optional)")

          with gr.Row():
@@ -297,24 +318,22 @@ def build_ui() -> gr.Blocks:
          inputs = [prompt, negative, seed, width, height, guidance, steps, token]
          run.click(fn=generate, inputs=inputs, outputs=out, concurrency_limit=CONCURRENCY)

-         # Simple health info
          gr.Markdown(
              f"*Backend:* `{MODEL_BACKEND}`   |   "
-             f"*Device:* `{DEVICE}`   |   "
-             f"*dtype:* `{DTYPE}`"
          )
      return demo

  # ---------- Launch ----------
  def main():
      demo = build_ui()
-     # Queue for backpressure and concurrency control
      demo.queue(max_size=QUEUE_SIZE, concurrency_count=CONCURRENCY)
      demo.launch(
          server_name="0.0.0.0",
          server_port=PORT,
          show_api=True,
-         ssr_mode=ENABLE_SSR, # SSR off by default (can be flaky on Spaces)
          share=False,
          show_error=True,
      )
 
  # -------------------------------
+ # AI Fast Image Server — ZeroGPU Ready
  # -------------------------------

  from __future__ import annotations

  import sys
  import logging
  import subprocess
+ from typing import Optional, Callable

+ # ---------- Fast, safe defaults ----------
+ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # faster model downloads
+ os.environ.setdefault("DEEPSPEED_DISABLE_NVML", "1") # silence NVML in headless envs
  os.environ.setdefault("BITSANDBYTES_NOWELCOME", "1")

  # ---------- Logging ----------

  log = logging.getLogger("ai-fast-image-server")

  # ---------- Config via ENV ----------
+ # MODEL_BACKEND: "sdxl_lcm_lora" (default), "sdxl_lcm_unet" (heavy), "ssd1b_lcm_lora" (light)
  MODEL_BACKEND = os.getenv("MODEL_BACKEND", "sdxl_lcm_lora").lower()
  MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
  DEFAULT_SIZE = int(os.getenv("DEFAULT_SIZE", "1024"))

  PORT = int(os.getenv("PORT", "7860"))
  CONCURRENCY = int(os.getenv("CONCURRENCY", "2"))
  QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", "32"))
+ ENABLE_SSR = os.getenv("ENABLE_SSR", "false").lower() == "true" # SSR off by default for stability
+ WARMUP = os.getenv("WARMUP", "false").lower() == "true" # default False for ZeroGPU

+ # ---------- Third-party imports ----------
  import warnings
  warnings.filterwarnings("ignore", message="Can't initialize NVML")


      AutoPipelineForText2Image,
  )

+ # ---------- ZeroGPU decorator (works even off-Spaces) ----------
+ try:
+     import spaces # real decorator on Spaces
+ except Exception:
+     class _DummySpaces:
+         def GPU(self, *args, **kwargs):
+             # identity decorator if not on Spaces
+             def _wrap(f):
+                 return f
+             return _wrap
+     spaces = _DummySpaces()
+
  # ---------- Version guard: Torch 2.1 + NumPy 2.x is incompatible ----------
  try:
      _np_major = int(np.__version__.split(".")[0])


  print_nvidia_smi()

+ # ---------- Global pipeline handle (kept on CPU between calls) ----------
+ pipe: Optional[DiffusionPipeline] = None

+ def _gpu_mem_efficiency(p: DiffusionPipeline) -> None:
+     """Enable memory-efficient attention and VAE tiling where possible."""
      enabled = False
      try:
+         p.enable_xformers_memory_efficient_attention()
          enabled = True
      except Exception:
          try:
+             p.enable_attention_slicing("max")
              enabled = True
          except Exception:
              pass
+     try:
+         p.enable_vae_tiling()
+     except Exception:
+         pass
      if enabled:
+         # faster matmul on Ampere+
          try:
+             torch.backends.cuda.matmul.allow_tf32 = True
+             torch.set_float32_matmul_precision("high")
          except Exception:
              pass

+ def _variant_kwargs() -> dict:
+     # Use fp16 repo variants only when on GPU (avoid oddities on CPU)
+     return {"variant": "fp16"}

+ def _build_pipeline_cpu() -> DiffusionPipeline:
      """
+     Build the pipeline on CPU with float32 to keep it stable in ZeroGPU's
+     CPU-only startup environment. We'll move it to CUDA inside the GPU-decorated
+     function per call and return it to CPU after.
      """
      log.info(f"Loading model backend: {MODEL_BACKEND}")
      if MODEL_BACKEND == "sdxl_lcm_unet":
+         # Heavy: full LCM UNet (~10GB). Use only if you have big VRAM.
          unet = UNet2DConditionModel.from_pretrained(
              "latent-consistency/lcm-sdxl",
+             torch_dtype=torch.float32,
              cache_dir=CACHE_DIR,
+             # no variant on CPU
          )
+         _p = DiffusionPipeline.from_pretrained(
              "stabilityai/stable-diffusion-xl-base-1.0",
              unet=unet,
+             torch_dtype=torch.float32,
              cache_dir=CACHE_DIR,
          )
      elif MODEL_BACKEND == "ssd1b_lcm_lora":
+         _p = AutoPipelineForText2Image.from_pretrained(
              "segmind/SSD-1B",
+             torch_dtype=torch.float32,
              cache_dir=CACHE_DIR,
          )
+         _p.load_lora_weights("latent-consistency/lcm-lora-ssd-1b")
+         _p.fuse_lora()
      else:
+         # Default: SDXL + LCM-LoRA (smaller download, great speed/quality)
+         _p = DiffusionPipeline.from_pretrained(
              "stabilityai/stable-diffusion-xl-base-1.0",
+             torch_dtype=torch.float32,
              cache_dir=CACHE_DIR,
          )
+         _p.load_lora_weights("latent-consistency/lcm-lora-sdxl")
+         _p.fuse_lora()

      # Use LCM scheduler
+     _p.scheduler = LCMScheduler.from_config(_p.scheduler.config)

+     # Stay on CPU by default (ZeroGPU will give us CUDA only during calls)
+     _p.to("cpu", torch.float32)
+     try:
+         _p.enable_vae_tiling() # also fine on CPU
+     except Exception:
+         pass

+     log.info("Pipeline built on CPU.")
+     return _p

  def ensure_pipe() -> DiffusionPipeline:
      global pipe
      if pipe is None:
+         pipe = _build_pipeline_cpu()
      return pipe

+ # ---------- Duration model for ZeroGPU ----------
+ def _estimate_duration(prompt: str, negative_prompt: str, seed: int,
+                        width: int, height: int, guidance_scale: float, steps: int,
+                        secret_token: str) -> int:
+     """
+     Rough estimate (seconds) to inform ZeroGPU scheduler for better queuing.
+     Scale by pixel count and steps. Conservative upper bound.
+     """
+     base = 3.0 # pipeline dispatch + overhead
+     px_scale = (max(256, width) * max(256, height)) / (1024 * 1024)
+     step_cost = 0.85 # ~0.85s/step @1024^2 (H200 slice; tune as needed)
+     est = base + steps * step_cost * max(0.5, px_scale)
+     # Clamp between 10 and 120 seconds
+     return int(min(120, max(10, est)))
+
+ # ---------- GPU-decorated inference (Spaces detects this) ----------
+ @spaces.GPU(duration=_estimate_duration) # dynamic duration; no-op outside Spaces
+ def _generate_gpu_call(
      prompt: str,
+     negative_prompt: str,
+     seed: Optional[int],
+     width: int,
+     height: int,
+     guidance_scale: float,
+     steps: int,
  ) -> Image.Image:
+     """
+     Runs under a ZeroGPU-allocated context. We move the pipeline to CUDA at the
+     start and back to CPU at the end so that it remains usable when GPU is released.
+     """
+     _p = ensure_pipe()

+     # Move to CUDA with half precision (safe with LCM)
+     _p.to("cuda", torch.float16)
+     _gpu_mem_efficiency(_p)

+     try:
+         # Clamp inputs
+         width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
+         height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
+         steps = int(np.clip(steps, 1, 12))
+         guidance_scale = float(np.clip(guidance_scale, 0.0, 2.0))
+
+         # Deterministic generator on CUDA
+         gen = torch.Generator(device="cuda")
+         if seed is not None:
+             gen = gen.manual_seed(int(seed))
+
+         out = _p(
+             prompt=prompt,
+             negative_prompt=negative_prompt,
+             width=width,
+             height=height,
+             guidance_scale=guidance_scale, # LCM prefers low guidance
+             num_inference_steps=steps,
+             generator=gen,
+             output_type="pil",
+         )
+         return out.images[0]
+     finally:
+         # Always return pipeline to CPU so next non-GPU context is safe
+         try:
+             _p.to("cpu", torch.float32)
+             _p.enable_vae_tiling()
+         except Exception:
+             pass

+ # ---------- Public generate (token gate kept outside GPU context) ----------
  def generate(
      prompt: str,
      negative_prompt: str = "",

  ) -> Image.Image:
      if secret_token != SECRET_TOKEN:
          raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")
+
+     return _generate_gpu_call(
          prompt=prompt,
          negative_prompt=negative_prompt,
          seed=seed,
          width=width,
          height=height,
          guidance_scale=guidance_scale,
+         steps=num_inference_steps,
      )

+ # ---------- Optional warmup (CPU only by default for ZeroGPU) ----------
  def warmup():
      try:
          ensure_pipe()
+         # Tiny CPU warmup to load weights into RAM/cache
+         _ = pipe(
+             prompt="minimal warmup",
+             width=256,
+             height=256,
+             guidance_scale=0.0,
+             num_inference_steps=1,
+             generator=torch.Generator(device="cpu").manual_seed(1),
+             output_type="pil",
+         ).images[0]
+         log.info("CPU warmup complete.")
      except Exception as e:
          log.warning(f"Warmup skipped or failed: {e}")

+ if WARMUP:
+     warmup()

  # ---------- Gradio UI (v5) ----------
  def build_ui() -> gr.Blocks:
      with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         gr.Markdown("## Image Generator (LCM) — SDXL / SSD-1B (ZeroGPU Ready)")

          with gr.Row():
+             prompt = gr.Textbox(label="Prompt", lines=3, placeholder="Describe the image")
              negative = gr.Textbox(label="Negative Prompt", lines=2, placeholder="(optional)")

          with gr.Row():

          inputs = [prompt, negative, seed, width, height, guidance, steps, token]
          run.click(fn=generate, inputs=inputs, outputs=out, concurrency_limit=CONCURRENCY)

          gr.Markdown(
              f"*Backend:* `{MODEL_BACKEND}`   |   "
+             f"*ZeroGPU:* `@spaces.GPU` enabled   |   "
+             f"*Max size:* {MAX_IMAGE_SIZE}px"
          )
      return demo

  # ---------- Launch ----------
  def main():
      demo = build_ui()
      demo.queue(max_size=QUEUE_SIZE, concurrency_count=CONCURRENCY)
      demo.launch(
          server_name="0.0.0.0",
          server_port=PORT,
          show_api=True,
+         ssr_mode=ENABLE_SSR, # Off by default; turn on with ENABLE_SSR=true if needed
          share=False,
          show_error=True,
      )
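
Because the app launches with `show_api=True` and wires `generate` to the click handler, the new ZeroGPU path can be exercised remotely over the Gradio API. A minimal client sketch, assuming a hypothetical Space id `ruslanmv/ai-fast-image-server` and the default auto-generated endpoint name (check `client.view_api()` for the real `api_name` and argument order):

# Client-side sketch; Space id and api_name are assumptions, not part of this commit.
from gradio_client import Client

client = Client("ruslanmv/ai-fast-image-server")  # hypothetical Space id
image_path = client.predict(
    "a cinematic photo of a red fox in snow",  # prompt
    "",                                        # negative_prompt
    42,                                        # seed
    1024,                                      # width
    1024,                                      # height
    0.0,                                       # guidance_scale (LCM works with low/no guidance)
    4,                                         # num_inference_steps
    "default_secret",                          # secret_token (must match SECRET_TOKEN)
    api_name="/predict",                       # assumed default name; confirm with client.view_api()
)
print(image_path)  # local path of the image file returned by the Space

The argument order mirrors the `inputs` list in `build_ui`; the token gate stays server-side, so an invalid `secret_token` raises a `gr.Error` before any GPU is requested.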
requirements.txt CHANGED
@@ -2,7 +2,6 @@ accelerate==0.24.1
  diffusers==0.30.0
  gradio==5.47.2
  huggingface_hub==0.33.5
- invisible-watermark==0.2.0
  Pillow==10.1.0
  torch==2.1.0
  transformers==4.41.0

  diffusers==0.30.0
  gradio==5.47.2
  huggingface_hub==0.33.5
  Pillow==10.1.0
  torch==2.1.0
  transformers==4.41.0
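
The updated requirements keep `torch==2.1.0` but leave `numpy` unpinned, which is why `app.py` carries the "Torch 2.1 + NumPy 2.x is incompatible" guard. A standalone sketch of that same check, assuming the pins above (the exit message and the `numpy==1.26.4` suggestion are illustrative, not part of the commit):

# Sketch of the NumPy/Torch compatibility check app.py performs at import time.
# Torch 2.1 wheels were built against NumPy 1.x, so NumPy 2.x is rejected early.
import sys

import numpy as np
import torch

np_major = int(np.__version__.split(".")[0])
torch_mm = tuple(int(x) for x in torch.__version__.split("+")[0].split(".")[:2])

if torch_mm == (2, 1) and np_major >= 2:
    sys.exit(
        f"Incompatible versions: torch {torch.__version__} with numpy {np.__version__}. "
        "Pin numpy<2 (e.g. numpy==1.26.4) alongside torch==2.1.0."
    )
print("NumPy/Torch versions look compatible.")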