ruslanmv committed
Commit 74942a4 · 1 Parent(s): 6fdbc47

First commit

Files changed (2)
  1. app.py +150 -131
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,5 +1,5 @@
  # -------------------------------
- # AI Fast Image Server (Production)
  # -------------------------------

  from __future__ import annotations
@@ -7,11 +7,11 @@ import os
  import sys
  import logging
  import subprocess
- from typing import Optional

- # ---------- Early, safe env defaults ----------
- os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # faster model downloads
- os.environ.setdefault("DEEPSPEED_DISABLE_NVML", "1") # silence NVML in headless envs
  os.environ.setdefault("BITSANDBYTES_NOWELCOME", "1")

  # ---------- Logging ----------
@@ -23,7 +23,7 @@ logging.basicConfig(
  log = logging.getLogger("ai-fast-image-server")

  # ---------- Config via ENV ----------
- # MODEL_BACKEND: sdxl_lcm_unet (heavy), sdxl_lcm_lora (light), ssd1b_lcm_lora (light)
  MODEL_BACKEND = os.getenv("MODEL_BACKEND", "sdxl_lcm_lora").lower()
  MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
  DEFAULT_SIZE = int(os.getenv("DEFAULT_SIZE", "1024"))
@@ -31,9 +31,10 @@ SECRET_TOKEN = os.getenv("SECRET_TOKEN", "default_secret")
  PORT = int(os.getenv("PORT", "7860"))
  CONCURRENCY = int(os.getenv("CONCURRENCY", "2"))
  QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", "32"))
- ENABLE_SSR = os.getenv("ENABLE_SSR", "false").lower() == "true" # SSR can be flaky; default off

- # ---------- Imports that require deps ----------
  import warnings
  warnings.filterwarnings("ignore", message="Can't initialize NVML")

@@ -48,6 +49,18 @@ from diffusers import (
      AutoPipelineForText2Image,
  )

  # ---------- Version guard: Torch 2.1 + NumPy 2.x is incompatible ----------
  try:
      _np_major = int(np.__version__.split(".")[0])
@@ -79,161 +92,163 @@ def print_nvidia_smi() -> None:

  print_nvidia_smi()

- IS_GPU = torch.cuda.is_available()
- DEVICE = torch.device("cuda") if IS_GPU else torch.device("cpu")
- DTYPE = torch.float16 if IS_GPU else torch.float32
- log.info(f"CUDA available: {IS_GPU} | device={DEVICE} | dtype={DTYPE}")
-
- # ---------- Torch perf knobs ----------
- try:
-     if IS_GPU:
-         torch.backends.cuda.matmul.allow_tf32 = True # safe perf on Ampere+
-         torch.set_float32_matmul_precision("high")
- except Exception:
-     pass
-
- # ---------- Helpers ----------
- def _variant_kwargs() -> dict:
-     # use fp16 repo variants only on GPU
-     return {"variant": "fp16"} if IS_GPU else {}
-
- def _cpu_safety_settings(pipe: DiffusionPipeline) -> None:
-     # reduce RAM usage and avoid giant VAE allocations on CPU
-     try:
-         pipe.enable_vae_tiling()
-     except Exception:
-         pass

- def _gpu_memory_efficiency(pipe: DiffusionPipeline) -> None:
-     # enable memory-efficient attention when available
      enabled = False
      try:
-         pipe.enable_xformers_memory_efficient_attention()
          enabled = True
      except Exception:
          try:
-             pipe.enable_attention_slicing("max")
              enabled = True
          except Exception:
              pass
      if enabled:
          try:
-             pipe.enable_vae_tiling()
          except Exception:
              pass

- # ---------- Model loading ----------
- pipe: Optional[DiffusionPipeline] = None

- def load_pipeline() -> DiffusionPipeline:
      """
-     Load the selected backend with sensible defaults.
-     - sdxl_lcm_unet: SDXL base + full LCM UNet (heavy, high VRAM)
-     - sdxl_lcm_lora: SDXL base + LCM-LoRA (light, recommended)
-     - ssd1b_lcm_lora: SSD-1B + LCM-LoRA (light)
      """
      log.info(f"Loading model backend: {MODEL_BACKEND}")
-
      if MODEL_BACKEND == "sdxl_lcm_unet":
-         # Heavy: downloads ~10 GB UNet; best quality/speed on big GPUs
          unet = UNet2DConditionModel.from_pretrained(
              "latent-consistency/lcm-sdxl",
-             torch_dtype=DTYPE,
              cache_dir=CACHE_DIR,
-             **_variant_kwargs(),
          )
-         _pipe = DiffusionPipeline.from_pretrained(
              "stabilityai/stable-diffusion-xl-base-1.0",
              unet=unet,
-             torch_dtype=DTYPE,
              cache_dir=CACHE_DIR,
-             **_variant_kwargs(),
          )
      elif MODEL_BACKEND == "ssd1b_lcm_lora":
-         _pipe = AutoPipelineForText2Image.from_pretrained(
              "segmind/SSD-1B",
-             torch_dtype=DTYPE,
              cache_dir=CACHE_DIR,
-             **_variant_kwargs(),
          )
-         _pipe.load_lora_weights("latent-consistency/lcm-lora-ssd-1b")
-         _pipe.fuse_lora()
      else:
-         # Default & recommended: SDXL + LCM-LoRA (smaller downloads, good quality)
-         _pipe = DiffusionPipeline.from_pretrained(
              "stabilityai/stable-diffusion-xl-base-1.0",
-             torch_dtype=DTYPE,
              cache_dir=CACHE_DIR,
-             **_variant_kwargs(),
          )
-         _pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
-         _pipe.fuse_lora()

      # Use LCM scheduler
-     _pipe.scheduler = LCMScheduler.from_config(_pipe.scheduler.config)

-     # Device & memory efficiency
-     _pipe.to(DEVICE)
-     if IS_GPU:
-         _gpu_memory_efficiency(_pipe)
-     else:
-         _cpu_safety_settings(_pipe)

-     log.info("Pipeline loaded.")
-     return _pipe

- # warmup lazily
  def ensure_pipe() -> DiffusionPipeline:
      global pipe
      if pipe is None:
-         pipe = load_pipeline()
      return pipe

- # ---------- HF Spaces GPU decorator (fixes “No @spaces.GPU function detected”) ----------
- try:
-     import spaces # type: ignore
-     GPU_DECORATOR = spaces.GPU
-     log.info("`spaces` package detected. GPU-decorating inference function.")
- except Exception:
-     GPU_DECORATOR = lambda f: f # no-op
-
- # ---------- Inference ----------
- @gpu_dec := GPU_DECORATOR
- def generate_image_internal(
      prompt: str,
-     negative_prompt: str = "",
-     seed: Optional[int] = 0,
-     width: int = DEFAULT_SIZE,
-     height: int = DEFAULT_SIZE,
-     guidance_scale: float = 0.0,
-     num_inference_steps: int = 4,
  ) -> Image.Image:
-     _pipe = ensure_pipe()
-
-     # Clamp to safe bounds
-     width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
-     height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
-     num_inference_steps = int(np.clip(num_inference_steps, 1, 12))
-     guidance_scale = float(np.clip(guidance_scale, 0.0, 2.0))

-     # Deterministic generator
-     generator = torch.Generator(device=DEVICE)
-     if seed is not None:
-         generator = generator.manual_seed(int(seed))

-     result = _pipe(
-         prompt=prompt,
-         negative_prompt=negative_prompt,
-         width=width,
-         height=height,
-         guidance_scale=guidance_scale, # LCM prefers low/no guidance
-         num_inference_steps=num_inference_steps,
-         generator=generator,
-         output_type="pil",
-     )
-     return result.images[0]

- # thin wrapper that enforces the token (kept out of the GPU-decorated function)
  def generate(
      prompt: str,
      negative_prompt: str = "",
@@ -246,39 +261,45 @@ def generate(
  ) -> Image.Image:
      if secret_token != SECRET_TOKEN:
          raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")
-     return generate_image_internal(
          prompt=prompt,
          negative_prompt=negative_prompt,
          seed=seed,
          width=width,
          height=height,
          guidance_scale=guidance_scale,
-         num_inference_steps=num_inference_steps,
      )

- # ---------- Optional warmup at startup ----------
  def warmup():
      try:
          ensure_pipe()
-         _ = generate_image_internal(
-             prompt="A quick warmup prompt, minimal style", seed=42, width=512, height=512, num_inference_steps=2
-         )
-         log.info("Warmup complete.")
      except Exception as e:
          log.warning(f"Warmup skipped or failed: {e}")

- if os.getenv("WARMUP", "true").lower() == "true":
-     # Don't block too long on CPU
-     if IS_GPU:
-         warmup()

  # ---------- Gradio UI (v5) ----------
  def build_ui() -> gr.Blocks:
      with gr.Blocks(theme=gr.themes.Soft()) as demo:
-         gr.Markdown("## Image Generator (LCM) — SDXL / SSD-1B")

          with gr.Row():
-             prompt = gr.Textbox(label="Prompt", lines=3, placeholder="Describe the image...")
              negative = gr.Textbox(label="Negative Prompt", lines=2, placeholder="(optional)")

          with gr.Row():
@@ -297,24 +318,22 @@ def build_ui() -> gr.Blocks:
          inputs = [prompt, negative, seed, width, height, guidance, steps, token]
          run.click(fn=generate, inputs=inputs, outputs=out, concurrency_limit=CONCURRENCY)

-         # Simple health info
          gr.Markdown(
              f"*Backend:* `{MODEL_BACKEND}`   |   "
-             f"*Device:* `{DEVICE}`   |   "
-             f"*dtype:* `{DTYPE}`"
          )
      return demo

  # ---------- Launch ----------
  def main():
      demo = build_ui()
-     # Queue for backpressure and concurrency control
      demo.queue(max_size=QUEUE_SIZE, concurrency_count=CONCURRENCY)
      demo.launch(
          server_name="0.0.0.0",
          server_port=PORT,
          show_api=True,
-         ssr_mode=ENABLE_SSR, # SSR off by default (can be flaky on Spaces)
          share=False,
          show_error=True,
      )
 
  # -------------------------------
+ # AI Fast Image Server — ZeroGPU Ready
  # -------------------------------

  from __future__ import annotations

  import sys
  import logging
  import subprocess
+ from typing import Optional, Callable

+ # ---------- Fast, safe defaults ----------
+ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # faster model downloads
+ os.environ.setdefault("DEEPSPEED_DISABLE_NVML", "1") # silence NVML in headless envs
  os.environ.setdefault("BITSANDBYTES_NOWELCOME", "1")

  # ---------- Logging ----------

  log = logging.getLogger("ai-fast-image-server")

  # ---------- Config via ENV ----------
+ # MODEL_BACKEND: "sdxl_lcm_lora" (default), "sdxl_lcm_unet" (heavy), "ssd1b_lcm_lora" (light)
  MODEL_BACKEND = os.getenv("MODEL_BACKEND", "sdxl_lcm_lora").lower()
  MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
  DEFAULT_SIZE = int(os.getenv("DEFAULT_SIZE", "1024"))

  PORT = int(os.getenv("PORT", "7860"))
  CONCURRENCY = int(os.getenv("CONCURRENCY", "2"))
  QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", "32"))
+ ENABLE_SSR = os.getenv("ENABLE_SSR", "false").lower() == "true" # SSR off by default for stability
+ WARMUP = os.getenv("WARMUP", "false").lower() == "true" # default False for ZeroGPU

+ # ---------- Third-party imports ----------
  import warnings
  warnings.filterwarnings("ignore", message="Can't initialize NVML")


      AutoPipelineForText2Image,
  )

+ # ---------- ZeroGPU decorator (works even off-Spaces) ----------
+ try:
+     import spaces # real decorator on Spaces
+ except Exception:
+     class _DummySpaces:
+         def GPU(self, *args, **kwargs):
+             # identity decorator if not on Spaces
+             def _wrap(f):
+                 return f
+             return _wrap
+     spaces = _DummySpaces()
+
  # ---------- Version guard: Torch 2.1 + NumPy 2.x is incompatible ----------
  try:
      _np_major = int(np.__version__.split(".")[0])


  print_nvidia_smi()

+ # ---------- Global pipeline handle (kept on CPU between calls) ----------
+ pipe: Optional[DiffusionPipeline] = None

+ def _gpu_mem_efficiency(p: DiffusionPipeline) -> None:
+     """Enable memory-efficient attention and VAE tiling where possible."""
      enabled = False
      try:
+         p.enable_xformers_memory_efficient_attention()
          enabled = True
      except Exception:
          try:
+             p.enable_attention_slicing("max")
              enabled = True
          except Exception:
              pass
+     try:
+         p.enable_vae_tiling()
+     except Exception:
+         pass
      if enabled:
+         # faster matmul on Ampere+
          try:
+             torch.backends.cuda.matmul.allow_tf32 = True
+             torch.set_float32_matmul_precision("high")
          except Exception:
              pass

+ def _variant_kwargs() -> dict:
+     # Use fp16 repo variants only when on GPU (avoid oddities on CPU)
+     return {"variant": "fp16"}

+ def _build_pipeline_cpu() -> DiffusionPipeline:
      """
+     Build the pipeline on CPU with float32 to keep it stable in ZeroGPU's
+     CPU-only startup environment. We'll move it to CUDA inside the GPU-decorated
+     function per call and return it to CPU after.
      """
      log.info(f"Loading model backend: {MODEL_BACKEND}")
      if MODEL_BACKEND == "sdxl_lcm_unet":
+         # Heavy: full LCM UNet (~10GB). Use only if you have big VRAM.
          unet = UNet2DConditionModel.from_pretrained(
              "latent-consistency/lcm-sdxl",
+             torch_dtype=torch.float32,
              cache_dir=CACHE_DIR,
+             # no variant on CPU
          )
+         _p = DiffusionPipeline.from_pretrained(
              "stabilityai/stable-diffusion-xl-base-1.0",
              unet=unet,
+             torch_dtype=torch.float32,
              cache_dir=CACHE_DIR,
          )
      elif MODEL_BACKEND == "ssd1b_lcm_lora":
+         _p = AutoPipelineForText2Image.from_pretrained(
              "segmind/SSD-1B",
+             torch_dtype=torch.float32,
              cache_dir=CACHE_DIR,
          )
+         _p.load_lora_weights("latent-consistency/lcm-lora-ssd-1b")
+         _p.fuse_lora()
      else:
+         # Default: SDXL + LCM-LoRA (smaller download, great speed/quality)
+         _p = DiffusionPipeline.from_pretrained(
              "stabilityai/stable-diffusion-xl-base-1.0",
+             torch_dtype=torch.float32,
              cache_dir=CACHE_DIR,
          )
+         _p.load_lora_weights("latent-consistency/lcm-lora-sdxl")
+         _p.fuse_lora()

      # Use LCM scheduler
+     _p.scheduler = LCMScheduler.from_config(_p.scheduler.config)

+     # Stay on CPU by default (ZeroGPU will give us CUDA only during calls)
+     _p.to("cpu", torch.float32)
+     try:
+         _p.enable_vae_tiling() # also fine on CPU
+     except Exception:
+         pass

+     log.info("Pipeline built on CPU.")
+     return _p

  def ensure_pipe() -> DiffusionPipeline:
      global pipe
      if pipe is None:
+         pipe = _build_pipeline_cpu()
      return pipe

+ # ---------- Duration model for ZeroGPU ----------
+ def _estimate_duration(prompt: str, negative_prompt: str, seed: int,
+                        width: int, height: int, guidance_scale: float, steps: int,
+                        secret_token: str) -> int:
+     """
+     Rough estimate (seconds) to inform ZeroGPU scheduler for better queuing.
+     Scale by pixel count and steps. Conservative upper bound.
+     """
+     base = 3.0 # pipeline dispatch + overhead
+     px_scale = (max(256, width) * max(256, height)) / (1024 * 1024)
+     step_cost = 0.85 # ~0.85s/step @1024^2 (H200 slice; tune as needed)
+     est = base + steps * step_cost * max(0.5, px_scale)
+     # Clamp between 10 and 120 seconds
+     return int(min(120, max(10, est)))
+
+ # ---------- GPU-decorated inference (Spaces detects this) ----------
+ @spaces.GPU(duration=_estimate_duration) # dynamic duration; no-op outside Spaces
+ def _generate_gpu_call(
      prompt: str,
+     negative_prompt: str,
+     seed: Optional[int],
+     width: int,
+     height: int,
+     guidance_scale: float,
+     steps: int,
  ) -> Image.Image:
+     """
+     Runs under a ZeroGPU-allocated context. We move the pipeline to CUDA at the
+     start and back to CPU at the end so that it remains usable when GPU is released.
+     """
+     _p = ensure_pipe()

+     # Move to CUDA with half precision (safe with LCM)
+     _p.to("cuda", torch.float16)
+     _gpu_mem_efficiency(_p)

+     try:
+         # Clamp inputs
+         width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
+         height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
+         steps = int(np.clip(steps, 1, 12))
+         guidance_scale = float(np.clip(guidance_scale, 0.0, 2.0))
+
+         # Deterministic generator on CUDA
+         gen = torch.Generator(device="cuda")
+         if seed is not None:
+             gen = gen.manual_seed(int(seed))
+
+         out = _p(
+             prompt=prompt,
+             negative_prompt=negative_prompt,
+             width=width,
+             height=height,
+             guidance_scale=guidance_scale, # LCM prefers low guidance
+             num_inference_steps=steps,
+             generator=gen,
+             output_type="pil",
+         )
+         return out.images[0]
+     finally:
+         # Always return pipeline to CPU so next non-GPU context is safe
+         try:
+             _p.to("cpu", torch.float32)
+             _p.enable_vae_tiling()
+         except Exception:
+             pass

+ # ---------- Public generate (token gate kept outside GPU context) ----------
  def generate(
      prompt: str,
      negative_prompt: str = "",

  ) -> Image.Image:
      if secret_token != SECRET_TOKEN:
          raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")
+
+     return _generate_gpu_call(
          prompt=prompt,
          negative_prompt=negative_prompt,
          seed=seed,
          width=width,
          height=height,
          guidance_scale=guidance_scale,
+         steps=num_inference_steps,
      )

+ # ---------- Optional warmup (CPU only by default for ZeroGPU) ----------
  def warmup():
      try:
          ensure_pipe()
+         # Tiny CPU warmup to load weights into RAM/cache
+         _ = pipe(
+             prompt="minimal warmup",
+             width=256,
+             height=256,
+             guidance_scale=0.0,
+             num_inference_steps=1,
+             generator=torch.Generator(device="cpu").manual_seed(1),
+             output_type="pil",
+         ).images[0]
+         log.info("CPU warmup complete.")
      except Exception as e:
          log.warning(f"Warmup skipped or failed: {e}")

+ if WARMUP:
+     warmup()

  # ---------- Gradio UI (v5) ----------
  def build_ui() -> gr.Blocks:
      with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         gr.Markdown("## Image Generator (LCM) — SDXL / SSD-1B (ZeroGPU Ready)")

          with gr.Row():
+             prompt = gr.Textbox(label="Prompt", lines=3, placeholder="Describe the image")
              negative = gr.Textbox(label="Negative Prompt", lines=2, placeholder="(optional)")

          with gr.Row():

          inputs = [prompt, negative, seed, width, height, guidance, steps, token]
          run.click(fn=generate, inputs=inputs, outputs=out, concurrency_limit=CONCURRENCY)

          gr.Markdown(
              f"*Backend:* `{MODEL_BACKEND}`   |   "
+             f"*ZeroGPU:* `@spaces.GPU` enabled   |   "
+             f"*Max size:* {MAX_IMAGE_SIZE}px"
          )
      return demo

  # ---------- Launch ----------
  def main():
      demo = build_ui()
      demo.queue(max_size=QUEUE_SIZE, concurrency_count=CONCURRENCY)
      demo.launch(
          server_name="0.0.0.0",
          server_port=PORT,
          show_api=True,
+         ssr_mode=ENABLE_SSR, # Off by default; turn on with ENABLE_SSR=true if needed
          share=False,
          show_error=True,
      )
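
Because the app launches with `show_api=True` and wires `generate` to the click handler, the new ZeroGPU path can be exercised remotely over the Gradio API. A minimal client sketch, assuming a hypothetical Space id `ruslanmv/ai-fast-image-server` and the default auto-generated endpoint name (check `client.view_api()` for the real `api_name` and argument order):

# Client-side sketch; Space id and api_name are assumptions, not part of this commit.
from gradio_client import Client

client = Client("ruslanmv/ai-fast-image-server")  # hypothetical Space id
image_path = client.predict(
    "a cinematic photo of a red fox in snow",  # prompt
    "",                                        # negative_prompt
    42,                                        # seed
    1024,                                      # width
    1024,                                      # height
    0.0,                                       # guidance_scale (LCM works with low/no guidance)
    4,                                         # num_inference_steps
    "default_secret",                          # secret_token (must match SECRET_TOKEN)
    api_name="/predict",                       # assumed default name; confirm with client.view_api()
)
print(image_path)  # local path of the image file returned by the Space

The argument order mirrors the `inputs` list in `build_ui`; the token gate stays server-side, so an invalid `secret_token` raises a `gr.Error` before any GPU is requested.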
requirements.txt CHANGED
@@ -2,7 +2,6 @@ accelerate==0.24.1
  diffusers==0.30.0
  gradio==5.47.2
  huggingface_hub==0.33.5
- invisible-watermark==0.2.0
  Pillow==10.1.0
  torch==2.1.0
  transformers==4.41.0

  diffusers==0.30.0
  gradio==5.47.2
  huggingface_hub==0.33.5
  Pillow==10.1.0
  torch==2.1.0
  transformers==4.41.0
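
The updated requirements keep `torch==2.1.0` but leave `numpy` unpinned, which is why `app.py` carries the "Torch 2.1 + NumPy 2.x is incompatible" guard. A standalone sketch of that same check, assuming the pins above (the exit message and the `numpy==1.26.4` suggestion are illustrative, not part of the commit):

# Sketch of the NumPy/Torch compatibility check app.py performs at import time.
# Torch 2.1 wheels were built against NumPy 1.x, so NumPy 2.x is rejected early.
import sys

import numpy as np
import torch

np_major = int(np.__version__.split(".")[0])
torch_mm = tuple(int(x) for x in torch.__version__.split("+")[0].split(".")[:2])

if torch_mm == (2, 1) and np_major >= 2:
    sys.exit(
        f"Incompatible versions: torch {torch.__version__} with numpy {np.__version__}. "
        "Pin numpy<2 (e.g. numpy==1.26.4) alongside torch==2.1.0."
    )
print("NumPy/Torch versions look compatible.")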