Testing3

Runtime error

App Files Files Community

dagloop5 committed on Mar 18

Commit

1e66590

verified ·

1 Parent(s): 7c85561

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -43

app.py CHANGED Viewed

@@ -297,29 +297,57 @@ def build_loras_tuple(pose_strength: float, general_strength: float, motion_stre
 # initial strengths (you can change defaults)
 INITIAL_LORAS = build_loras_tuple(1.0, 1.0, 1.0)
-# Initialize pipeline WITH text encoder and optional audio support
 pipeline = LTX23DistilledA2VPipeline(
     distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
     loras=INITIAL_LORAS,
-    quantization=QuantizationPolicy.fp8_cast(),
 )
-# Preload all models for ZeroGPU tensor packing.
-print("Preloading all models (including Gemma and audio components)...")
 ledger = pipeline.model_ledger
-_transformer = ledger.transformer()
-_video_encoder = ledger.video_encoder()
-_video_decoder = ledger.video_decoder()
-_audio_encoder = ledger.audio_encoder()
-_audio_decoder = ledger.audio_decoder()
-_vocoder = ledger.vocoder()
-_spatial_upsampler = ledger.spatial_upsampler()
-_text_encoder = ledger.text_encoder()
-_embeddings_processor = ledger.gemma_embeddings_processor()
-print("All models preloaded (including Gemma text encoder and audio encoder)!")
 print("=" * 80)
 print("Pipeline ready!")
@@ -327,11 +355,23 @@ print("=" * 80)
 def log_memory(tag: str):
-    if torch.cuda.is_available():
-        allocated = torch.cuda.memory_allocated() / 1024**3
-        peak = torch.cuda.max_memory_allocated() / 1024**3
-        free, total = torch.cuda.mem_get_info()
-        print(f"[VRAM {tag}] allocated={allocated:.2f}GB peak={peak:.2f}GB free={free / 1024**3:.2f}GB total={total / 1024**3:.2f}GB")
 def detect_aspect_ratio(image) -> str:
@@ -397,31 +437,49 @@ def generate_video(
             requested_strengths = (float(pose_lora_strength), float(general_lora_strength), float(motion_lora_strength))
             if _get_current_strengths(current_ledger) != requested_strengths:
-                # build new tuple and replace ledger.loras
                 current_ledger.loras = build_loras_tuple(*requested_strengths)
-                # clear cached model instances so new models are constructed with the new LoRAs
-                # (ModelLedger builds models on first access using its configured `loras`)
-                try:
-                    current_ledger.clear_vram()
-                except Exception:
-                    # `clear_vram` should exist; if it doesn't, fall back to deleting cached attrs
-                    for k in list(vars(current_ledger).keys()):
-                        if k in ("_transformer", "_video_encoder", "_video_decoder", "_audio_encoder", "_audio_decoder", "_vocoder", "_spatial_upsampler", "_text_encoder", "_gemma_embeddings_processor"):
-                            vars(current_ledger).pop(k, None)
-                # Now pre-load the models again (ensures they are on-device before pipeline call)
-                _ = current_ledger.transformer()
-                _ = current_ledger.video_encoder()
-                _ = current_ledger.video_decoder()
-                _ = current_ledger.audio_encoder()
-                _ = current_ledger.audio_decoder()
-                _ = current_ledger.vocoder()
-                _ = current_ledger.spatial_upsampler()
-                _ = current_ledger.text_encoder()
-                _ = current_ledger.gemma_embeddings_processor()
-                torch.cuda.empty_cache()
         except Exception as e:
-            # if this fails, we still proceed with the existing pipeline (safer to continue than to crash)
-            print(f"[LoRA rebuild warning] Could not update LoRA strengths in-place: {e}")
         # --- end LoRA update ---
         frame_rate = DEFAULT_FRAME_RATE

 # initial strengths (you can change defaults)
 INITIAL_LORAS = build_loras_tuple(1.0, 1.0, 1.0)
+# --- REPLACE pipeline creation with CUDA-aware quantization ---
+use_cuda = torch.cuda.is_available()
+print(f"[INFO] torch.cuda.is_available() = {use_cuda}")
+# Only enable FP8 quantization if CUDA is present (FP8 uses Triton/CUDA kernels).
+# If QuantizationPolicy defines a no-op or 'none' option, use it; otherwise omit the arg.
+quant = None
+if use_cuda:
+    quant = QuantizationPolicy.fp8_cast()
+else:
+    # try to use a 'none' policy if available; otherwise we'll omit quantization
+    quant = getattr(QuantizationPolicy, "none", None)
+quant_kwargs = {}
+if quant is not None:
+    quant_kwargs["quantization"] = quant
 pipeline = LTX23DistilledA2VPipeline(
     distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
     loras=INITIAL_LORAS,
+    **quant_kwargs,
 )
+# --- end replace ---
+# --- REPLACE preload block with CUDA-aware version ---
+print("Preloading models (GPU preloads only if CUDA is available)...")
 ledger = pipeline.model_ledger
+if torch.cuda.is_available():
+    try:
+        # Preload models (this will trigger GPU-side building; only do this when CUDA is present)
+        _transformer = ledger.transformer()
+        _video_encoder = ledger.video_encoder()
+        _video_decoder = ledger.video_decoder()
+        _audio_encoder = ledger.audio_encoder()
+        _audio_decoder = ledger.audio_decoder()
+        _vocoder = ledger.vocoder()
+        _spatial_upsampler = ledger.spatial_upsampler()
+        _text_encoder = ledger.text_encoder()
+        _embeddings_processor = ledger.gemma_embeddings_processor()
+        print("All models preloaded onto GPU (Gemma text encoder and audio encoder included).")
+    except Exception as e:
+        # If FP8/Triton or other GPU initialization fails, print warning and continue in safe (lazy) mode.
+        print(f"[WARNING] Failed to preload GPU models at startup: {type(e).__name__}: {e}")
+        print("[WARNING] Falling back to lazy model loading / reduced quantization (if possible).")
+else:
+    # No CUDA — do not attempt GPU preloads that will invoke Triton kernels.
+    print("[INFO] No CUDA device detected — skipping GPU preloads. Models will be loaded lazily (CPU).")
+# --- end replace ---
 print("=" * 80)
 print("Pipeline ready!")
 def log_memory(tag: str):
+    try:
+        if torch.cuda.is_available():
+            allocated = torch.cuda.memory_allocated() / 1024**3
+            peak = torch.cuda.max_memory_allocated() / 1024**3
+            try:
+                free, total = torch.cuda.mem_get_info()
+                free_gb = free / 1024**3
+                total_gb = total / 1024**3
+            except Exception:
+                free_gb = total_gb = 0.0
+            print(f"[VRAM {tag}] allocated={allocated:.2f}GB peak={peak:.2f}GB free={free_gb:.2f}GB total={total_gb:.2f}GB")
+        else:
+            # Basic CPU fallback logging
+            print(f"[VRAM {tag}] CUDA not available — running on CPU.")
+    except Exception as e:
+        # Defensive: don't let logging crash the app
+        print(f"[log_memory error] {type(e).__name__}: {e}")
 def detect_aspect_ratio(image) -> str:
             requested_strengths = (float(pose_lora_strength), float(general_lora_strength), float(motion_lora_strength))
             if _get_current_strengths(current_ledger) != requested_strengths:
+                # replace ledger.loras with a tuple built from the new strengths
                 current_ledger.loras = build_loras_tuple(*requested_strengths)
+                if torch.cuda.is_available():
+                    # Only try to clear VRAM and rebuild on GPU-enabled hosts
+                    try:
+                        current_ledger.clear_vram()
+                    except Exception:
+                        # Fallback: remove cached attributes to force rebuild on next access
+                        for k in list(vars(current_ledger).keys()):
+                            if k in (
+                                "_transformer",
+                                "_video_encoder",
+                                "_video_decoder",
+                                "_audio_encoder",
+                                "_audio_decoder",
+                                "_vocoder",
+                                "_spatial_upsampler",
+                                "_text_encoder",
+                                "_gemma_embeddings_processor",
+                            ):
+                                vars(current_ledger).pop(k, None)
+                    # Preload the models again on GPU so they're available before pipeline call
+                    try:
+                        _ = current_ledger.transformer()
+                        _ = current_ledger.video_encoder()
+                        _ = current_ledger.video_decoder()
+                        _ = current_ledger.audio_encoder()
+                        _ = current_ledger.audio_decoder()
+                        _ = current_ledger.vocoder()
+                        _ = current_ledger.spatial_upsampler()
+                        _ = current_ledger.text_encoder()
+                        _ = current_ledger.gemma_embeddings_processor()
+                        torch.cuda.empty_cache()
+                    except Exception as e:
+                        print(f"[LoRA preload warning] Failed to preload models after LoRA change: {type(e).__name__}: {e}")
+                        # continue — the pipeline will attempt to build when called
+                else:
+                    # No CUDA: we updated the ledger.loras but won't attempt GPU preloads.
+                    print("[INFO] LoRA strengths updated (CPU-only; models will be applied lazily).")
         except Exception as e:
+            # if this fails, proceed with the existing pipeline (safer to continue than to crash)
+            print(f"[LoRA rebuild warning] Could not update LoRA strengths in-place: {type(e).__name__}: {e}")
         # --- end LoRA update ---
         frame_rate = DEFAULT_FRAME_RATE