TestingwithNeg

Running on Zero

App Files Files Community

dagloop5 committed on Mar 17

Commit

0dd62b1

verified ·

1 Parent(s): 8002597

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -92

app.py CHANGED Viewed

@@ -40,6 +40,7 @@ torch._dynamo.config.disable = True
 import spaces
 import gradio as gr
 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
 from ltx_core.components.diffusion_steps import EulerDiffusionStep
@@ -265,6 +266,17 @@ print("=" * 80)
 print("Downloading LTX-2.3 distilled model + Gemma...")
 print("=" * 80)
 checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled.safetensors")
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
 gemma_root = snapshot_download(repo_id=GEMMA_REPO)
@@ -382,7 +394,7 @@ def generate_video(
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
-        global pipeline   # <<< ADD THIS LINE HERE (VERY TOP of try block)
         torch.cuda.reset_peak_memory_stats()
         log_memory("start")
@@ -417,75 +429,52 @@ def generate_video(
         tiling_config = TilingConfig.default()
         video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
-        # >>> RUNTIME LoRA application (robust, multi-fallback)
-        # We cannot rely on mutating the original descriptor (some implementations are immutable),
-        # so create a fresh runtime descriptor and try multiple ways to install it.
-        runtime_strength = float(lora_strength)
-        replaced = False
-        # 1) Try simple approach: build a new LoraPathStrengthAndSDOps
-        runtime_lora = LoraPathStrengthAndSDOps(lora_path, runtime_strength, LTXV_LORA_COMFY_RENAMING_MAP)
-        print(f"[LoRA] attempting to apply runtime LoRA (strength={runtime_strength})")
-        # Try a few likely places to replace the descriptor used by the pipeline/ledger.
-        try:
-            # common attribute on pipeline
-            if hasattr(pipeline, "loras"):
-                try:
-                    pipeline.loras = [runtime_lora]
-                    replaced = True
-                    print("[LoRA] replaced pipeline.loras")
-                except Exception as e:
-                    print(f"[LoRA] pipeline.loras assignment failed: {e}")
-        except Exception:
-            pass
-        try:
-            # common attribute on the model ledger
-            if hasattr(pipeline, "model_ledger") and hasattr(pipeline.model_ledger, "loras"):
                 try:
-                    pipeline.model_ledger.loras = [runtime_lora]
-                    replaced = True
-                    print("[LoRA] replaced pipeline.model_ledger.loras")
-                except Exception as e:
-                    print(f"[LoRA] pipeline.model_ledger.loras assignment failed: {e}")
-        except Exception:
-            pass
-        try:
-            # some internals use a private _loras list
-            if hasattr(pipeline, "model_ledger") and hasattr(pipeline.model_ledger, "_loras"):
-                try:
-                    pipeline.model_ledger._loras = [runtime_lora]
-                    replaced = True
-                    print("[LoRA] replaced pipeline.model_ledger._loras")
-                except Exception as e:
-                    print(f"[LoRA] pipeline.model_ledger._loras assignment failed: {e}")
-        except Exception:
-            pass
-        # 2) If we succeeded replacing the descriptor in-place, clear transformer cache so it will rebuild
-        if replaced:
-            try:
-                if hasattr(pipeline.model_ledger, "_transformer"):
-                    pipeline.model_ledger._transformer = None
-                # also clear potential caches named similar to 'transformer_cache' if present
-                if hasattr(pipeline.model_ledger, "transformer_cache"):
                     try:
-                        pipeline.model_ledger.transformer_cache = {}
                     except Exception:
                         pass
-                print("[LoRA] in-place descriptor replacement done; transformer cache cleared")
-            except Exception as e:
-                print(f"[LoRA] replacement succeeded but cache clearing failed: {e}")
-        # 3) FINAL FALLBACK - if none of the in-place replacements worked, rebuild the pipeline
-        if not replaced:
-            print("[LoRA] in-place replacement FAILED; rebuilding pipeline with runtime LoRA (this is slow)")
-            try:
-                # Rebuild pipeline object with the new LoRA descriptor
-                # NOTE: this replaces the global `pipeline`. We must declare global to reassign it.
-                pipeline = LTX23DistilledA2VPipeline(
                     distilled_checkpoint_path=checkpoint_path,
                     spatial_upsampler_path=spatial_upsampler_path,
                     gemma_root=gemma_root,
@@ -493,35 +482,50 @@ def generate_video(
                     quantization=QuantizationPolicy.fp8_cast(),
                 )
-                # After rebuilding, we *do not* re-run the original module-level preloads here,
-                # because re-pinning may be complex; the rebuilt pipeline will construct its
-                # own ledger as part of the first call. This is slower but reliable.
-                # Clear any transformer caches if they exist on the new ledger as well.
                 try:
-                    if hasattr(pipeline.model_ledger, "_transformer"):
-                        pipeline.model_ledger._transformer = None
-                except Exception:
-                    pass
-                print("[LoRA] pipeline rebuilt with runtime LoRA")
-            except Exception as e:
-                print(f"[LoRA] pipeline rebuild FAILED: {e}")
-        # Reset transformer so next call rebuilds it with new LoRA strength (NO preloading!)
-        try:
-            if hasattr(pipeline, "model_ledger"):
-                if hasattr(pipeline.model_ledger, "_transformer"):
-                    del pipeline.model_ledger._transformer
-                    pipeline.model_ledger._transformer = None
-            # CRITICAL: force cleanup BEFORE rebuild happens
-            cleanup_memory()
-            torch.cuda.empty_cache()
-            print("[LoRA] transformer reset; will rebuild during inference")
-        except Exception as e:
-            print(f"[LoRA] transformer reset failed: {e}")
         log_memory("before pipeline call")
         video, audio = pipeline(

 import spaces
 import gradio as gr
 import numpy as np
+from collections import OrderedDict
 from huggingface_hub import hf_hub_download, snapshot_download
 from ltx_core.components.diffusion_steps import EulerDiffusionStep
 print("Downloading LTX-2.3 distilled model + Gemma...")
 print("=" * 80)
+# ----------------------------
+# Pipeline cache for LoRA strengths (keeps at most 2 pipelines to limit VRAM)
+# ----------------------------
+# Cache keys are LoRA strengths rounded to 2 decimal places, matching how requested strengths are rounded before lookup
+pipeline_cache: OrderedDict[float, LTX23DistilledA2VPipeline] = OrderedDict()
+# Record the current pipeline's LoRA strength (the module-level pipeline is built with the default lora_descriptor strength of 1.0)
+current_lora_strength: float = round(1.0, 2)
+pipeline_cache[current_lora_strength] = pipeline
+CACHE_MAX_SIZE = 2  # keep at most two pipeline instances in memory
+print(f"[CACHE] initialized pipeline cache with strength={current_lora_strength}")
 checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled.safetensors")
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
 gemma_root = snapshot_download(repo_id=GEMMA_REPO)
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
+        global pipeline, pipeline_cache, current_lora_strength
         torch.cuda.reset_peak_memory_stats()
         log_memory("start")
         tiling_config = TilingConfig.default()
         video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
+        # ----------------------------
+        # Pipeline-per-strength (small LRU cache) — safe, deterministic LoRA switching
+        # ----------------------------
+        # Globals used: pipeline, pipeline_cache, current_lora_strength, CACHE_MAX_SIZE
+        requested_strength = round(float(lora_strength), 2)
+        # fast-path: same strength currently loaded
+        if requested_strength == current_lora_strength:
+            print(f"[LoRA] requested strength {requested_strength} == current {current_lora_strength} -> using current pipeline")
+        else:
+            print(f"[LoRA] requested strength {requested_strength} != current {current_lora_strength}")
+            # if cached, swap to that pipeline (move to end to mark as recently used)
+            if requested_strength in pipeline_cache:
+                print(f"[LoRA] using cached pipeline for strength={requested_strength}")
+                # set pipeline to cached instance & mark as most-recently-used
+                cached = pipeline_cache.pop(requested_strength)
+                pipeline_cache[requested_strength] = cached
+                pipeline = cached
+                current_lora_strength = requested_strength
+            else:
+                # Build new pipeline for requested strength.
+                print(f"[LoRA] building new pipeline for strength={requested_strength} (this will free and reallocate memory)")
+                # Free the previous pipeline & its GPU memory BEFORE building the new one
                 try:
+                    # remove previous pipeline from cache (if present)
+                    if current_lora_strength in pipeline_cache:
+                        pipeline_cache.pop(current_lora_strength, None)
+                    # delete current pipeline object reference
                     try:
+                        del pipeline
                     except Exception:
                         pass
+                    # aggressively free memory
+                    cleanup_memory()
+                    torch.cuda.empty_cache()
+                    print("[LoRA] freed memory, starting pipeline build")
+                except Exception as e:
+                    print(f"[LoRA] error while freeing old pipeline: {e}")
+                # create a runtime LoRA descriptor and build a fresh pipeline
+                runtime_lora = LoraPathStrengthAndSDOps(lora_path, float(requested_strength), LTXV_LORA_COMFY_RENAMING_MAP)
+                new_pipeline = LTX23DistilledA2VPipeline(
                     distilled_checkpoint_path=checkpoint_path,
                     spatial_upsampler_path=spatial_upsampler_path,
                     gemma_root=gemma_root,
                     quantization=QuantizationPolicy.fp8_cast(),
                 )
+                # Pin safe components (same preloads as original) so heavy parts remain stable.
                 try:
+                    ledger = new_pipeline.model_ledger
+                    _video_encoder = ledger.video_encoder()
+                    _video_decoder = ledger.video_decoder()
+                    _audio_encoder = ledger.audio_encoder()
+                    _audio_decoder = ledger.audio_decoder()
+                    _vocoder = ledger.vocoder()
+                    _spatial_upsampler = ledger.spatial_upsampler()
+                    _text_encoder = ledger.text_encoder()
+                    _embeddings_processor = ledger.gemma_embeddings_processor()
+                    ledger.video_encoder = lambda: _video_encoder
+                    ledger.video_decoder = lambda: _video_decoder
+                    ledger.audio_encoder = lambda: _audio_encoder
+                    ledger.audio_decoder = lambda: _audio_decoder
+                    ledger.vocoder = lambda: _vocoder
+                    ledger.spatial_upsampler = lambda: _spatial_upsampler
+                    ledger.text_encoder = lambda: _text_encoder
+                    ledger.gemma_embeddings_processor = lambda: _embeddings_processor
+                    print("[LoRA] new pipeline preloaded and pinned safe components")
+                except Exception as e:
+                    print(f"[LoRA] warning preloading pinned components failed: {e}")
+                # Set as current pipeline and cache it
+                pipeline = new_pipeline
+                pipeline_cache[requested_strength] = pipeline
+                current_lora_strength = requested_strength
+                # Evict oldest if cache size exceeded
+                try:
+                    while len(pipeline_cache) > CACHE_MAX_SIZE:
+                        evicted_strength, evicted_pipeline = pipeline_cache.popitem(last=False)
+                        try:
+                            del evicted_pipeline
+                        except Exception:
+                            pass
+                        cleanup_memory()
+                        torch.cuda.empty_cache()
+                        print(f"[CACHE] evicted pipeline strength={evicted_strength}")
+                except Exception as e:
+                    print(f"[CACHE] eviction error: {e}")
+        # end of pipeline-per-strength swap/build
         log_memory("before pipeline call")
         video, audio = pipeline(