Update memory_management.py
memory_management.py  (+114 -156)
Original file (lines removed by this commit are prefixed with -):

@@ -9,8 +9,9 @@ import platform
 from enum import Enum
 from backend import stream, utils
 from backend.args import args
-


 cpu = torch.device('cpu')

@@ -74,11 +75,6 @@ except:
 if args.always_cpu:
     cpu_state = CPUState.CPU

-def get_current_gpu_id():
-    if torch.cuda.is_available():
-        return torch.cuda.current_device()
-    return None
-

 def is_intel_xpu():
     global cpu_state
@@ -566,66 +562,45 @@ def unload_model_clones(model):


 def free_memory(memory_required, device, keep_loaded=[], free_all=False):
-    [... 15 removed lines not captured ...]

-    [... 2 removed lines not captured ...]
-        print(f"[Unload] Trying to free all memory for {device} with {len(keep_loaded)} models keep loaded ... ", end="")
-    else:
-        print(f"[Unload] Trying to free {memory_required / (1024 * 1024):.2f} MB for {device} with {len(keep_loaded)} models keep loaded ... ", end="")
-
-    offload_everything = ALWAYS_VRAM_OFFLOAD or vram_state == VRAMState.NO_VRAM
-    unloaded_model = False
-    for i in range(len(current_loaded_models) - 1, -1, -1):
-        if not offload_everything:
-            free_memory = get_free_memory(device)
-            print(f"Current free memory is {free_memory / (1024 * 1024):.2f} MB ... ", end="")
-            if free_memory > memory_required:
-                break
-        shift_model = current_loaded_models[i]
-        if shift_model.device == device:
-            if shift_model not in keep_loaded:
-                m = current_loaded_models.pop(i)
-                print(f"Unload model {m.model.model.__class__.__name__} ", end="")
-                m.model_unload()
-                del m
-                unloaded_model = True
-
-    if unloaded_model:
-        soft_empty_cache()
-    else:
-        if vram_state != VRAMState.HIGH_VRAM:
-            mem_free_total, mem_free_torch = get_free_memory(device, torch_free_too=True)
-            if mem_free_torch > mem_free_total * 0.25:
-                soft_empty_cache()
-
-    print('Done.')
-
-    if gpu_id == 0: # First GPU
-        unload_complete.set()
-        # Wait for second GPU to complete unloading
-        while not load_complete.is_set():
-            time.sleep(0.1)
-    else: # Second GPU
-        # Wait for first GPU to complete unloading
-        while not unload_complete.is_set():
-            time.sleep(0.1)
-        load_complete.set()
-    return


 def compute_model_gpu_memory_when_using_cpu_swap(current_free_mem, inference_memory):
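The block removed above coordinated two GPUs with a pair of threading.Event objects and a 0.1 s polling loop. A minimal, self-contained sketch of that handshake pattern, for reference while reading the removal (the worker function and the thread setup are illustrative, not taken from this file):

    import threading
    import time

    unload_complete = threading.Event()
    load_complete = threading.Event()

    def worker(gpu_id):
        # ... unload this GPU's models here ...
        if gpu_id == 0:  # first GPU: signal, then wait for the peer
            unload_complete.set()
            while not load_complete.is_set():
                time.sleep(0.1)
        else:            # second GPU: wait for the peer, then signal
            while not unload_complete.is_set():
                time.sleep(0.1)
            load_complete.set()

    threads = [threading.Thread(target=worker, args=(i,)) for i in (0, 1)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

Event.wait() would avoid the busy polling; the sketch keeps the sleep loop only to mirror the removed code.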
@@ -640,101 +615,84 @@ def compute_model_gpu_memory_when_using_cpu_swap(current_free_mem, inference_memory):


 def load_models_gpu(models, memory_required=0, hard_memory_preservation=0):
-    [... 22 removed lines not captured ...]
-        if loaded_model in current_loaded_models:
-            index = current_loaded_models.index(loaded_model)
-            current_loaded_models.insert(0, current_loaded_models.pop(index))
-            models_already_loaded.append(loaded_model)
-        else:
-            models_to_load.append(loaded_model)
-
-    if len(models_to_load) == 0:
-        devs = set(map(lambda a: a.device, models_already_loaded))
-        for d in devs:
-            if d != torch.device("cpu"):
-                free_memory(memory_to_free, d, models_already_loaded)
-
-        moving_time = time.perf_counter() - execution_start_time
-        if moving_time > 0.1:
-            print(f'Memory cleanup has taken {moving_time:.2f} seconds')
-
-        return
-
-    for loaded_model in models_to_load:
-        unload_model_clones(loaded_model.model)
-
-    total_memory_required = {}
-    for loaded_model in models_to_load:
-        loaded_model.compute_inclusive_exclusive_memory()
-        total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.exclusive_memory + loaded_model.inclusive_memory * 0.25
-
-    for device in total_memory_required:
-        if device != torch.device("cpu"):
-            free_memory(total_memory_required[device] * 1.3 + memory_to_free, device, models_already_loaded)
-
-    for loaded_model in models_to_load:
-        model = loaded_model.model
-        torch_dev = model.load_device
-        if is_device_cpu(torch_dev):
-            vram_set_state = VRAMState.DISABLED
-        else:
-            vram_set_state = vram_state
-
-        model_gpu_memory_when_using_cpu_swap = -1
-
-        if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM):
-            model_require = loaded_model.exclusive_memory
-            previously_loaded = loaded_model.inclusive_memory
-            current_free_mem = get_free_memory(torch_dev)
-            estimated_remaining_memory = current_free_mem - model_require - memory_for_inference
-
-            print(f"[Memory Management] Target: {loaded_model.model.model.__class__.__name__}, Free GPU: {current_free_mem / (1024 * 1024):.2f} MB, Model Require: {model_require / (1024 * 1024):.2f} MB, Previously Loaded: {previously_loaded / (1024 * 1024):.2f} MB, Inference Require: {memory_for_inference / (1024 * 1024):.2f} MB, Remaining: {estimated_remaining_memory / (1024 * 1024):.2f} MB, ", end="")
-
-            if estimated_remaining_memory < 0:
-                vram_set_state = VRAMState.LOW_VRAM
-                model_gpu_memory_when_using_cpu_swap = compute_model_gpu_memory_when_using_cpu_swap(current_free_mem, memory_for_inference)
-                if previously_loaded > 0:
-                    model_gpu_memory_when_using_cpu_swap = previously_loaded
-
-        if vram_set_state == VRAMState.NO_VRAM:
-            model_gpu_memory_when_using_cpu_swap = 0
-
-        loaded_model.model_load(model_gpu_memory_when_using_cpu_swap)
-        current_loaded_models.insert(0, loaded_model)

     moving_time = time.perf_counter() - execution_start_time
-    [... 3 removed lines not captured ...]
-        current_gpu_id = 1 # Signal second GPU to start
-    else: # Second GPU
-        # Reset synchronization
-        current_gpu_id = None
-        unload_complete.clear()
-        load_complete.clear()
     return


 def load_model_gpu(model):
     return load_models_gpu([model])

Updated file (lines added by this commit are prefixed with +):

@@ -9,8 +9,9 @@ import platform
 from enum import Enum
 from backend import stream, utils
 from backend.args import args
+import threading

+global_model_lock = threading.Lock()

 cpu = torch.device('cpu')

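global_model_lock serializes every caller that enters the memory-management routines changed below. A minimal sketch of the pattern, with guarded_section standing in for free_memory / load_models_gpu (the function name and print statements are illustrative):

    import threading

    global_model_lock = threading.Lock()

    def guarded_section(name):
        # Only one thread at a time executes the body; the others block on acquire.
        with global_model_lock:
            print(f"{name}: holding the lock")
            # ... move models between CPU and GPU here ...

    threads = [threading.Thread(target=guarded_section, args=(f"caller-{i}",)) for i in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

One caveat worth checking against the later hunks: threading.Lock is not reentrant, and the updated load_models_gpu calls free_memory while it already holds global_model_lock, so that inner acquire would block the calling thread; threading.RLock is the usual choice when the same thread may re-enter.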
@@ -74,11 +75,6 @@ except:
 if args.always_cpu:
     cpu_state = CPUState.CPU


 def is_intel_xpu():
     global cpu_state
@@ -566,66 +562,45 @@ def unload_model_clones(model):


 def free_memory(memory_required, device, keep_loaded=[], free_all=False):
+    with global_model_lock:
+        # this check fully unloads any 'abandoned' models
+        for i in range(len(current_loaded_models) - 1, -1, -1):
+            if sys.getrefcount(current_loaded_models[i].model) <= 2:
+                current_loaded_models.pop(i).model_unload(avoid_model_moving=True)
+
+        if free_all:
+            memory_required = 1e30
+            print(f"[Unload] Trying to free all memory for {device} with {len(keep_loaded)} models keep loaded ... ", end="")
+        else:
+            print(f"[Unload] Trying to free {memory_required / (1024 * 1024):.2f} MB for {device} with {len(keep_loaded)} models keep loaded ... ", end="")
+
+        offload_everything = ALWAYS_VRAM_OFFLOAD or vram_state == VRAMState.NO_VRAM
+        unloaded_model = False
+        for i in range(len(current_loaded_models) - 1, -1, -1):
+            if not offload_everything:
+                free_memory = get_free_memory(device)
+                print(f"Current free memory is {free_memory / (1024 * 1024):.2f} MB ... ", end="")
+                if free_memory > memory_required:
+                    break
+            shift_model = current_loaded_models[i]
+            if shift_model.device == device:
+                if shift_model not in keep_loaded:
+                    m = current_loaded_models.pop(i)
+                    print(f"Unload model {m.model.model.__class__.__name__} ", end="")
+                    m.model_unload()
+                    del m
+                    unloaded_model = True
+
+        if unloaded_model:
+            soft_empty_cache()
+        else:
+            if vram_state != VRAMState.HIGH_VRAM:
+                mem_free_total, mem_free_torch = get_free_memory(device, torch_free_too=True)
+                if mem_free_torch > mem_free_total * 0.25:
+                    soft_empty_cache()

+        print('Done.')
+    return


 def compute_model_gpu_memory_when_using_cpu_swap(current_free_mem, inference_memory):
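The new "abandoned model" check leans on a CPython detail: sys.getrefcount counts the temporary reference created by its own argument, so a count of 2 means the tracked entry is the only remaining owner. A small illustration, assuming a throwaway Dummy class:

    import sys

    class Dummy:
        pass

    tracked = [Dummy()]

    # One reference from the list plus the temporary argument reference
    # inside getrefcount itself -> 2 for an otherwise "abandoned" object.
    print(sys.getrefcount(tracked[0]))  # typically 2

    external = tracked[0]               # an outside reference keeps it alive
    print(sys.getrefcount(tracked[0]))  # typically 3

    del external
    print(sys.getrefcount(tracked[0]))  # back to 2

That is why the threshold in the diff is <= 2: anything above it means some caller still holds the model.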
@@ -640,101 +615,84 @@ def compute_model_gpu_memory_when_using_cpu_swap(current_free_mem, inference_memory):


 def load_models_gpu(models, memory_required=0, hard_memory_preservation=0):
+    with global_model_lock: # Add this line
+        global vram_state
+
+        execution_start_time = time.perf_counter()
+        memory_to_free = max(minimum_inference_memory(), memory_required) + hard_memory_preservation
+        memory_for_inference = minimum_inference_memory() + hard_memory_preservation
+
+        models_to_load = []
+        models_already_loaded = []
+        for x in models:
+            loaded_model = LoadedModel(x)
+
+            if loaded_model in current_loaded_models:
+                index = current_loaded_models.index(loaded_model)
+                current_loaded_models.insert(0, current_loaded_models.pop(index))
+                models_already_loaded.append(loaded_model)
+            else:
+                models_to_load.append(loaded_model)
+
+        if len(models_to_load) == 0:
+            devs = set(map(lambda a: a.device, models_already_loaded))
+            for d in devs:
+                if d != torch.device("cpu"):
+                    free_memory(memory_to_free, d, models_already_loaded)

             moving_time = time.perf_counter() - execution_start_time
+            if moving_time > 0.1:
+                print(f'Memory cleanup has taken {moving_time:.2f} seconds')
+
             return

+        for loaded_model in models_to_load:
+            unload_model_clones(loaded_model.model)
+
+        total_memory_required = {}
+        for loaded_model in models_to_load:
+            loaded_model.compute_inclusive_exclusive_memory()
+            total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.exclusive_memory + loaded_model.inclusive_memory * 0.25
+
+        for device in total_memory_required:
+            if device != torch.device("cpu"):
+                free_memory(total_memory_required[device] * 1.3 + memory_to_free, device, models_already_loaded)
+
+        for loaded_model in models_to_load:
+            model = loaded_model.model
+            torch_dev = model.load_device
+            if is_device_cpu(torch_dev):
+                vram_set_state = VRAMState.DISABLED
+            else:
+                vram_set_state = vram_state
+
+            model_gpu_memory_when_using_cpu_swap = -1
+
+            if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM):
+                model_require = loaded_model.exclusive_memory
+                previously_loaded = loaded_model.inclusive_memory
+                current_free_mem = get_free_memory(torch_dev)
+                estimated_remaining_memory = current_free_mem - model_require - memory_for_inference
+
+                print(f"[Memory Management] Target: {loaded_model.model.model.__class__.__name__}, Free GPU: {current_free_mem / (1024 * 1024):.2f} MB, Model Require: {model_require / (1024 * 1024):.2f} MB, Previously Loaded: {previously_loaded / (1024 * 1024):.2f} MB, Inference Require: {memory_for_inference / (1024 * 1024):.2f} MB, Remaining: {estimated_remaining_memory / (1024 * 1024):.2f} MB, ", end="")
+
+                if estimated_remaining_memory < 0:
+                    vram_set_state = VRAMState.LOW_VRAM
+                    model_gpu_memory_when_using_cpu_swap = compute_model_gpu_memory_when_using_cpu_swap(current_free_mem, memory_for_inference)
+                    if previously_loaded > 0:
+                        model_gpu_memory_when_using_cpu_swap = previously_loaded
+
+            if vram_set_state == VRAMState.NO_VRAM:
+                model_gpu_memory_when_using_cpu_swap = 0
+
+            loaded_model.model_load(model_gpu_memory_when_using_cpu_swap)
+            current_loaded_models.insert(0, loaded_model)
+
+        moving_time = time.perf_counter() - execution_start_time
+        print(f'Moving model(s) has taken {moving_time:.2f} seconds')
+
+        return
+

 def load_model_gpu(model):
     return load_models_gpu([model])
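For each target device, the loader budgets exclusive_memory plus a quarter of inclusive_memory per model, asks free_memory for 1.3x that total plus memory_to_free, and falls back to CPU swap when the estimated remainder goes negative. A worked sketch of that arithmetic; all byte figures are made up for the example:

    # Illustrative numbers only (bytes); not measurements from this file.
    MB = 1024 * 1024
    exclusive_memory = 4000 * MB      # weights that still have to be moved to the GPU
    inclusive_memory = 1000 * MB      # weights already resident on the GPU
    memory_to_free = 1024 * MB        # max(minimum_inference_memory(), memory_required) + preservation
    memory_for_inference = 1024 * MB  # minimum_inference_memory() + preservation

    total_memory_required = exclusive_memory + inclusive_memory * 0.25
    print(f"free_memory() is asked for {(total_memory_required * 1.3 + memory_to_free) / MB:.2f} MB")

    current_free_mem = 4500 * MB      # as reported by get_free_memory(torch_dev)
    estimated_remaining_memory = current_free_mem - exclusive_memory - memory_for_inference
    if estimated_remaining_memory < 0:
        print("Not enough VRAM: switch to LOW_VRAM and compute a CPU-swap budget")
    else:
        print("Model fits: load it fully on the GPU")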