zetavg committed
Commit 6148b7c
1 Parent(s): 0054cc5

improve speed of switching models by offloading unused ones to cpu ram instead of unloading

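In short: when the user switches base models, the previously active model is no longer deleted; its weights are moved to CPU RAM and moved back to the GPU the next time it is selected, which avoids re-reading the checkpoint from disk. A minimal sketch of that pattern (the `switch_to` helper and the plain-dict cache are illustrative only, not this repository's API; it assumes PyTorch/transformers-style models that expose a `.device` attribute and a CUDA device):

# Illustrative sketch only, not the repository's implementation.
import gc
import torch


def switch_to(cache, active_key):
    # Offload every cached model except the active one to CPU RAM.
    for key, model in cache.items():
        if key != active_key and model.device.type != 'cpu':
            cache[key] = model.to('cpu')
    gc.collect()
    torch.cuda.empty_cache()  # hand the freed VRAM back to the driver
    # Moving weights back from CPU RAM is much faster than reloading from disk.
    cache[active_key] = cache[active_key].to('cuda')
    return cache[active_key]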
llama_lora/globals.py CHANGED
@@ -1,5 +1,7 @@
 import os
 import subprocess
+import psutil
+import math
 
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -7,6 +9,7 @@ from numba import cuda
 import nvidia_smi
 
 from .utils.lru_cache import LRUCache
+from .utils.model_lru_cache import ModelLRUCache
 from .lib.finetune import train
 
 
@@ -34,7 +37,7 @@ class Global:
     generation_force_stopped_at = None
 
     # Model related
-    loaded_models = LRUCache(1)
+    loaded_models = ModelLRUCache(1)
     loaded_tokenizers = LRUCache(1)
     new_base_model_that_is_ready_to_be_used = None
     name_of_new_base_model_that_is_ready_to_be_used = None
@@ -89,6 +92,7 @@ if commit_hash:
 
 
 def load_gpu_info():
+    print("")
     try:
         cc_cores_per_SM_dict = {
             (2, 0): 32,
@@ -135,8 +139,20 @@ def load_gpu_info():
             f"GPU total memory: {total_memory} bytes ({total_memory_mb:.2f} MB) ({total_memory_gb:.2f} GB)")
         Global.gpu_total_memory = total_memory
 
+        available_cpu_ram = psutil.virtual_memory().available
+        available_cpu_ram_mb = available_cpu_ram / (1024 ** 2)
+        available_cpu_ram_gb = available_cpu_ram / (1024 ** 3)
+        print(
+            f"CPU available memory: {available_cpu_ram} bytes ({available_cpu_ram_mb:.2f} MB) ({available_cpu_ram_gb:.2f} GB)")
+        preserve_loaded_models_count = math.floor((available_cpu_ram * 0.8) / total_memory) - 1
+        if preserve_loaded_models_count > 1:
+            print(f"Will keep {preserve_loaded_models_count} offloaded models in CPU RAM.")
+            Global.loaded_models = ModelLRUCache(preserve_loaded_models_count)
+            Global.loaded_tokenizers = LRUCache(preserve_loaded_models_count)
+
     except Exception as e:
         print(f"Notice: cannot get GPU info: {e}")
 
+    print("")
 
 load_gpu_info()
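The cache size is derived from available CPU RAM in `load_gpu_info()` above: it budgets 80% of free RAM, uses the GPU's total memory as a rough per-model size estimate, and subtracts one slot of headroom; the resize only happens when the result is greater than 1. A worked example with hypothetical numbers:

import math

# Hypothetical machine: 128 GiB of free CPU RAM and a 24 GiB GPU.
available_cpu_ram = 128 * (1024 ** 3)
total_memory = 24 * (1024 ** 3)

preserve_loaded_models_count = math.floor((available_cpu_ram * 0.8) / total_memory) - 1
print(preserve_loaded_models_count)  # 3 -> caches are resized to hold 3 entries

# With only 32 GiB free the result is 0, so the default capacity of 1 is kept.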
llama_lora/ui/main_page.py CHANGED
@@ -136,7 +136,6 @@ def main_page():
           const tokenizer_name = current_tokenizer_hint_elem && current_tokenizer_hint_elem.innerText;
 
           if (tokenizer_name && tokenizer_name !== base_model_name) {
-            document.querySelector('#global_tokenizer_select input').value = tokenizer_name;
             const btn = document.getElementById('use_custom_tokenizer_btn');
             if (btn) btn.click();
           }
llama_lora/utils/model_lru_cache.py ADDED
@@ -0,0 +1,68 @@
+from collections import OrderedDict
+import gc
+import torch
+from ..lib.get_device import get_device
+
+device_type = get_device()
+
+
+class ModelLRUCache:
+    def __init__(self, capacity=5):
+        self.cache = OrderedDict()
+        self.capacity = capacity
+
+    def get(self, key):
+        if key in self.cache:
+            # Move the accessed item to the end of the OrderedDict
+            self.cache.move_to_end(key)
+
+            models_did_move = False
+            for k, m in self.cache.items():
+                if key != k and m.device.type != 'cpu':
+                    models_did_move = True
+                    self.cache[k] = m.to('cpu')
+
+            if models_did_move:
+                gc.collect()
+                # if not shared.args.cpu: # will not be running on CPUs anyway
+                with torch.no_grad():
+                    torch.cuda.empty_cache()
+
+            model = self.cache[key]
+
+            if (model.device.type != device_type or
+                    hasattr(model, "model") and
+                    model.model.device.type != device_type):
+                model = model.to(device_type)
+
+            return model
+        return None
+
+    def set(self, key, value):
+        if key in self.cache:
+            # If the key already exists, update its value
+            self.cache[key] = value
+        else:
+            # If the cache has reached its capacity, remove the least recently used item
+            if len(self.cache) >= self.capacity:
+                self.cache.popitem(last=False)
+            self.cache[key] = value
+
+    def clear(self):
+        self.cache.clear()
+
+    def prepare_to_set(self):
+        if len(self.cache) >= self.capacity:
+            self.cache.popitem(last=False)
+
+        models_did_move = False
+        for k, m in self.cache.items():
+            if m.device.type != 'cpu':
+                models_did_move = True
+                self.cache[k] = m.to('cpu')
+
+        if models_did_move:
+            gc.collect()
+            # if not shared.args.cpu: # will not be running on CPUs anyway
+            with torch.no_grad():
+                torch.cuda.empty_cache()
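A hedged usage sketch of the new `ModelLRUCache` (assuming the package is importable and a CUDA device is present; `TinyModel` is a hypothetical stand-in for a real base model): `get` returns a cached model on the active device after offloading the others to CPU, `set` stores a model and evicts the least recently used entry when full, and `prepare_to_set` frees GPU memory before a fresh load.

import torch
from llama_lora.utils.model_lru_cache import ModelLRUCache


class TinyModel(torch.nn.Module):
    """Hypothetical stand-in; exposes `.device` the way transformers models do."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)

    @property
    def device(self):
        return next(self.parameters()).device


cache = ModelLRUCache(capacity=2)

# Cache miss: make room on the GPU, load, then store.
if cache.get("model-a") is None:
    cache.prepare_to_set()             # offload cached models / evict the LRU entry
    cache.set("model-a", TinyModel())  # in the app this is the freshly loaded base model

cache.set("model-b", TinyModel())
model_b = cache.get("model-b")         # returned on the active device (e.g. cuda)

# Switching back: "model-b" is offloaded to CPU RAM, the freed VRAM is released,
# and "model-a" comes back on the active device without touching the disk.
model_a = cache.get("model-a")
print(model_a.device, model_b.device)  # e.g. cuda:0 cpu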