zetavg committed
Commit 8b0ae10
1 Parent(s): d019027

make gradio reload faster by using dynamic imports

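The speedup comes from moving heavy imports (torch, peft, and the finetune/inference modules) out of module top level, so re-executing app.py under Gradio's reload mode does not pay those import costs until a model is actually used. A minimal standalone sketch of the pattern applied throughout this commit (illustrative, not code from this repository):

import importlib


def get_torch():
    # Deferred import: torch is imported on first call, not when this
    # module is (re)loaded, so auto-reload restarts stay fast.
    return importlib.import_module('torch')


def pick_device():
    torch = get_torch()
    return 'cuda' if torch.cuda.is_available() else 'cpu'
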
app.py CHANGED
@@ -7,8 +7,8 @@ import yaml
 
 from llama_lora.config import Config, process_config
 from llama_lora.globals import initialize_global
-from llama_lora.models import prepare_base_model
 from llama_lora.utils.data import init_data_dir
+from llama_lora.models import prepare_base_model
 from llama_lora.ui.main_page import (
     main_page, get_page_title
 )
llama_lora/dynamic_import.py ADDED
@@ -0,0 +1,5 @@
+import importlib
+
+
+def dynamic_import(module):
+    return importlib.import_module(module, package='llama_lora')
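
Because the package argument is pinned to 'llama_lora', callers can pass package-relative module paths. An illustrative usage (assuming the package is importable):

from llama_lora.dynamic_import import dynamic_import

# '.lib.finetune' is resolved relative to the 'llama_lora' package, so this is
# equivalent to importlib.import_module('llama_lora.lib.finetune').train
train = dynamic_import('.lib.finetune').train
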
llama_lora/globals.py CHANGED
@@ -1,3 +1,4 @@
+import importlib
 import os
 import subprocess
 import psutil
@@ -8,10 +9,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 from numba import cuda
 import nvidia_smi
 
+from .dynamic_import import dynamic_import
 from .config import Config
 from .utils.lru_cache import LRUCache
-from .utils.model_lru_cache import ModelLRUCache
-from .lib.finetune import train
 
 
 class Global:
@@ -22,20 +22,21 @@ class Global:
     version: Union[str, None] = None
 
     base_model_name: str = ""
-    tokenizer_name = None
+    tokenizer_name: Union[str, None] = None
 
     # Functions
-    train_fn: Any = train
+    inference_generate_fn: Any
+    finetune_train_fn: Any
 
     # Training Control
-    should_stop_training = False
+    should_stop_training: bool = False
 
     # Generation Control
-    should_stop_generating = False
-    generation_force_stopped_at = None
+    should_stop_generating: bool = False
+    generation_force_stopped_at: Union[float, None] = None
 
     # Model related
-    loaded_models = ModelLRUCache(1)
+    loaded_models = LRUCache(1)
     loaded_tokenizers = LRUCache(1)
     new_base_model_that_is_ready_to_be_used = None
     name_of_new_base_model_that_is_ready_to_be_used = None
@@ -54,7 +55,12 @@ def initialize_global():
     if commit_hash:
        Global.version = commit_hash[:8]
 
-    load_gpu_info()
+    if not Config.ui_dev_mode:
+        ModelLRUCache = dynamic_import('.utils.model_lru_cache').ModelLRUCache
+        Global.loaded_models = ModelLRUCache(1)
+        Global.inference_generate_fn = dynamic_import('.lib.inference').generate
+        Global.finetune_train_fn = dynamic_import('.lib.finetune').train
+        load_gpu_info()
 
 
 def get_package_dir():
@@ -81,6 +87,8 @@ def get_git_commit_hash():
 
 
 def load_gpu_info():
+    # cuda = importlib.import_module('numba').cuda
+    # nvidia_smi = importlib.import_module('nvidia_smi')
    print("")
    try:
        cc_cores_per_SM_dict = {
@@ -133,9 +141,11 @@ def load_gpu_info():
        available_cpu_ram_gb = available_cpu_ram / (1024 ** 3)
        print(
            f"CPU available memory: {available_cpu_ram} bytes ({available_cpu_ram_mb:.2f} MB) ({available_cpu_ram_gb:.2f} GB)")
-        preserve_loaded_models_count = math.floor((available_cpu_ram * 0.8) / total_memory) - 1
+        preserve_loaded_models_count = math.floor(
+            (available_cpu_ram * 0.8) / total_memory) - 1
        if preserve_loaded_models_count > 1:
-            print(f"Will keep {preserve_loaded_models_count} offloaded models in CPU RAM.")
+            print(
+                f"Will keep {preserve_loaded_models_count} offloaded models in CPU RAM.")
            Global.loaded_models = ModelLRUCache(preserve_loaded_models_count)
            Global.loaded_tokenizers = LRUCache(preserve_loaded_models_count)
 
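With this change, initialize_global only imports ModelLRUCache, the inference/finetune code, and the GPU probe when Config.ui_dev_mode is off, and UI code reaches those functions through late-bound attributes on Global. A reduced sketch of the indirection (attribute names from the diff, bodies illustrative):

class Global:
    # Bound in initialize_global() only when not running in UI dev mode.
    inference_generate_fn = None
    finetune_train_fn = None


def do_inference(**generation_args):
    # Call through Global instead of importing ..lib.inference directly,
    # so importing this module stays cheap under Gradio reload.
    yield from Global.inference_generate_fn(**generation_args)
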
llama_lora/lib/get_device.py CHANGED
@@ -1,7 +1,8 @@
-import torch
+import importlib
 
 
 def get_device():
+    torch = importlib.import_module('torch')
     device ="cpu"
     if torch.cuda.is_available():
         device = "cuda"
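
Calling importlib.import_module inside get_device does not re-import torch on every call: after the first import, Python returns the cached module from sys.modules. A quick illustration with a stand-in module:

import importlib
import sys

m1 = importlib.import_module('json')  # stand-in for a heavy module like torch
m2 = importlib.import_module('json')
assert m1 is m2
assert m1 is sys.modules['json']
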
llama_lora/lib/inference.py CHANGED
@@ -4,6 +4,7 @@ import transformers
 from .get_device import get_device
 from .streaming_generation_utils import Iteratorize, Stream
 
+
 def generate(
     # model
     model,
llama_lora/models.py CHANGED
@@ -1,21 +1,28 @@
+import importlib
 import os
 import sys
 import gc
 import json
 import re
 
-import torch
 from transformers import (
     AutoModelForCausalLM, AutoModel,
     AutoTokenizer, LlamaTokenizer
 )
-from peft import PeftModel
 
 from .config import Config
 from .globals import Global
 from .lib.get_device import get_device
 
 
+def get_torch():
+    return importlib.import_module('torch')
+
+
+def get_peft_model_class():
+    return importlib.import_module('peft').PeftModel
+
+
 def get_new_base_model(base_model_name):
     if Config.ui_dev_mode:
         return
@@ -75,6 +82,7 @@ def get_new_base_model(base_model_name):
 
 
 def _get_model_from_pretrained(model_class, model_name, from_tf=False, force_download=False):
+    torch = get_torch()
     device = get_device()
 
     if device == "cuda":
@@ -183,6 +191,8 @@ def get_model(
 
     if peft_model_name:
         device = get_device()
+        torch = get_torch()
+        PeftModel = get_peft_model_class()
 
         if device == "cuda":
             model = PeftModel.from_pretrained(
llama_lora/ui/finetune/finetune_ui.py CHANGED
@@ -550,7 +550,7 @@ Train data (first 10):
             wandb_group += f"/{dataset_from_data_dir}"
             wandb_tags.append(f"dataset:{dataset_from_data_dir}")
 
-        train_output = Global.train_fn(
+        train_output = Global.finetune_train_fn(
             base_model=base_model_name,
             tokenizer=tokenizer_name,
             output_dir=output_dir,
llama_lora/ui/inference_ui.py CHANGED
@@ -3,14 +3,11 @@ import os
 import time
 import json
 
-import torch
-import transformers
 from transformers import GenerationConfig
 
 from ..config import Config
 from ..globals import Global
 from ..models import get_model, get_tokenizer, get_device
-from ..lib.inference import generate
 from ..lib.csv_logger import CSVLogger
 from ..utils.data import (
     get_available_template_names,
@@ -181,7 +178,7 @@ def do_inference(
             'stream_output': stream_output
         }
 
-        for (decoded_output, output, completed) in generate(**generation_args):
+        for (decoded_output, output, completed) in Global.inference_generate_fn(**generation_args):
             raw_output_str = str(output)
             response = prompter.get_response(decoded_output)
 
@@ -217,7 +214,7 @@
 
             return
     except Exception as e:
-        raise gr.Error(e)
+        raise gr.Error(str(e))
 
 
 def handle_stop_generate():