zetavg committed
Commit 8b0ae10
1 Parent(s): d019027

make gradio reload faster by using dynamic imports

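The speedup comes from moving heavy imports (torch, peft, and the finetune/inference modules) out of module top level, so re-executing app.py under Gradio's reload mode does not pay those import costs until a model is actually used. A minimal standalone sketch of the pattern applied throughout this commit (illustrative, not code from this repository):

import importlib


def get_torch():
    # Deferred import: torch is imported on first call, not when this
    # module is (re)loaded, so auto-reload restarts stay fast.
    return importlib.import_module('torch')


def pick_device():
    torch = get_torch()
    return 'cuda' if torch.cuda.is_available() else 'cpu'
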
app.py CHANGED
@@ -7,8 +7,8 @@ import yaml
 
 from llama_lora.config import Config, process_config
 from llama_lora.globals import initialize_global
-from llama_lora.models import prepare_base_model
 from llama_lora.utils.data import init_data_dir
+from llama_lora.models import prepare_base_model
 from llama_lora.ui.main_page import (
     main_page, get_page_title
 )
llama_lora/dynamic_import.py ADDED
@@ -0,0 +1,5 @@
+import importlib
+
+
+def dynamic_import(module):
+    return importlib.import_module(module, package='llama_lora')
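
Because the package argument is pinned to 'llama_lora', callers can pass package-relative module paths. An illustrative usage (assuming the package is importable):

from llama_lora.dynamic_import import dynamic_import

# '.lib.finetune' is resolved relative to the 'llama_lora' package, so this is
# equivalent to importlib.import_module('llama_lora.lib.finetune').train
train = dynamic_import('.lib.finetune').train
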
llama_lora/globals.py CHANGED
@@ -1,3 +1,4 @@
+import importlib
 import os
 import subprocess
 import psutil
@@ -8,10 +9,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 from numba import cuda
 import nvidia_smi
 
+from .dynamic_import import dynamic_import
 from .config import Config
 from .utils.lru_cache import LRUCache
-from .utils.model_lru_cache import ModelLRUCache
-from .lib.finetune import train
 
 
 class Global:
@@ -22,20 +22,21 @@ class Global:
     version: Union[str, None] = None
 
     base_model_name: str = ""
-    tokenizer_name = None
+    tokenizer_name: Union[str, None] = None
 
     # Functions
-    train_fn: Any = train
+    inference_generate_fn: Any
+    finetune_train_fn: Any
 
     # Training Control
-    should_stop_training = False
+    should_stop_training: bool = False
 
     # Generation Control
-    should_stop_generating = False
-    generation_force_stopped_at = None
+    should_stop_generating: bool = False
+    generation_force_stopped_at: Union[float, None] = None
 
     # Model related
-    loaded_models = ModelLRUCache(1)
+    loaded_models = LRUCache(1)
     loaded_tokenizers = LRUCache(1)
     new_base_model_that_is_ready_to_be_used = None
     name_of_new_base_model_that_is_ready_to_be_used = None
@@ -54,7 +55,12 @@ def initialize_global():
     if commit_hash:
        Global.version = commit_hash[:8]
 
-    load_gpu_info()
+    if not Config.ui_dev_mode:
+        ModelLRUCache = dynamic_import('.utils.model_lru_cache').ModelLRUCache
+        Global.loaded_models = ModelLRUCache(1)
+        Global.inference_generate_fn = dynamic_import('.lib.inference').generate
+        Global.finetune_train_fn = dynamic_import('.lib.finetune').train
+        load_gpu_info()
 
 
 def get_package_dir():
@@ -81,6 +87,8 @@ def get_git_commit_hash():
 
 
 def load_gpu_info():
+    # cuda = importlib.import_module('numba').cuda
+    # nvidia_smi = importlib.import_module('nvidia_smi')
    print("")
    try:
        cc_cores_per_SM_dict = {
@@ -133,9 +141,11 @@ def load_gpu_info():
        available_cpu_ram_gb = available_cpu_ram / (1024 ** 3)
        print(
            f"CPU available memory: {available_cpu_ram} bytes ({available_cpu_ram_mb:.2f} MB) ({available_cpu_ram_gb:.2f} GB)")
-        preserve_loaded_models_count = math.floor((available_cpu_ram * 0.8) / total_memory) - 1
+        preserve_loaded_models_count = math.floor(
+            (available_cpu_ram * 0.8) / total_memory) - 1
        if preserve_loaded_models_count > 1:
-            print(f"Will keep {preserve_loaded_models_count} offloaded models in CPU RAM.")
+            print(
+                f"Will keep {preserve_loaded_models_count} offloaded models in CPU RAM.")
            Global.loaded_models = ModelLRUCache(preserve_loaded_models_count)
            Global.loaded_tokenizers = LRUCache(preserve_loaded_models_count)
 
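With this change, initialize_global only imports ModelLRUCache, the inference/finetune code, and the GPU probe when Config.ui_dev_mode is off, and UI code reaches those functions through late-bound attributes on Global. A reduced sketch of the indirection (attribute names from the diff, bodies illustrative):

class Global:
    # Bound in initialize_global() only when not running in UI dev mode.
    inference_generate_fn = None
    finetune_train_fn = None


def do_inference(**generation_args):
    # Call through Global instead of importing ..lib.inference directly,
    # so importing this module stays cheap under Gradio reload.
    yield from Global.inference_generate_fn(**generation_args)
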
llama_lora/lib/get_device.py CHANGED
@@ -1,7 +1,8 @@
-import torch
+import importlib
 
 
 def get_device():
+    torch = importlib.import_module('torch')
     device ="cpu"
     if torch.cuda.is_available():
         device = "cuda"
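
Calling importlib.import_module inside get_device does not re-import torch on every call: after the first import, Python returns the cached module from sys.modules. A quick illustration with a stand-in module:

import importlib
import sys

m1 = importlib.import_module('json')  # stand-in for a heavy module like torch
m2 = importlib.import_module('json')
assert m1 is m2
assert m1 is sys.modules['json']
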
llama_lora/lib/inference.py CHANGED
@@ -4,6 +4,7 @@ import transformers
 from .get_device import get_device
 from .streaming_generation_utils import Iteratorize, Stream
 
+
 def generate(
     # model
     model,
llama_lora/models.py CHANGED
@@ -1,21 +1,28 @@
+import importlib
 import os
 import sys
 import gc
 import json
 import re
 
-import torch
 from transformers import (
     AutoModelForCausalLM, AutoModel,
     AutoTokenizer, LlamaTokenizer
 )
-from peft import PeftModel
 
 from .config import Config
 from .globals import Global
 from .lib.get_device import get_device
 
 
+def get_torch():
+    return importlib.import_module('torch')
+
+
+def get_peft_model_class():
+    return importlib.import_module('peft').PeftModel
+
+
 def get_new_base_model(base_model_name):
     if Config.ui_dev_mode:
         return
@@ -75,6 +82,7 @@ def get_new_base_model(base_model_name):
 
 
 def _get_model_from_pretrained(model_class, model_name, from_tf=False, force_download=False):
+    torch = get_torch()
     device = get_device()
 
     if device == "cuda":
@@ -183,6 +191,8 @@ def get_model(
 
     if peft_model_name:
         device = get_device()
+        torch = get_torch()
+        PeftModel = get_peft_model_class()
 
         if device == "cuda":
             model = PeftModel.from_pretrained(
llama_lora/ui/finetune/finetune_ui.py CHANGED
@@ -550,7 +550,7 @@ Train data (first 10):
             wandb_group += f"/{dataset_from_data_dir}"
             wandb_tags.append(f"dataset:{dataset_from_data_dir}")
 
-        train_output = Global.train_fn(
+        train_output = Global.finetune_train_fn(
             base_model=base_model_name,
             tokenizer=tokenizer_name,
             output_dir=output_dir,
llama_lora/ui/inference_ui.py CHANGED
@@ -3,14 +3,11 @@ import os
 import time
 import json
 
-import torch
-import transformers
 from transformers import GenerationConfig
 
 from ..config import Config
 from ..globals import Global
 from ..models import get_model, get_tokenizer, get_device
-from ..lib.inference import generate
 from ..lib.csv_logger import CSVLogger
 from ..utils.data import (
     get_available_template_names,
@@ -181,7 +178,7 @@ def do_inference(
             'stream_output': stream_output
         }
 
-        for (decoded_output, output, completed) in generate(**generation_args):
+        for (decoded_output, output, completed) in Global.inference_generate_fn(**generation_args):
             raw_output_str = str(output)
             response = prompter.get_response(decoded_output)
 
@@ -217,7 +214,7 @@
 
             return
     except Exception as e:
-        raise gr.Error(e)
+        raise gr.Error(str(e))
 
 
 def handle_stop_generate():