zetavg committed
Commit 9279c83 • 1 Parent(s): 8788753

llama_lora/globals.py CHANGED

@@ -3,6 +3,9 @@ import subprocess
 
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+from numba import cuda
+import nvidia_smi
+
 from .lib.finetune import train
 
 
@@ -25,6 +28,12 @@ class Global:
     # Model related
     model_has_been_used = False
 
+    # GPU Info
+    gpu_cc = None  # GPU compute capability
+    gpu_sms = None  # GPU total number of SMs
+    gpu_total_cores = None  # GPU total cores
+    gpu_total_memory = None
+
     # UI related
     ui_title: str = "LLaMA-LoRA"
     ui_emoji: str = "🦙🎛️"
@@ -60,3 +69,57 @@ commit_hash = get_git_commit_hash()
 
 if commit_hash:
     Global.version = commit_hash[:8]
+
+
+def load_gpu_info():
+    try:
+        cc_cores_per_SM_dict = {
+            (2, 0): 32,
+            (2, 1): 48,
+            (3, 0): 192,
+            (3, 5): 192,
+            (3, 7): 192,
+            (5, 0): 128,
+            (5, 2): 128,
+            (6, 0): 64,
+            (6, 1): 128,
+            (7, 0): 64,
+            (7, 5): 64,
+            (8, 0): 64,
+            (8, 6): 128,
+            (8, 9): 128,
+            (9, 0): 128
+        }
+        # The dictionary above yields "None" if a cc match is not found.
+        # It needs to be extended as new devices become available, and
+        # currently does not account for all Jetson devices.
+        device = cuda.get_current_device()
+        device_sms = getattr(device, 'MULTIPROCESSOR_COUNT')
+        device_cc = device.compute_capability
+        cores_per_sm = cc_cores_per_SM_dict.get(device_cc)
+        total_cores = cores_per_sm * device_sms
+        print("GPU compute capability: ", device_cc)
+        print("GPU total number of SMs: ", device_sms)
+        print("GPU total cores: ", total_cores)
+        Global.gpu_cc = device_cc
+        Global.gpu_sms = device_sms
+        Global.gpu_total_cores = total_cores
+
+        nvidia_smi.nvmlInit()
+        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
+        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        total_memory = info.total
+
+        total_memory_mb = total_memory / (1024 ** 2)
+        total_memory_gb = total_memory / (1024 ** 3)
+
+        # Print the memory size
+        print(
+            f"GPU total memory: {total_memory} bytes ({total_memory_mb:.2f} MB) ({total_memory_gb:.2f} GB)")
+        Global.gpu_total_memory = total_memory
+
+    except Exception as e:
+        print(f"Notice: cannot get GPU info: {e}")
+
+
+load_gpu_info()
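Note on load_gpu_info() above: cc_cores_per_SM_dict.get(device_cc) returns None for an unlisted compute capability, so the subsequent multiplication raises a TypeError, which the enclosing try/except reports as "Notice: cannot get GPU info". A minimal offline check of the cores-per-SM arithmetic (a sketch; the Tesla T4 figures are assumptions for illustration):

# No GPU required: recompute total cores for an assumed Tesla T4,
# which reports compute capability (7, 5) and 40 SMs.
cc_cores_per_SM = {(7, 5): 64, (8, 6): 128}  # subset of the table above

t4_total_cores = cc_cores_per_SM[(7, 5)] * 40
assert t4_total_cores == 2560
# 2560 is exactly the core-count threshold that finetune_ui.py compares
# against with ">", so a T4 would keep track_tqdm enabled.
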
llama_lora/models.py CHANGED

@@ -102,6 +102,14 @@ def load_base_model():
     )
 
 
+def clear_cache():
+    gc.collect()
+
+    # if not shared.args.cpu:  # will not be running on CPUs anyway
+    with torch.no_grad():
+        torch.cuda.empty_cache()
+
+
 def unload_models():
     del Global.loaded_base_model
     Global.loaded_base_model = None
@@ -109,11 +117,7 @@ def unload_models():
     del Global.loaded_tokenizer
     Global.loaded_tokenizer = None
 
-    gc.collect()
-
-    # if not shared.args.cpu:  # will not be running on CPUs anyway
-    with torch.no_grad():
-        torch.cuda.empty_cache()
+    clear_cache()
 
     Global.model_has_been_used = False
 
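Factoring clear_cache() out of unload_models() lets the fine-tune UI reuse it. The ordering matters: gc.collect() first releases unreachable Python references so their CUDA blocks return to PyTorch's caching allocator, then torch.cuda.empty_cache() hands unused cached blocks back to the driver. A hypothetical usage sketch (the memory check is illustrative and not part of this commit):

import torch

from llama_lora.models import unload_models

before = torch.cuda.memory_reserved()
unload_models()  # drops model/tokenizer references, then calls clear_cache()
after = torch.cuda.memory_reserved()
print(f"reserved CUDA memory: {before} -> {after} bytes")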
 
llama_lora/ui/finetune_ui.py CHANGED

@@ -9,7 +9,9 @@ from random_word import RandomWords
 from transformers import TrainerCallback
 
 from ..globals import Global
-from ..models import get_base_model, get_tokenizer, unload_models_if_already_used
+from ..models import (
+    get_base_model, get_tokenizer,
+    clear_cache, unload_models_if_already_used)
 from ..utils.data import (
     get_available_template_names,
     get_available_dataset_names,
@@ -238,6 +240,12 @@ def parse_plain_text_input(
     return result
 
 
+should_training_progress_track_tqdm = True
+
+if Global.gpu_total_cores is not None and Global.gpu_total_cores > 2560:
+    should_training_progress_track_tqdm = False
+
+
 def do_train(
     # Dataset
     template,
@@ -258,9 +266,10 @@ def do_train(
     lora_alpha,
     lora_dropout,
     model_name,
-    progress=gr.Progress(track_tqdm=True),
+    progress=gr.Progress(track_tqdm=should_training_progress_track_tqdm),
 ):
     try:
+        clear_cache()
         # If model has been used in inference, we need to unload it first.
         # Otherwise, we'll get a 'Function MmBackward0 returned an invalid
         # gradient at index 1 - expected device meta but got cuda:0' error.
@@ -337,7 +346,8 @@ def do_train(
 
                 progress(
                     (i, 300),
-                    desc="(Simulate) " + get_progress_text(epoch, epochs, last_loss)
+                    desc="(Simulate) " +
+                    get_progress_text(epoch, epochs, last_loss)
                 )
 
                 time.sleep(0.1)
@@ -401,12 +411,13 @@ Train data (first 10):
 
         # Do this again right before training to make sure the model is not used in inference.
         unload_models_if_already_used()
+        clear_cache()
 
         base_model = get_base_model()
         tokenizer = get_tokenizer()
 
         # Do not let other tqdm iterations interfere with the progress reporting after training starts.
-        progress.track_tqdm = False
+        # progress.track_tqdm = False  # setting this dynamically does not work; whether track_tqdm should be enabled is now determined from the GPU core count at startup instead.
 
         results = Global.train_fn(
             base_model,  # base_model
@@ -431,7 +442,8 @@ Train data (first 10):
             training_callbacks  # callbacks
         )
 
-        logs_str = "\n".join([json.dumps(log) for log in log_history]) or "None"
+        logs_str = "\n".join([json.dumps(log)
+                              for log in log_history]) or "None"
 
         result_message = f"Training ended:\n{str(results)}\n\nLogs:\n{logs_str}"
         print(result_message)
@@ -590,9 +602,18 @@ def finetune_ui():
             )
 
         with gr.Row():
+            micro_batch_size_default_value = 1
+
+            if Global.gpu_total_cores is not None and Global.gpu_total_memory is not None:
+                memory_per_core = Global.gpu_total_memory / Global.gpu_total_cores
+                if memory_per_core >= 6291456:
+                    micro_batch_size_default_value = 8
+                elif memory_per_core >= 4000000:  # ?
+                    micro_batch_size_default_value = 4
+
             with gr.Column():
                 micro_batch_size = gr.Slider(
-                    minimum=1, maximum=100, step=1, value=8,
+                    minimum=1, maximum=100, step=1, value=micro_batch_size_default_value,
                     label="Micro Batch Size",
                     info="The number of examples in each mini-batch for gradient computation. A smaller micro_batch_size reduces memory usage but may increase training time."
                 )
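
Two heuristics in this file now key off the GPU info gathered in globals.py: track_tqdm is fixed at import time (the commented-out progress.track_tqdm = False shows that toggling it mid-run did not work), and the default micro batch size is derived from memory per core. The 6291456-byte threshold is exactly 6 MiB per core; a worked check under assumed figures (a GPU reporting 15 GiB of total memory and 2560 cores):

# Assumed for illustration: 15 GiB total memory, 2560 total cores.
total_memory = 15 * 1024 ** 3  # bytes, as reported via nvidia_smi
total_cores = 2560

memory_per_core = total_memory / total_cores
assert memory_per_core == 6291456  # exactly 6 MiB per core

if memory_per_core >= 6291456:
    micro_batch_size_default_value = 8
elif memory_per_core >= 4000000:
    micro_batch_size_default_value = 4
else:
    micro_batch_size_default_value = 1
print(micro_batch_size_default_value)  # -> 8
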
llama_lora/ui/inference_ui.py CHANGED

@@ -245,7 +245,7 @@ def inference_ui():
             preview_prompt = gr.Textbox(
                 show_label=False, interactive=False, elem_id="inference_preview_prompt")
             update_prompt_preview_btn = gr.Button(
-                "↻", elem_id="inference_update_prompt_preview_btn", full_width=False)
+                "↻", elem_id="inference_update_prompt_preview_btn")
             update_prompt_preview_btn.style(size="sm")
 
             # with gr.Column():
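The only change here drops the full_width=False keyword from gr.Button(...), presumably because the installed Gradio version no longer accepts it; the button is still rendered small via update_prompt_preview_btn.style(size="sm") on the following line.
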
requirements.txt CHANGED

@@ -7,6 +7,8 @@ datasets
 fire
 git+https://github.com/huggingface/peft.git
 git+https://github.com/huggingface/transformers.git
+numba
+nvidia-ml-py3
 gradio
 loralib
 sentencepiece
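
The two new entries back the GPU probing added in llama_lora/globals.py: numba provides the numba.cuda device query and nvidia-ml-py3 provides the nvidia_smi NVML bindings. A quick smoke test, using only the calls the commit itself relies on:

from numba import cuda  # provided by the "numba" package
import nvidia_smi       # provided by the "nvidia-ml-py3" package

nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
print(nvidia_smi.nvmlDeviceGetMemoryInfo(handle).total)  # total memory, bytes
print(cuda.get_current_device().compute_capability)      # e.g. (7, 5)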