Spaces:

joaogante
/

medusa-maker

Paused

App Files Files Community

joaogante HF staff committed on Jan 4

Commit

0b94c41

•

1 Parent(s): 6ba93dd

two files

Browse files

Files changed (2) hide show

app.py +1 -152
medusa_training.py +152 -0

app.py CHANGED Viewed

@@ -1,15 +1,7 @@
-import json
-import os
-import multiprocessing as mp
 from git import Repo
 import gradio as gr
-from huggingface_hub import HfApi
-from huggingface_hub.utils import RepositoryNotFoundError
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
-import torch
-import torch.distributed.run as distributed_run
 # Clone the medusa repo locally
 print("Cloning the medusa repo locally...")
@@ -18,149 +10,6 @@ print("Cloning the vicuna data locally...")
 Repo.clone_from("https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered", "data")
 print("Done")
-OUTPUT_DIR = "medusa_heads"
-MEDUSA_NUM_HEADS = 3
-MEDUSA_NUM_LAYERS = 1
-LR = 1e-3
-DATASET = "vicuna"
-# These can't be changed (e.g. they control the output path)
-FIXED_TRAINING_ARGS = \
-"""medusa/medusa/train/train.py
---model_name_or_path {model_id}
---output_dir {output_dir}
---run_name {model_id}-medusa-{dataset}
---medusa_num_heads {medusa_num_heads}
---medusa_num_layers {medusa_num_layers}
---learning_rate {lr}
---data_path data/ShareGPT_V4.3_unfiltered_cleaned_split.json"""
-# These can be freely changed
-DEFAULT_TRAINING_ARGS = \
-"""--bf16 True
---num_train_epochs 1
---per_device_train_batch_size 64
---per_device_eval_batch_size 64
---gradient_accumulation_steps 4
---evaluation_strategy no
---save_strategy no
---weight_decay 0.0
---warmup_ratio 0.1
---lr_scheduler_type cosine
---logging_steps 10
---tf32 True
---model_max_length 2048
---lazy_preprocess True
---auto_find_batch_size True"""
-def train_medusa_heads(model_id: str, training_args: str):
-    all_training_args = FIXED_TRAINING_ARGS.format(
-        model_id=model_id,
-        output_dir=OUTPUT_DIR,
-        dataset=DATASET,
-        medusa_num_heads=MEDUSA_NUM_HEADS,
-        lr=LR,
-        medusa_num_layers=MEDUSA_NUM_LAYERS
-    ) + "\n" + training_args
-    all_training_arg_list = []
-    for arg in all_training_args.split("\n"):
-        all_training_arg_list += arg.split(" ")
-    print("Full argument list:", all_training_arg_list)
-    parser = distributed_run.get_args_parser()
-    args = parser.parse_args(all_training_arg_list)
-    distributed_run.run(args)
-def run(model_id: str, training_args: str) -> str:
-    print(f"\n\n\nNEW RUN: {model_id}")
-    api = HfApi()
-    model_name = model_id.split("/")[-1]
-    repo_id = f"joaogante/{model_name}-medusa-{DATASET}"
-    # Input validation
-    if model_id == "":
-        return """
-        ### Invalid input 🐞
-        Please fill a model_id.
-        """
-    if api.repo_exists(repo_id):
-        return f"""
-        ### Invalid input 🐞
-        {repo_id} already exists, which means that {model_id} has already been used to create medusa heads.
-        """
-    print(f"Valid inputs ✅\nValidating model_id: {model_id}")
-    # Attempt to load the base model
-    try:
-        config = AutoConfig.from_pretrained(model_id)
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-        del config, tokenizer, model
-    except Exception as e:
-        return f"""
-        ### {model_id} can't be loaded with AutoClasses 🐞
-        {e}
-        """
-    print(f"{model_id} can be loaded ✅\nCreating medusa heads (will take a few hours)")
-    # Run the medusa heads creation
-    try:
-        proc = mp.Process(target=train_medusa_heads, args=(model_id, training_args))
-        proc.start()
-        proc.join()
-        print("Medusa heads training process completed (it might have crashed!)")
-    except Exception as e:
-        print("Error ❌\n", e)
-        return f"""
-        ### Error 😢😢😢
-        {e}
-        """
-    # Upload the medusa heads to the Hub
-    try:
-        # Folder path from https://github.com/FasterDecoding/Medusa/blob/main/medusa/train/train.py#L399
-        folder_path = (
-            f"{OUTPUT_DIR}_medusa_mlp_{model_name}_medusa_{MEDUSA_NUM_HEADS}_lr_{LR}_layers_{MEDUSA_NUM_LAYERS}"
-        )
-        if not any([x for x in os.listdir(folder_path) if len(x) >= 3 and x[-3:] == ".pt"]):
-            raise Exception(
-                "No model data in the expected model folder, the traning run probably failed. Check the logs for more "
-                "information."
-            )
-        api.create_repo(
-            repo_id=repo_id,
-            exist_ok=True,
-        )
-        api.upload_folder(
-            folder_path=folder_path,
-            repo_id=repo_id,
-        )
-        print("Medusa heads upload success ✅\n Uploaded to: ", repo_id)
-        return f"""
-        ### Success 🔥
-        Yay! Medusa heads were successfully created and uploaded to the following repo: {repo_id}
-        """
-    except Exception as e:
-        print("Error ❌\n", e)
-        try:
-            api.delete_repo(repo_id)
-        except RepositoryNotFoundError:
-            pass
-        return f"""
-        ### Error 😢😢😢
-        {e}
-        """
 DESCRIPTION = """
 The steps to create [medusa](https://sites.google.com/view/medusa-llm) heads are the following:

 from git import Repo
 import gradio as gr
+from medusa_training import run, DEFAULT_TRAINING_ARGS
 # Clone the medusa repo locally
 print("Cloning the medusa repo locally...")
 Repo.clone_from("https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered", "data")
 print("Done")
 DESCRIPTION = """
 The steps to create [medusa](https://sites.google.com/view/medusa-llm) heads are the following:

medusa_training.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import json
+import os
+import multiprocessing as mp
+from huggingface_hub import HfApi
+from huggingface_hub.utils import RepositoryNotFoundError
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+import torch
+import torch.distributed.run as distributed_run
+# Value rendered into --output_dir; it is also the prefix of the folder that `run` inspects for trained heads.
+OUTPUT_DIR = "medusa_heads"
+# Rendered into --medusa_num_heads and into the expected output folder name.
+MEDUSA_NUM_HEADS = 3
+# Rendered into --medusa_num_layers and into the expected output folder name.
+MEDUSA_NUM_LAYERS = 1
+# Rendered into --learning_rate and into the expected output folder name.
+LR = 1e-3
+# Dataset tag used in the --run_name value and in the name of the Hub repo that `run` uploads to.
+DATASET = "vicuna"
+# These can't be changed (e.g. they control the output path)
+# Template filled in by `train_medusa_heads` via str.format; the first line is the training script path and the
+# remaining lines are "--flag value" pairs, one per line.
+FIXED_TRAINING_ARGS = \
+"""medusa/medusa/train/train.py
+--model_name_or_path {model_id}
+--output_dir {output_dir}
+--run_name {model_id}-medusa-{dataset}
+--medusa_num_heads {medusa_num_heads}
+--medusa_num_layers {medusa_num_layers}
+--learning_rate {lr}
+--data_path data/ShareGPT_V4.3_unfiltered_cleaned_split.json"""
+# These can be freely changed
+# Default extra flags, one "--flag value" pair per line; imported by app.py alongside `run`
+# (presumably as the initial value of the user-editable training arguments -- confirm in app.py).
+DEFAULT_TRAINING_ARGS = \
+"""--bf16 True
+--num_train_epochs 1
+--per_device_train_batch_size 64
+--per_device_eval_batch_size 64
+--gradient_accumulation_steps 4
+--evaluation_strategy no
+--save_strategy no
+--weight_decay 0.0
+--warmup_ratio 0.1
+--lr_scheduler_type cosine
+--logging_steps 10
+--tf32 True
+--model_max_length 2048
+--lazy_preprocess True
+--auto_find_batch_size True"""
+def train_medusa_heads(model_id: str, training_args: str):
+    all_training_args = FIXED_TRAINING_ARGS.format(
+        model_id=model_id,
+        output_dir=OUTPUT_DIR,
+        dataset=DATASET,
+        medusa_num_heads=MEDUSA_NUM_HEADS,
+        lr=LR,
+        medusa_num_layers=MEDUSA_NUM_LAYERS
+    ) + "\n" + training_args
+    all_training_arg_list = []
+    for arg in all_training_args.split("\n"):
+        all_training_arg_list += arg.split(" ")
+    print("Full argument list:", all_training_arg_list)
+    parser = distributed_run.get_args_parser()
+    args = parser.parse_args(all_training_arg_list)
+    distributed_run.run(args)
+def run(model_id: str, training_args: str) -> str:
+    print(f"\n\n\nNEW RUN: {model_id}")
+    api = HfApi()
+    model_name = model_id.split("/")[-1]
+    repo_id = f"joaogante/{model_name}-medusa-{DATASET}"
+    # Input validation
+    if model_id == "":
+        return """
+        ### Invalid input 🐞
+        Please fill a model_id.
+        """
+    if api.repo_exists(repo_id):
+        return f"""
+        ### Invalid input 🐞
+        {repo_id} already exists, which means that {model_id} has already been used to create medusa heads.
+        """
+    print(f"Valid inputs ✅\nValidating model_id: {model_id}")
+    # Attempt to load the base model
+    try:
+        config = AutoConfig.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        del config, tokenizer, model
+    except Exception as e:
+        return f"""
+        ### {model_id} can't be loaded with AutoClasses 🐞
+        {e}
+        """
+    print(f"{model_id} can be loaded ✅\nCreating medusa heads (will take a few hours)")
+    # Run the medusa heads creation
+    try:
+        proc = mp.Process(target=train_medusa_heads, args=(model_id, training_args))
+        proc.start()
+        proc.join()
+        print("Medusa heads training process completed (it might have crashed!)")
+    except Exception as e:
+        print("Error ❌\n", e)
+        return f"""
+        ### Error 😢😢😢
+        {e}
+        """
+    # Upload the medusa heads to the Hub
+    try:
+        # Folder path from https://github.com/FasterDecoding/Medusa/blob/main/medusa/train/train.py#L399
+        folder_path = (
+            f"{OUTPUT_DIR}_medusa_mlp_{model_name}_medusa_{MEDUSA_NUM_HEADS}_lr_{LR}_layers_{MEDUSA_NUM_LAYERS}"
+        )
+        if not any([x for x in os.listdir(folder_path) if len(x) >= 3 and x[-3:] == ".pt"]):
+            raise Exception(
+                "No model data in the expected model folder, the traning run probably failed. Check the logs for more "
+                "information."
+            )
+        api.create_repo(
+            repo_id=repo_id,
+            exist_ok=True,
+        )
+        api.upload_folder(
+            folder_path=folder_path,
+            repo_id=repo_id,
+        )
+        print("Medusa heads upload success ✅\n Uploaded to: ", repo_id)
+        return f"""
+        ### Success 🔥
+        Yay! Medusa heads were successfully created and uploaded to the following repo: {repo_id}
+        """
+    except Exception as e:
+        print("Error ❌\n", e)
+        try:
+            api.delete_repo(repo_id)
+        except RepositoryNotFoundError:
+            pass
+        return f"""
+        ### Error 😢😢😢
+        {e}
+        """