joaogante committed
Commit bd89ed8 • 1 Parent(s): 0b94c41

datasets refactor

.gitignore ADDED
@@ -0,0 +1,169 @@
+ # Initially taken from Github's Python gitignore file
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # tests and logs
+ tests/fixtures/cached_*_text.txt
+ logs/
+ lightning_logs/
+ lang_code_data/
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # vscode
+ .vs
+ .vscode
+
+ # Pycharm
+ .idea
+
+ # TF code
+ tensorflow_code
+
+ # Models
+ proc_data
+
+ # examples
+ runs
+ /runs_old
+ /wandb
+ /examples/runs
+ /examples/**/*.args
+ /examples/rag/sweep
+
+ # data
+ /data
+ serialization_dir
+
+ # emacs
+ *.*~
+ debug.env
+
+ # vim
+ .*.swp
+
+ #ctags
+ tags
+
+ # pre-commit
+ .pre-commit*
+
+ # .lock
+ *.lock
+
+ # DS_Store (MacOS)
+ .DS_Store
+
+ # ruff
+ .ruff_cache
app.py CHANGED
@@ -1,22 +1,20 @@
- from git import Repo
- import gradio as gr
-
- from medusa_training import run, DEFAULT_TRAINING_ARGS
-
- # Clone the medusa repo locally
- print("Cloning the medusa repo locally...")
- Repo.clone_from("https://github.com/FasterDecoding/Medusa.git", "medusa")
- print("Cloning the vicuna data locally...")
- Repo.clone_from("https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered", "data")
- print("Done")
+ """
+ Holds the gradio app itself
+ """
+
+ import gradio as gr
+
+ from src.train_workflow import run, DEFAULT_TRAINING_ARGS
+ from src.calibration_datasets import CalibrationDataset


  DESCRIPTION = """
  The steps to create [medusa](https://sites.google.com/view/medusa-llm) heads are the following:

  1. Input a public model id from the Hub
- 2. Click "Submit"
- 3. That's it! You'll get feedback if it works or not, and if it worked, you'll get the name of the new repo 🔥
+ 2. Select a dataset to train the medusa heads on. The dataset should be representative of the downstream use case.
+ 3. Click "Submit"
+ 4. That's it! You'll get feedback if it works or not, and if it worked, you'll get the name of the new repo 🔥
  """

  title="Create LLM medusa heads in a new repo 🐍"

@@ -28,8 +26,12 @@ with gr.Blocks(title=title) as demo:
      with gr.Row() as r:
          with gr.Column() as c:
              model_id = gr.Text(max_lines=1, label="model_id")
+             dataset_names = [
+                 cls.dataset for cls in CalibrationDataset.__subclasses__()
+             ]
+             dataset = gr.Dropdown(dataset_names, label="dataset")
              with gr.Accordion("Training arguments (advanced)", open=False):
-                 training_args = gr.Textbox(DEFAULT_TRAINING_ARGS, interactive=True, lines=14, label="training_args")
+                 training_args = gr.Textbox(DEFAULT_TRAINING_ARGS, interactive=True, lines=20, label="training_args")
          with gr.Row() as c:
              clean = gr.ClearButton()
              submit = gr.Button("Submit", variant="primary")

@@ -37,6 +39,6 @@ with gr.Blocks(title=title) as demo:
          with gr.Column() as d:
              status_box = gr.Markdown()

-     submit.click(run, inputs=[model_id, training_args], outputs=status_box, concurrency_limit=1)
+     submit.click(run, inputs=[model_id, training_args, dataset], outputs=status_box, concurrency_limit=1)

  demo.queue(max_size=10).launch(show_api=True)
medusa_heads_medusa_TinyLlama-1.1B-Chat-v1.0/config.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+     "medusa_num_heads": 3,
+     "medusa_num_layers": 1,
+     "transformers_version": "4.37.0.dev0"
+ }
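
This config.json is what `medusa_config.save_pretrained()` writes in `src/medusa_training_script.py` below. It is plain JSON, so a quick sanity check can read it back directly; a minimal sketch (the field names are taken from the file above):

import json

with open("medusa_heads_medusa_TinyLlama-1.1B-Chat-v1.0/config.json") as f:
    cfg = json.load(f)
print(cfg["base_model_name_or_path"])                     # TinyLlama/TinyLlama-1.1B-Chat-v1.0
print(cfg["medusa_num_heads"], cfg["medusa_num_layers"])  # 3 1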
requirements.txt CHANGED
@@ -1,2 +1 @@
  medusa-llm[train]
- gitpython
src/calibration_datasets.py ADDED
@@ -0,0 +1,603 @@
+ """Prepares the datasets for calibration. Original code kindly shared by TheBloke"""
+
+ from abc import ABC
+ import time
+ from typing import Dict, List, Optional
+ from datasets import load_dataset, Dataset
+ from transformers import PreTrainedTokenizerBase
+
+
+ class CalibrationDataset(ABC):
+     tokenizer: Optional[PreTrainedTokenizerBase] = None
+     num_samples: int = 128
+     seqlen: int = 4096
+     dataset_config: dict
+     dataset: str
+     dataset_name: str
+     dataset_limit: int = int(1e7)
+
+     # Defines the field to extract from the HF dataset
+     # If specified, just this field will be returned, and no transformation will be done.
+     dataset_field: Optional[str] = None
+
+     # Default parameters for a dataset which requires a transformation
+     # Only used if dataset_field is None.
+     # The fields to extract from the original dataset
+     transform_fields: List[str] = []
+
+     # A format string describing how the fields should be joined
+     # Can use {field1}, {field2}, etc. as placeholders for the field names
+     # Or can use actual names, e.g. "{input} {output}"
+     transform_join: str = "{field1} {field2}"
+
+     # Optional override for the dataset URL
+     # By default this is automatically derived from the dataset name and config
+     dataset_url: Optional[str] = None
+
+     data: Optional[Dataset] = None
+     samples: List[str] = []
+     tokenized_samples: List[Dict[str, List[int]]] = []
+
+     randomize: bool = False
+     randomize_seed: int = 42
+
+     def __init__(
+         self,
+         num_samples: int = 128,
+         seqlen: int = 4096,
+         tokenizer: Optional[PreTrainedTokenizerBase] = None
+     ):
+         self.num_samples = num_samples
+         self.seqlen = seqlen
+         self.tokenizer = tokenizer
+
+     @classmethod
+     def get_dataset(cls, dataset_name, **kwargs):
+         for subclass in cls.__subclasses__():
+             if hasattr(subclass, "dataset") and subclass.dataset == dataset_name:
+                 return subclass(**kwargs)
+
+         raise ValueError(f"No dataset class found for name: {dataset_name}")
+
+     def tokenize_dataset(self, samples: Optional[List[str]] = None) -> List[Dict[str, List[int]]]:
+         """
+         Tokenize the dataset and return a list of samples of `seqlen` tokens each.
+
+         First tokenize the List[str] of samples, as a batch.
+
+         Then flatten the batch, and split it into `num_samples` rows of `seqlen` length.
+         """
+         if not self.tokenizer:
+             raise ValueError("No tokenizer provided to tokenize_dataset()")
+         else:
+             if not samples:
+                 if not self.samples:
+                     self.get_samples()
+                 samples = self.samples
+
+             print(f"Tokenizing {self.dataset_name} of length {len(samples)}")
+
+             start_time = time.time()
+             # Tokenize the list of samples. We don't use return_tensors="pt",
+             # as that requires the samples to be the same length, or padding to be used.
+             tokenized = self.tokenizer(samples)
+
+             # Output of tokenizer will be:
+             # {"input_ids": [[1,2,3], [4,5], [6,7]], "attention_mask": [[1,1,1], [1,1], [1,1]]}
+             # Flatten that so as to concatenate the samples into a single input_ids and attention_mask
+             flattened = {
+                 key: [
+                     item for sublist in value
+                     for item in sublist
+                 ]
+                 for key, value in tokenized.items()
+             }
+             print(
+                 f"Tokenized length: {len(flattened['input_ids'])} tokens."
+             )
+
+             # Slice our single input_ids list into num_samples samples of seqlen length
+             tokenized_samples = []
+             for i in range(0, self.num_samples * self.seqlen, self.seqlen):
+                 if i + self.seqlen >= len(flattened["input_ids"]):
+                     break
+                 sample = {
+                     "input_ids": flattened["input_ids"][i:i + self.seqlen],
+                     "attention_mask": flattened["attention_mask"][i:i + self.seqlen]
+                 }
+                 tokenized_samples.append(sample)
+
+             print(
+                 f"Return {len(tokenized_samples)} samples of {self.seqlen} length. "
+                 f"Time taken: {time.time() - start_time:.2f}s."
+             )
+             self.tokenized_samples = tokenized_samples
+             return self.tokenized_samples
+
+     def get_hf_dataset(
+         self,
+         path: str,
+         limit: Optional[int] = None,
+         **kwargs
+     ) -> Dataset:
+         """Load the Hugging Face dataset at `path`, using the provided kwargs."""
+
+         print(f"Loading HF dataset {path} with params: {kwargs}")
+         data: Dataset = load_dataset(path=path, **kwargs)
+
+         limit = min(limit, len(data)) if limit else len(data)
+         return data.select(range(limit))
+
+     @staticmethod
+     def list_with_nls(samples: List[str]) -> List[str]:
+         """
+         Return a List[str] with each sample ending in a newline.
+
+         Also filters the list by stripping, then removing any empty samples.
+         """
+         return [
+             x.rstrip() + '\n'
+             for x in samples
+             if x and len(x.strip()) > 0
+         ]
+
+     def get_samples(self) -> List[str]:
+         """
+         Return a list of samples for the dataset.
+
+         If the subclass implements `dataset_field`, this is used to filter the HF Dataset.
+
+         Otherwise, the subclass must implement `process_samples()`, for custom filtering.
+
+         Samples are returned as a List[str], each ending in a newline.
+         """
+         # Load HF dataset. Subclasses provide HF dataset details in `dataset_config`
+         if not self.data:
+             self.data = self.get_hf_dataset(**self.dataset_config, limit=self.dataset_limit)
+
+         if not self.samples:
+             if hasattr(self, "dataset_field") and self.dataset_field:
+                 samples = self.data[self.dataset_field]
+             else:
+                 try:
+                     samples = self.process_samples()
+                 except NotImplementedError:
+                     raise ValueError(
+                         f"No dataset field specified for class {self.__class__}, "
+                         f"and process_samples() method not defined."
+                     )
+             if self.randomize:
+                 import random
+                 random.seed(self.randomize_seed)
+                 random.shuffle(samples)
+             self.samples = self.list_with_nls(samples)
+         return self.samples
+
+     def process_samples(self) -> List[str]:
+         if not self.transform_fields or not isinstance(self.transform_fields, list):
+             raise ValueError("transform_fields must be a List[str], defined in the subclass")
+
+         if not self.transform_join or not isinstance(self.transform_join, str):
+             raise ValueError("transform_join must be a str, defined in the subclass")
+
+         def transform_sample(sample):
+             field_values = {field: sample[field] for field in self.transform_fields}
+             # We support both:
+             #   generic numbered fields: "{field1} {field2}"
+             #   and named fields: "{input} {output}"
+             # Create a combined dictionary to handle both specific field names and generic placeholders
+             combined_dict = {**field_values, **{f'field{i+1}': field for i, field in enumerate(field_values.values())}}
+             output = self.transform_join.format_map(combined_dict)
+             return {"output": output}
+
+         return self.data.map(transform_sample)["output"]
+
+     def generate_checksum(self) -> str:
+         # Create a sha256 checksum of the joined samples
+         # Can be used to confirm that code updates haven't changed the output
+         import hashlib
+         samples = self.get_samples()
+         combined_samples = ''.join(samples)
+         checksum = hashlib.sha256(combined_samples.encode()).hexdigest()
+         return checksum
+
+     @classmethod
+     def get_dataset_url(cls) -> str:
+         """Return the Hugging Face dataset URL for this dataset."""
+         if hasattr(cls, "dataset_url") and cls.dataset_url:
+             return cls.dataset_url
+         else:
+             return "https://huggingface.co/datasets/{}/viewer/{}".format(
+                 cls.dataset_config["path"],
+                 cls.dataset_config.get("name", "")
+             )
+
+
+ class WikitextDataset(CalibrationDataset):
217
+ dataset = "wikitext"
218
+ dataset_config = {
219
+ "path": "wikitext",
220
+ "name": "wikitext-2-raw-v1",
221
+ "split": "train"
222
+ }
223
+ dataset_name = "Wikitext2 Full"
224
+
225
+ def process_samples(self) -> List[str]:
226
+ return [
227
+ "\n" if len(item) == 0 else item
228
+ for item in self.data["text"]
229
+ ]
230
+
231
+
232
+ class C4Dataset(CalibrationDataset):
233
+ dataset = "c4"
234
+ dataset_field = "text"
235
+ dataset_config = {
236
+ "path": "allenai/c4",
237
+ "data_files": {
238
+ "train": "en/c4-train.00000-of-01024.json.gz"
239
+ },
240
+ "split": "train"
241
+ }
242
+ dataset_name = "C4"
243
+
244
+
245
+ class ThaiDataset(CalibrationDataset):
246
+ dataset = "thai"
247
+ dataset_field = "text"
248
+ dataset_config = {
249
+ "path": "pbwt/all-thai",
250
+ "data_files": {
251
+ "train": "data/train-00000-of-00047-985fbaed08d034cf.parquet"
252
+ },
253
+ "split": "train"
254
+ }
255
+ dataset_name = "All Thai"
256
+
257
+
258
+ class MovieScriptDataset(CalibrationDataset):
259
+ dataset = "movie-scripts"
260
+ dataset_field = "full_script"
261
+ dataset_config = {
262
+ "path": "jondurbin/cinematika-v0.1",
263
+ "data_files": { "train": "full_script.parquet" },
264
+ "split": "train"
265
+ }
266
+ dataset_name = "Cinematika Full Scripts"
267
+
268
+
269
+ class JapaneseEnglishDataset(CalibrationDataset):
270
+ dataset = "japanese-english"
271
+ dataset_config = {
272
+ "path": "augmxnt/shisa-en-ja-dpo-v1",
273
+ "split": "train"
274
+ }
275
+ dataset_name = "Shisa English Japanese DPO"
276
+ randomize = True
277
+
278
+ def process_samples(self) -> List[str]:
279
+ def transform_samples(sample):
280
+ prompt = sample["prompt"]
281
+ chosen = sample["chosen"]
282
+ # prompt example: "[INST] <<SYS>>\nYou are a helpful, unbiased, uncensored assistant.\n<</SYS>>\n\nWhat are cardigans made of? Leather or wood? [/INST]"
283
+
284
+ try:
285
+ part1 = prompt.split('\n<</SYS>>\n\n')[1]
286
+ extracted_text = part1.split(' [/INST]')[0]
287
+ except Exception as e:
288
+ print(f"Error extracting text from prompt '{prompt}': {e}")
289
+ raise
290
+
291
+ prompt = extracted_text
292
+
293
+ return {"output": f"{prompt} {chosen}"}
294
+
295
+ return self.data.map(transform_samples)["output"]
296
+
297
+
298
+ class PortugueseDataset(CalibrationDataset):
299
+ dataset = "portuguese"
300
+ dataset_config = {
301
+ "path": "adalbertojunior/portuguese_orca",
302
+ "split": "train"
303
+ }
304
+ dataset_name = "Portuguese Orca"
305
+ transform_fields = [ "question", "response" ]
306
+
307
+
308
+ class MathsDataset(CalibrationDataset):
309
+ dataset = "maths"
310
+ dataset_config = {
311
+ "path": "andersonbcdefg/math",
312
+ "split": "train"
313
+ }
314
+ dataset_name = "CamelAI Math"
315
+ transform_fields = [ "message_1", "message_2" ]
316
+
317
+
318
+ class MedicalDataset(CalibrationDataset):
319
+ dataset = "medical"
320
+ dataset_config = {
321
+ "path": "medalpaca/medical_meadow_wikidoc",
322
+ "split": "train"
323
+ }
324
+ dataset_name = "Medical Medaow WikiDoc"
325
+ transform_fields = [ "input", "output" ]
326
+
327
+
328
+ class OpenInstructDataset(CalibrationDataset):
329
+ dataset = "open-instruct"
330
+ dataset_config = {
331
+ "path": "VMware/open-instruct",
332
+ "split": "train"
333
+ }
334
+ dataset_name = "VMware Open Instruct"
335
+ transform_fields = [ "instruction", "response" ]
336
+
337
+
338
+ class KoreanDataset(CalibrationDataset):
339
+ dataset = "korean"
340
+ dataset_config = {
341
+ "path": "beomi/KoAlpaca-v1.1a",
342
+ "split": "train"
343
+ }
344
+ dataset_name = "Korean Alpaca"
345
+ transform_fields = [ "instruction", "output" ]
346
+
347
+
348
+ class CodeDataset(CalibrationDataset):
349
+ dataset = "code"
350
+ dataset_field = "output"
351
+ dataset_config = {
352
+ "path": "nickrosh/Evol-Instruct-Code-80k-v1",
353
+ "split": "train"
354
+ }
355
+ dataset_name = "Evol Instruct Code"
356
+
357
+
358
+ class MultiLanguageDataset(CalibrationDataset):
359
+ dataset = "multi-language"
360
+ dataset_field = "text"
361
+ dataset_config = {
362
+ "path": "papluca/language-identification",
363
+ "split": "train"
364
+ }
365
+ dataset_name = "Language Identification"
366
+
367
+
368
+ class RussianDataset(CalibrationDataset):
369
+ dataset = "russian"
370
+ dataset_config = {
371
+ "path": "Den4ikAI/russian_instructions_2",
372
+ "split": "train"
373
+ }
374
+ dataset_name = "Russian Instructions 2"
375
+ transform_fields = [ "question", "answer" ]
376
+
377
+
378
+ class DutchDataset(CalibrationDataset):
379
+ dataset = "dutch"
380
+ dataset_config = {
381
+ "path": "BramVanroy/dolly-15k-dutch",
382
+ "split": "train"
383
+ }
384
+ dataset_name = "Dolly 15K Dutch"
385
+ transform_fields = [ "instruction", "context", "response" ]
386
+ transform_join = "{field1} {field2} {field3}"
387
+
388
+
389
+ class VietnameseChineseDataset(CalibrationDataset):
390
+ dataset = "vietnamesechinese"
391
+ dataset_config = {
392
+ "path": "nRuaif/Vietnamese_x_Alpaca",
393
+ "split": "train"
394
+ }
395
+ dataset_name = "Vietnamese and Chinese"
396
+
397
+ def get_dataset_url(self) -> None:
398
+ return None
399
+
400
+ def process_samples(self) -> List[str]:
401
+ samples = self.data["output"]
402
+ chinese_samples = CalibrationDataset.get_dataset("chinese").get_samples()
403
+
404
+ joined_list = samples + chinese_samples
405
+
406
+ import random
407
+ random.shuffle(joined_list)
408
+
409
+ return joined_list[:self.dataset_limit]
410
+
411
+
412
+ class VietnameseDataset(CalibrationDataset):
413
+ dataset = "vietnamese"
414
+ dataset_field = "output"
415
+ dataset_config = {
416
+ "path": "nRuaif/Vietnamese_x_Alpaca",
417
+ "split": "train"
418
+ }
419
+ dataset_name = "Alpaca Vietnamese"
420
+
421
+
422
+ class ChineseDataset(CalibrationDataset):
423
+ dataset = "chinese"
424
+ dataset_config = {
425
+ "path": "TigerResearch/tigerbot-alpaca-zh-0.5m",
426
+ "split": "train"
427
+ }
428
+ dataset_name = "Tiger Alpaca ZH"
429
+ transform_fields = [ "instruction", "input", "output" ]
430
+ transform_join = "{field1} {field2} {field3}"
431
+
432
+
433
+ class LatinEnglishDataset(CalibrationDataset):
434
+ dataset = "latin-english"
435
+ dataset_config = {
436
+ "path": "grosenthal/latin_english_parallel",
437
+ "split": "train"
438
+ }
439
+ dataset_name = "Latin English Parallel"
440
+ transform_fields = [ "la", "en" ]
441
+ transform_join = "{field1}\n{field2}"
442
+
443
+
444
+ class PolishDataset(CalibrationDataset):
445
+ dataset = "polish"
446
+ dataset_field = "content"
447
+ dataset_config = {
448
+ "path": "WiktorS/polish-news",
449
+ "split": "train"
450
+ }
451
+ dataset_name = "Polish News"
452
+
453
+
454
+ class JapaneseDataset(CalibrationDataset):
455
+ dataset = "japanese"
456
+ dataset_field = "output"
457
+ dataset_config = {
458
+ "path": "fujiki/japanese_alpaca_data",
459
+ "split": "train"
460
+ }
461
+ dataset_name = "Alpaca Japanese"
462
+
463
+
464
+ class SpanishDataset(CalibrationDataset):
465
+ dataset = "spanish"
466
+ dataset_field = "output"
467
+ dataset_config = {
468
+ "path": "bertin-project/alpaca-spanish",
469
+ "split": "train"
470
+ }
471
+ dataset_name = "Alpaca Spanish"
472
+
473
+
474
+ class GermanDataset(CalibrationDataset):
475
+ dataset = "german"
476
+ dataset_config = {
477
+ "path": "deepset/germanquad",
478
+ "split": "train"
479
+ }
480
+ dataset_name = "German Quad"
481
+
482
+ def process_samples(self) -> List[str]:
483
+ def transform_samples(sample):
484
+ split_context = sample["context"].split("===")
485
+ if len(split_context) >= 3:
486
+ trans_context = split_context[2]
487
+ else:
488
+ trans_context = sample["context"]
489
+ return {"output": trans_context.strip()}
490
+
491
+ return self.data.map(transform_samples)["output"]
492
+
493
+
494
+ class FrenchDataset(CalibrationDataset):
495
+ dataset = "french"
496
+ dataset_field = "text"
497
+ dataset_config = {
498
+ "path": "Kant1/French_Wikipedia_articles",
499
+ "data_files": { "wiki_00.txt" },
500
+ "split": "train"
501
+ }
502
+ dataset_name = "French Wikipedia Articles"
503
+
504
+
+ def validate_dataset(dataset_name: str, **kwargs):
+     for cls in CalibrationDataset.__subclasses__():
+         if hasattr(cls, "dataset") and cls.dataset == dataset_name:
+             return True
+     return False
+
+ # FIXME: a temp function put in for AutoAWQ, pending full refactor where it won't be necessary
+ def get_dataset_url(dataset_name: str):
+     for cls in CalibrationDataset.__subclasses__():
+         if hasattr(cls, "dataset") and cls.dataset == dataset_name:
+             return cls.get_dataset_url()
+     raise ValueError(f"No dataset class found for name: {dataset_name}")
+
+ def get_dataset_name(dataset_name: str):
+     for cls in CalibrationDataset.__subclasses__():
+         if hasattr(cls, "dataset") and cls.dataset == dataset_name:
+             return cls.dataset_name
+     raise ValueError(f"No dataset class found for name: {dataset_name}")
+
+ def test_datasets(datasets: Optional[List[str]] = None, checksum_only=False):
+     import sys
+     from transformers import AutoTokenizer
+     try:
+         failed = []
+         for cls in CalibrationDataset.__subclasses__():
+             if not hasattr(cls, "dataset") or not cls.dataset:
+                 failed.append(cls.__name__)
+         if failed:
+             print(f"The following classes have no 'dataset' attribute: {failed}")
+             sys.exit(-1)
+         else:
+             print("All classes have 'dataset' attribute.")
+
+         print("Enumerating CalibrationDataset classes")
+         classes = CalibrationDataset.__subclasses__()
+         dataset_names = [
+             cls.dataset
+             for cls in classes
+             if cls.dataset and (not datasets or cls.dataset in datasets)
+         ]
+
+         print(f"Found {len(classes)} total dataset classes: {[c.dataset for c in classes]}")
+         if datasets:
+             print(f"Will test {len(dataset_names)} datasets: {dataset_names}")
+
+         print("Starting test: loading Llama-2 tokenizer")
+         tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_fast=True)
+
+         for name in dataset_names:
+             print(f"{name} test: loading dataset.")
+             dataset = CalibrationDataset.get_dataset(name, tokenizer=tokenizer)
+             if not checksum_only:
+                 print(f"{name} test: running tokenize_dataset.")
+                 toks = dataset.tokenize_dataset()
+                 print(f"{name} test: getting dataset_url.")
+                 url = dataset.get_dataset_url()
+                 print(f"{name} - randomized? {dataset.randomize}")
+                 print(
+                     f"{name} - result: cls.data: length: {len(dataset.data)}, "
+                     f"first row length: {len(dataset.data[0])}, "
+                     f"first row data: '{dataset.data[0]}'."
+                 )
+                 print(
+                     f"{name} - result: cls.samples: length: {len(dataset.samples)}, "
+                     f"first row length: {len(dataset.samples[0])}, "
+                     f"first row sample: '{dataset.samples[0]}'."
+                 )
+                 print(
+                     f"{name} - result: tokenize_dataset result: length: {len(toks)}, "
+                     f"length first row input_ids: {len(toks[0]['input_ids'])}."
+                 )
+                 print(
+                     f"{name} - result: dataset_url: {url}"
+                 )
+             checksum = dataset.generate_checksum()
+             print(
+                 f"{name} - result: sha256 checksum: {checksum}"
+             )
+
+     except KeyboardInterrupt:
+         print("Test aborted")
+
+     except Exception as e:
+         print(
+             f"Received an exception during test. Test failed. "
+             f"Exception: {e}"
+         )
+         raise
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Test calibration datasets")
+     parser.add_argument("--datasets", "-d", "-n", nargs="*", type=str, help="Dataset(s) to check; default is all")
+     parser.add_argument("--checksum_only", "-co", action="store_true", help="Only output the checksums for the datasets")
+     args = parser.parse_args()
+
+     test_datasets(args.datasets, checksum_only=args.checksum_only)
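
End-to-end, the call pattern this module expects is: look up a subclass by its `dataset` key, then tokenize. A minimal usage sketch (the tokenizer choice here is an arbitrary example, not something this module prescribes):

from transformers import AutoTokenizer
from src.calibration_datasets import CalibrationDataset

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
dataset = CalibrationDataset.get_dataset(
    "wikitext", num_samples=4, seqlen=512, tokenizer=tokenizer
)
samples = dataset.tokenize_dataset()  # list of {"input_ids": [...], "attention_mask": [...]}
print(len(samples), len(samples[0]["input_ids"]))  # up to 4 samples, each 512 tokens long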
src/medusa_training_script.py ADDED
@@ -0,0 +1,269 @@
+ """
+ Holds the training script for the medusa model.
+
+ Adapted from the original code here: https://github.com/FasterDecoding/Medusa/blob/main/medusa/train/train.py
+ """
+
+ import os
+ from dataclasses import dataclass, field
+ import pathlib
+ from typing import Dict, Optional
+
+ import torch
+ from torch.utils.data import Dataset
+ import transformers
+ from transformers import Trainer, BitsAndBytesConfig
+ from transformers.trainer_pt_utils import LabelSmoother
+ from torch.nn import CrossEntropyLoss
+ from medusa.model.medusa_model import MedusaModel, MedusaConfig
+
+ from calibration_datasets import CalibrationDataset
+
+
+ IGNORE_TOKEN_ID = LabelSmoother.ignore_index
+
+
+ # Customized for training Medusa heads
+ class CustomizedTrainer(Trainer):
+     def compute_loss(self, model, inputs, return_outputs=False):
+         """
+         Compute the training loss for the model.
+
+         Args:
+             model (torch.nn.Module): The model for which to compute the loss.
+             inputs (dict): The input data, including input IDs, attention mask, and labels.
+             return_outputs (bool): Whether to return model outputs along with the loss.
+
+         Returns:
+             Union[float, Tuple[float, torch.Tensor]]: The computed loss, optionally with model outputs.
+         """
+         # DDP will give us model.module
+         if hasattr(model, "module"):
+             medusa = model.module.medusa
+         else:
+             medusa = model.medusa
+
+         logits = model(
+             input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
+         )
+         labels = inputs["labels"]
+         # Shift so that tokens < n predict n
+         loss = 0
+         loss_fct = CrossEntropyLoss()
+         log = {}
+         for i in range(medusa):
+             medusa_logits = logits[i, :, : -(2 + i)].contiguous()
+             medusa_labels = labels[..., 2 + i :].contiguous()
+             medusa_logits = medusa_logits.view(-1, logits.shape[-1])
+             medusa_labels = medusa_labels.view(-1)
+             medusa_labels = medusa_labels.to(medusa_logits.device)
+             loss_i = loss_fct(medusa_logits, medusa_labels)
+             loss += loss_i
+             not_ignore = medusa_labels.ne(IGNORE_TOKEN_ID)
+             medusa_labels = medusa_labels[not_ignore]
+
+             # Add top-k accuracy
+             for k in range(1, 6):
+                 _, topk = medusa_logits.topk(k, dim=-1)
+                 topk = topk[not_ignore]
+                 correct = topk.eq(medusa_labels.unsqueeze(-1)).any(-1)
+                 log[f"medusa{i}_top{k}"] = correct.float().mean().item()
+
+             log[f"medusa{i}_loss"] = loss_i.item()
+         self.log(log)
+         return (loss, logits) if return_outputs else loss
+
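The slicing in `compute_loss` encodes the Medusa training target: head `i` at position `t` is scored against the token at position `t + 2 + i`, so head 0 predicts two tokens ahead (one further than a regular LM head). A toy illustration of just the index arithmetic, not the library API:

import torch

seq_len, i = 10, 0                        # sequence length and head index
positions = torch.arange(seq_len)
logit_positions = positions[: -(2 + i)]   # mirrors logits[i, :, : -(2 + i)]
label_positions = positions[2 + i :]      # mirrors labels[..., 2 + i :]
for t, tgt in zip(logit_positions.tolist(), label_positions.tolist()):
    print(f"head {i}: logits at position {t} -> label token at position {tgt}")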
+
+ @dataclass
+ class ModelArguments:
+     model_name_or_path: Optional[str] = field()
+     load_in_4bit: bool = field(
+         default=False,
+         metadata={"help": "Load in 4 bit."},
+     )
+     load_in_8bit: bool = field(
+         default=False,
+         metadata={"help": "Load in 8 bit."},
+     )
+
+
+ @dataclass
+ class DataArguments:
+     dataset: str = field(
+         metadata={"help": "One of the dataset names in a CalibrationDataset subclass."},
+     )
+
+
+ @dataclass
+ class TrainingArguments(transformers.TrainingArguments):
+     cache_dir: Optional[str] = field(default=None)
+     optim: str = field(default="adamw_torch")
+     model_max_length: int = field(
+         default=2048,
+         metadata={
+             "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
+         },
+     )
+     medusa_num_heads: int = field(
+         default=1,
+         metadata={"help": "Number of Medusa heads."},
+     )
+     medusa_num_layers: int = field(
+         default=1,
+         metadata={"help": "Number of layers for each Medusa head."},
+     )
+
+
+ local_rank = None
+
+
+ def rank0_print(*args):
+     if local_rank == 0:
+         print(*args)
+
+
+ def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
+     """
+     Save the model's state dictionary to a specified directory.
+
+     Args:
+         trainer (transformers.Trainer): The Hugging Face Trainer object.
+         output_dir (str): The directory where the model state dictionary will be saved.
+     """
+     state_dict = trainer.model.state_dict()
+     if trainer.args.should_save:
+         cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
+         del state_dict
+         trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
+
+
+ class SupervisedDataset(Dataset):
+     """Dataset for supervised fine-tuning.
+
+     Args:
+         dataset (str): One of the dataset names in a CalibrationDataset subclass.
+         tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use for data preprocessing.
+     """
+
+     def __init__(self, dataset, tokenizer: transformers.PreTrainedTokenizer):
+         super(SupervisedDataset, self).__init__()
+
+         rank0_print("Formatting inputs...")
+         dataset_classes = CalibrationDataset.__subclasses__()
+         for dataset_class in dataset_classes:
+             if dataset_class.dataset == dataset:
+                 dataset = dataset_class(num_samples=int(1e6), seqlen=tokenizer.model_max_length, tokenizer=tokenizer)
+                 break
+         tokenized = dataset.tokenize_dataset()
+         self.input_ids = torch.tensor([data["input_ids"] for data in tokenized], dtype=torch.long)
+         self.attention_mask = torch.tensor([data["attention_mask"] for data in tokenized], dtype=torch.long)
+
+     def __len__(self):
+         return self.input_ids.shape[0]
+
+     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+         return dict(
+             input_ids=self.input_ids[i],
+             labels=self.input_ids[i],
+             attention_mask=self.attention_mask[i],
+         )
+
+
+ def train():
+     global local_rank
+
+     parser = transformers.HfArgumentParser(
+         (ModelArguments, DataArguments, TrainingArguments)
+     )
+     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+     local_rank = training_args.local_rank
+
+     config = transformers.AutoConfig.from_pretrained(
+         model_args.model_name_or_path,
+         cache_dir=training_args.cache_dir,
+     )
+     config.use_cache = False
+
+     quantization_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_compute_dtype=torch.bfloat16,
+         bnb_4bit_use_double_quant=True,
+         bnb_4bit_quant_type="nf4",
+     )
+
+     # Load model and tokenizer
+     model = transformers.AutoModelForCausalLM.from_pretrained(
+         model_args.model_name_or_path,
+         config=config,
+         cache_dir=training_args.cache_dir,
+         low_cpu_mem_usage=True,
+         torch_dtype=torch.bfloat16,
+         quantization_config=quantization_config if model_args.load_in_4bit else None,
+         load_in_4bit=model_args.load_in_4bit,
+         load_in_8bit=model_args.load_in_8bit,
+     )
+
+     # Freeze the base model
+     for param in model.base_model.parameters():
+         param.requires_grad = False
+
+     # Add Medusa heads
+     medusa_lm_head = MedusaModel(
+         model,
+         medusa_num_heads=training_args.medusa_num_heads,
+         medusa_num_layers=training_args.medusa_num_layers,
+         base_model_name_or_path=model_args.model_name_or_path,
+     )
+
+     # Format output dir
+     training_args.output_dir = f"{training_args.output_dir}_medusa_{model_args.model_name_or_path.split('/')[-1]}"
+
+     tokenizer = transformers.AutoTokenizer.from_pretrained(
+         model_args.model_name_or_path,
+         cache_dir=training_args.cache_dir,
+         model_max_length=training_args.model_max_length,
+         padding_side="right",
+         use_fast=False,
+     )
+     tokenizer.pad_token = tokenizer.unk_token
+
+     # Load data
+     data_module = {"train_dataset": SupervisedDataset(data_args.dataset, tokenizer), "eval_dataset": None}
+
+     # Generate Medusa config for pushing to HF hub
+     medusa_config = MedusaConfig(
+         medusa_num_heads=training_args.medusa_num_heads,
+         medusa_num_layers=training_args.medusa_num_layers,
+         base_model_name_or_path=model_args.model_name_or_path,
+     )
+
+     # Save Medusa config
+     medusa_config.save_pretrained(training_args.output_dir)
+
+     # Start trainer
+     trainer = CustomizedTrainer(
+         model=medusa_lm_head, tokenizer=tokenizer, args=training_args, **data_module
+     )
+
+     if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
+         trainer.train(resume_from_checkpoint=True)
+     else:
+         trainer.train()
+     model.config.use_cache = True
+
+     # Save MedusaHead separately
+     if hasattr(medusa_lm_head, "module"):
+         lm_head = medusa_lm_head.module.medusa_head
+     else:
+         lm_head = medusa_lm_head.medusa_head
+
+     # Save Medusa heads
+     torch.save(
+         lm_head.state_dict(),
+         os.path.join(training_args.output_dir, "medusa_lm_head.pt"),
+     )
+
+
+ if __name__ == "__main__":
+     train()
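
Since the script parses everything through `HfArgumentParser`, its CLI can be exercised without torchrun. A minimal sketch (the argument values are illustrative; it assumes the `medusa-llm` package is installed, and puts `src/` on `sys.path` because the script imports `calibration_datasets` as a top-level module):

import sys
sys.path.insert(0, "src")

import transformers
from medusa_training_script import ModelArguments, DataArguments, TrainingArguments

parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=[
    "--model_name_or_path", "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "--dataset", "wikitext",
    "--output_dir", "medusa_heads",
])
print(data_args.dataset, training_args.medusa_num_heads)  # wikitext 1 (the default)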
medusa_training.py → src/train_workflow.py RENAMED
@@ -1,4 +1,6 @@
- import json
+ """
+ Holds the interface between the gradio app and the medusa training script
+ """
  import os
  import multiprocessing as mp

@@ -9,26 +11,23 @@ import torch
  import torch.distributed.run as distributed_run

  OUTPUT_DIR = "medusa_heads"
- MEDUSA_NUM_HEADS = 3
- MEDUSA_NUM_LAYERS = 1
- LR = 1e-3

  DATASET = "vicuna"

  # These can't be changed (e.g. they control the output path)
  FIXED_TRAINING_ARGS = \
- """medusa/medusa/train/train.py
+ """src/medusa_training_script.py
  --model_name_or_path {model_id}
  --output_dir {output_dir}
  --run_name {model_id}-medusa-{dataset}
- --medusa_num_heads {medusa_num_heads}
- --medusa_num_layers {medusa_num_layers}
- --learning_rate {lr}
- --data_path data/ShareGPT_V4.3_unfiltered_cleaned_split.json"""
+ --dataset {dataset}"""

  # These can be freely changed
  DEFAULT_TRAINING_ARGS = \
- """--bf16 True
+ """--medusa_num_heads 3
+ --medusa_num_layers 1
+ --model_max_length 2048
+ --bf16 True
  --num_train_epochs 1
  --per_device_train_batch_size 64
  --per_device_eval_batch_size 64

@@ -40,19 +39,13 @@ DEFAULT_TRAINING_ARGS = \
  --lr_scheduler_type cosine
  --logging_steps 10
  --tf32 True
- --model_max_length 2048
- --lazy_preprocess True
- --auto_find_batch_size True"""
+ --auto_find_batch_size True
+ --learning_rate 1e-3"""


- def train_medusa_heads(model_id: str, training_args: str):
+ def train_medusa_heads(model_id: str, training_args: str, dataset: str):
      all_training_args = FIXED_TRAINING_ARGS.format(
-         model_id=model_id,
-         output_dir=OUTPUT_DIR,
-         dataset=DATASET,
-         medusa_num_heads=MEDUSA_NUM_HEADS,
-         lr=LR,
-         medusa_num_layers=MEDUSA_NUM_LAYERS
+         model_id=model_id, output_dir=OUTPUT_DIR, dataset=dataset,
      ) + "\n" + training_args
      all_training_arg_list = []
      for arg in all_training_args.split("\n"):

@@ -64,11 +57,11 @@ def train_medusa_heads(model_id: str, training_args: str):
      distributed_run.run(args)


- def run(model_id: str, training_args: str) -> str:
+ def run(model_id: str, training_args: str, dataset: str) -> str:
      print(f"\n\n\nNEW RUN: {model_id}")
      api = HfApi()
      model_name = model_id.split("/")[-1]
-     repo_id = f"joaogante/{model_name}-medusa-{DATASET}"
+     repo_id = f"joaogante/{model_name}-medusa-{dataset}"

      # Input validation
      if model_id == "":

@@ -101,7 +94,7 @@ def run(model_id: str, training_args: str) -> str:

      # Run the medusa heads creation
      try:
-         proc = mp.Process(target=train_medusa_heads, args=(model_id, training_args))
+         proc = mp.Process(target=train_medusa_heads, args=(model_id, training_args, dataset))
          proc.start()
          proc.join()
          print("Medusa heads training process completed (it might have crashed!)")

@@ -117,7 +110,7 @@ def run(model_id: str, training_args: str) -> str:
      try:
          # Folder path from https://github.com/FasterDecoding/Medusa/blob/main/medusa/train/train.py#L399
          folder_path = (
-             f"{OUTPUT_DIR}_medusa_{model_name}"
+             f"{OUTPUT_DIR}_medusa_{model_name}"
          )
          if not any([x for x in os.listdir(folder_path) if len(x) >= 3 and x[-3:] == ".pt"]):
              raise Exception(
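
One detail worth noting: the new `folder_path` format matches the directory committed at the top of this diff. A quick sketch of the name construction (mirroring the output-dir formatting in `src/medusa_training_script.py`):

OUTPUT_DIR = "medusa_heads"
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0".split("/")[-1]
print(f"{OUTPUT_DIR}_medusa_{model_name}")
# medusa_heads_medusa_TinyLlama-1.1B-Chat-v1.0 -- the folder whose config.json is committed above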