Spaces:

OptimalScale
/

Robin-7b

Runtime error

App Files Files Community

hendrydong committed on Apr 23, 2023

Commit

e4f3acf

1 Parent(s): 176f39e

Upload 32 files

Browse files

Files changed (32) hide show

lmflow/.DS_Store +0 -0
lmflow/__init__.py +13 -0
lmflow/args.py +622 -0
lmflow/datasets/__init__.py +7 -0
lmflow/datasets/dataset.py +308 -0
lmflow/models/__init__.py +0 -0
lmflow/models/auto_model.py +24 -0
lmflow/models/base_model.py +12 -0
lmflow/models/decoder_model.py +22 -0
lmflow/models/encoder_decoder_model.py +22 -0
lmflow/models/hf_decoder_model.py +537 -0
lmflow/models/hf_encoder_decoder_model.py +352 -0
lmflow/models/interfaces/__init__.py +0 -0
lmflow/models/interfaces/tunable.py +10 -0
lmflow/models/regression_model.py +11 -0
lmflow/models/text_regression_model.py +57 -0
lmflow/pipeline/__init__.py +0 -0
lmflow/pipeline/auto_pipeline.py +45 -0
lmflow/pipeline/base_aligner.py +21 -0
lmflow/pipeline/base_pipeline.py +9 -0
lmflow/pipeline/base_tuner.py +20 -0
lmflow/pipeline/evaluator.py +387 -0
lmflow/pipeline/finetuner.py +273 -0
lmflow/pipeline/inferencer.py +194 -0
lmflow/pipeline/raft_aligner.py +456 -0
lmflow/pipeline/utils/__init__.py +0 -0
lmflow/pipeline/utils/raft_trainer.py +0 -0
lmflow/utils/__init__.py +0 -0
lmflow/utils/constants.py +141 -0
lmflow/utils/data_utils.py +212 -0
lmflow/version.py +1 -0
requirements.txt +13 -1

lmflow/.DS_Store ADDED Viewed

Binary file (10.2 kB). View file

lmflow/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from .version import __version__ as internal_version
+__version__ = internal_version
+from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
+from lmflow import args, datasets, models, pipeline, utils
+# Fail fast at import time when the installed dependencies are too old.
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.27.0.dev0")
+# The second argument is the hint shown when the `datasets` requirement is not met.
+# NOTE(review): the hinted requirements path looks like it belongs to the transformers examples, not this package - confirm.
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

lmflow/args.py ADDED Viewed

	@@ -0,0 +1,622 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""Argument dataclasses for models, datasets and pipelines.
+This module defines ModelArguments and DatasetArguments, the per-pipeline
+argument classes FinetunerArguments, EvaluatorArguments, InferencerArguments
+and RaftAlignerArguments, and AutoArguments, which maps a pipeline name to its
+argument class through PIPELINE_ARGUMENT_MAPPING.
+It imports dataclass and field from dataclasses, Optional and List from typing,
+require_version from transformers.utils.versions, and
+MODEL_FOR_CAUSAL_LM_MAPPING and TrainingArguments from transformers.
+MODEL_CONFIG_CLASSES is assigned a list of the model config classes from
+MODEL_FOR_CAUSAL_LM_MAPPING. MODEL_TYPES is assigned a tuple of the model types
+extracted from the MODEL_CONFIG_CLASSES.
+"""
+from dataclasses import dataclass, field
+from typing import Optional, List
+from transformers.utils.versions import require_version
+from transformers import (
+    MODEL_FOR_CAUSAL_LM_MAPPING,
+    TrainingArguments,
+)
+# Config classes that are keys of the causal-LM model mapping.
+MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
+# `model_type` of each config class; listed in the `model_type` help text below.
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+@dataclass
+class ModelArguments:
+    """
+    Define a class ModelArguments using the dataclass decorator.
+    The class contains several optional parameters that can be used to configure a model.
+    model_name_or_path : str
+        a string representing the path or name of a pretrained
+        model checkpoint for weights initialization. If None, a model will be trained from scratch.
+    model_type :  str
+        a string representing the type of model to use if training from
+        scratch. If not provided, a pretrained model will be used.
+    config_overrides :  str
+        a string representing the default config settings to override
+        when training a model from scratch.
+    config_name : str
+        a string representing the name or path of the pretrained config to
+        use, if different from the model_name_or_path.
+    tokenizer_name :  str
+        a string representing the name or path of the pretrained tokenizer
+        to use, if different from the model_name_or_path.
+    cache_dir :  str
+        a string representing the path to the directory where pretrained models
+        downloaded from huggingface.co will be stored.
+    use_fast_tokenizer : bool
+        a boolean indicating whether to use a fast tokenizer (backed by the
+        tokenizers library) or not.
+    model_revision :  str
+        a string representing the specific model version to use (can be a
+        branch name, tag name, or commit id).
+    use_auth_token : bool
+        a boolean indicating whether to use the token generated when running
+        huggingface-cli login (necessary to use this script with private models).
+    torch_dtype :  str
+        a string representing the dtype to load the model under. If auto is
+        passed, the dtype will be automatically derived from the model's weights.
+    use_ram_optimized_load : bool
+        a boolean indicating whether to use disk mapping when memory is not
+        enough.
+    """
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch."
+            )
+        },
+    )
+    lora_model_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The incremental model diff introduced by LoRA finetuning."
+                " Along with the original non-finetuned model forms the whole"
+                " finetuned model."
+            )
+        }
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    arch_type: Optional[str] = field(
+        default="decoder_only",
+        metadata={"help": "The architecture type of the model. Currently supported decoder_only or encoder_decoder"}
+    )
+    config_overrides: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Override some existing default config settings when a model is trained from scratch. Example: "
+                "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
+            )
+        },
+    )
+    arch_type: Optional[str] = field(
+        default="decoder_only",
+        metadata={
+            "help": (
+                "Model architecture type, e.g. \"decoder_only\","
+                " \"encoder_decoder\""
+            ),
+            "choices": ["decoder_only", "encoder_decoder", "text_regression"],
+        },
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
+                "with private models)."
+            )
+        },
+    )
+    torch_dtype: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
+                "dtype will be automatically derived from the model's weights."
+            ),
+            "choices": ["auto", "bfloat16", "float16", "float32"],
+        },
+    )
+    use_lora: bool = field(
+        default=False,
+        metadata={"help": "Whether to lora."},
+    )
+    lora_r: int = field(
+        default=8,
+        metadata={"help": "the rank of the lora parameters. The smaller lora_r is , the fewer parameters lora has."},
+    )
+    lora_alpha: int = field(
+        default=32,
+        metadata={"help": "Merging ratio between the fine-tuned model and the original. This is controlled by a parameter called alpha in the paper."},
+    )
+    lora_target_modules: List[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name",
+                              }
+    )
+    lora_dropout: float = field(
+        default=0.1,
+        metadata={"help": "The dropout rate in lora.linear."},
+    )
+    save_aggregated_lora: bool = field(
+        default=False,
+        metadata={"help": "Whether to save aggregated lora."},
+        )
+    use_ram_optimized_load: bool = field(
+        default=True,
+        metadata={"help": "Whether use disk mapping when memory is not enough."}
+    )
+    def __post_init__(self):
+        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
+            raise ValueError(
+                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
+            )
+@dataclass
+class DatasetArguments:
+    """
+    Define a class DatasetArguments using the dataclass decorator.
+    The class contains several optional parameters that can be used to configure a dataset for a language model.
+    dataset_path : str
+        a string representing the path of the dataset to use.
+    dataset_name : str
+        a string representing the name of the dataset to use. The default value is "customized".
+    is_custom_dataset : bool
+        a boolean indicating whether to use custom data. The default value is False.
+    customized_cache_dir : str
+        a string representing the path to the directory where customized dataset caches will be stored.
+    dataset_config_name : str
+        a string representing the configuration name of the dataset to use (via the datasets library).
+    train_file : str
+        a string representing the path to the input training data file (a text file).
+    validation_file : str
+        a string representing the path to the input evaluation data file to evaluate the perplexity on (a text file).
+    max_train_samples : int
+        an integer indicating the maximum number of training examples to use for debugging or quicker training.
+        If set, the training dataset will be truncated to this number.
+    max_eval_samples: int
+        an integer indicating the maximum number of evaluation examples to use for debugging or quicker training.
+        If set, the evaluation dataset will be truncated to this number.
+    streaming : bool
+        a boolean indicating whether to enable streaming mode.
+    block_size: int
+        an integer indicating the optional input sequence length after tokenization. The training dataset will be
+        truncated in blocks of this size for training.
+    The class also includes some additional parameters that can be used to configure the dataset further, such as `overwrite_cache`,
+    `validation_split_percentage`, `preprocessing_num_workers`, `disable_group_texts`, `demo_example_in_prompt`, `explanation_in_prompt`,
+    `keep_linebreaks`, and `prompt_structure`.
+    The field function is used to set default values and provide help messages for each parameter. The Optional type hint is
+    used to indicate that a parameter is optional. The metadata argument is used to provide additional information about
+    each parameter, such as a help message.
+    """
+    dataset_path: Optional[str] = field(
+        default=None, metadata={"help": "The path of the dataset to use."}
+    )
+    dataset_name: Optional[str] = field(
+        default="customized", metadata={"help": "Should be \"customized\""}
+    )
+    is_custom_dataset: Optional[bool] = field(
+        default=False, metadata={"help": "whether to use custom data"}
+    )
+    customized_cache_dir: Optional[str] = field(
+        default=".cache/llm-ft/datasets",
+        metadata={"help": "Where do you want to store the customized dataset caches"},
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of training examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=1e10,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+                "value if set."
+            )
+        },
+    )
+    streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
+    block_size: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Optional input sequence length after tokenization. "
+                "The training dataset will be truncated in block of this size for training. "
+                "Default to the model max input length for single sentence inputs (take into account special tokens)."
+            )
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    disable_group_texts: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether we group original samples together to generate sample"
+                " sequences of length `block_size`. By default, we group every"
+                " 1000 tokenized sequences together, divide them into "
+                " [{total_num_tokens} / {block_size}] sequences, each with"
+                " `block_size` tokens (the remaining tokens are ommited."
+                " If this flag is set to True, we only group 1 tokenized"
+                " sequence, i.e. cutting long sequence into chunks."
+            )
+        },
+    )
+    keep_linebreaks: bool = field(
+        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
+    )
+    test_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "Evaluation File Path"},
+    )
+    def __post_init__(self):
+        if self.streaming:
+            require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")
+        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+@dataclass
+class FinetunerArguments(TrainingArguments):
+    """
+    Adapt transformers.TrainingArguments
+    """
+    pass
+@dataclass
+class EvaluatorArguments:
+    """
+    Define a class EvaluatorArguments using the dataclass decorator. The class contains several optional
+    parameters that can be used to configure a evaluator.
+    local_rank : str
+        For distributed training: local_rank
+    random_shuffle : bool
+    use_wandb : bool
+    random_seed : int, default = 1
+    output_dir : str, default = './output_dir',
+    mixed_precision : str, choice from ["bf16","fp16"].
+        mixed precision mode, whether to use bf16 or fp16
+    deepspeed :
+        Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already
+        loaded json file as a dict
+    """
+    local_rank: int = field(
+        default=-1,
+        metadata={"help": "For distributed training: local_rank"
+        }
+    )
+    random_shuffle: Optional[bool] = field(
+        default=False,
+        metadata={"help": ""
+        }
+    )
+    use_wandb: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": (
+                "When this flag is True, wandb will be enabled"
+            )
+        },
+    )
+    random_seed: Optional[int] = field(
+        default=1,
+        metadata={
+            "help": (
+                "used to set random seed"
+            )
+        },
+    )
+    output_dir: Optional[str] = field(
+        default="./output_dir",
+        metadata={"help": "Output path for the inferenced results"},
+    )
+    mixed_precision: Optional[str] = field(
+        default="bf16",
+        metadata={
+            "help": (
+                "mixed precision mode, whether to use bf16 or fp16"
+            ),
+            "choices": ["bf16","fp16"],
+        },
+    )
+    deepspeed: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already"
+                " loaded json file as a dict"
+            )
+        },
+    )
+    answer_type: Optional[str] = field(
+        default="text",
+        metadata={
+            "help": (
+                'Question type for answer extraction from the decoder output.'
+                ' Supported types: \n'
+                '   1) "multiple_choice", e.g. A, B, C, D, ...\n'
+                '   2) "binary_choice", e.g. yes, no, maybe\n'
+                '   3) "math", e.g. 1.0, -3.52\n'
+                '   4) "text", e.g. "I think that it is okay"\n'
+                '   5) Special treatment for several datasets\n'
+                '     - "gsm8k"\n'
+                '     - "svamp"\n'
+                '     - "asdiv"\n'
+                '     - "addsub"\n'
+                '     - "singleeq"\n'
+                '     - "multiarith"\n'
+                '     - "aqua"\n'
+                '     - "csqa"\n'
+                '     - "strategyqa"\n'
+                '     - "pubmedqa"\n'
+                '     - "medmcqa"\n'
+                '     - "usmle"\n'
+            )
+        },
+    )
+    prompt_structure: Optional[str] = field(
+        default="{input}",
+        metadata={
+            "help": (
+                'Prompt structure to facilitate prompt engineering during'
+                ' inference. The model will receive'
+                ' `prompt_structure.format(input=input)` as its input.'
+            )
+        },
+    )
+    evaluate_block_size: Optional[int] = field(
+        default=512,
+        metadata={
+            "help": (
+                "the model will have at least block_size tokens for context when calculating the conditional likelihood of any one token"
+                " (provided there are block_size preceding tokens available to condition on)"
+            )
+        },
+    )
+    metric: Optional[str] = field(
+        default="accuracy",
+        metadata={
+            "help": "the metric the model will be evaluated on",
+            "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood"],
+        },
+    )
+@dataclass
+class InferencerArguments:
+    """
+    Define a class InferencerArguments using the dataclass decorator. The class contains several optional
+    parameters that can be used to configure a inferencer.
+    local_rank : str
+        For distributed training: local_rank
+    random_seed : int, default = 1
+    deepspeed :
+        Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already
+        loaded json file as a dict
+    mixed_precision : str, choice from ["bf16","fp16"].
+        mixed precision mode, whether to use bf16 or fp16
+    """
+    device: str = field(
+        default="gpu",
+        metadata={
+            "help": "device of chatbot",
+            "choices": ["gpu", "cpu"],
+        },
+    )
+    local_rank: int = field(
+        default=-1,
+        metadata={"help": "For distributed training: local_rank"
+        }
+    )
+    random_seed: Optional[int] = field(
+        default=1,
+        metadata={
+            "help": (
+                "used to set random seed"
+            )
+        },
+    )
+    deepspeed: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already"
+                " loaded json file as a dict"
+            )
+        },
+    )
+    mixed_precision: Optional[str] = field(
+        default="bf16",
+        metadata={
+            "help": (
+                "mixed precision mode, whether to use bf16 or fp16"
+            ),
+            "choices": ["bf16","fp16"],
+        },
+    )
+@dataclass
+class RaftAlignerArguments(TrainingArguments):
+    """
+    Define a class RaftAlignerArguments to configure raft aligner.
+    """
+    output_reward_path: Optional[str] = field(
+        default="tmp/raft_aligner/",
+        metadata={
+            "help": "The path of output rewards."
+        }
+    )
+    output_min_length: Optional[int] = field(
+        default=16,
+        metadata={
+            "help": (
+                "minimum length of the output token sequence generated from"
+                " model given an input."
+            ),
+        },
+    )
+    output_max_length: Optional[int] = field(
+        default=48,
+        metadata={
+            "help": (
+                "maximum length of the output token sequence generated from"
+                " model given an output."
+            ),
+        },
+    )
+    num_raft_iteration: Optional[int] = field(
+        default=20,
+        metadata={
+            "help": "number of iterations of the raft aligner."
+        },
+    )
+    raft_batch_size: Optional[int] = field(
+        default=320,
+        metadata={
+            "help": (
+                "only select {raft_batch_size} samples each time to"
+                " generate rewards and be ranked for STF training."
+            )
+        },
+    )
+    top_reward_percentage: Optional[int] = field(
+        default=0.2,
+        metadata={
+            "help": (
+                "only top {top_reward_percentage} samples in the raft batch,"
+                " (in terms of rewards), will be used for SFT the model."
+            ),
+        },
+    )
+    inference_batch_size_per_device: Optional[int] = field(
+        default=1,
+        metadata={
+            "help": (
+                "every device will infer {inference_batch_size_per_device}"
+                " samples in parallel. The inferred results will be concatenaed"
+                " with inputs and attach a reward."
+            ),
+        },
+    )
+PIPELINE_ARGUMENT_MAPPING = {
+    "finetuner": FinetunerArguments,
+    "evaluator": EvaluatorArguments,
+    "inferencer": InferencerArguments,
+    "raft_aligner": RaftAlignerArguments,
+}
+class AutoArguments:
+    """
+    Automatically choose arguments from FinetunerArguments or EvaluatorArguments.
+    """
+    def get_pipeline_args_class(pipeline_name: str):
+        return PIPELINE_ARGUMENT_MAPPING[pipeline_name]

lmflow/datasets/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""This Python code defines a class Dataset with methods for initializing, loading,
+and manipulating datasets from different backends such as Hugging Face and JSON.
+The `Dataset` class includes methods for loading datasets from a dictionary and a Hugging
+Face dataset, mapping datasets, and retrieving the backend dataset and arguments.
+"""
+from lmflow.datasets.dataset import Dataset

lmflow/datasets/dataset.py ADDED Viewed

	@@ -0,0 +1,308 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""This Python code defines a class Dataset with methods for initializing, loading,
+and manipulating datasets from different backends such as Hugging Face and JSON.
+The `Dataset` class includes methods for loading datasets from a dictionary and a Hugging
+Face dataset, mapping datasets, and retrieving the backend dataset and arguments.
+"""
+# Importing necessary libraries and modules
+import json
+from pathlib import Path
+from typing import Optional
+from datasets import load_dataset
+from datasets import Dataset as HFDataset
+from lmflow.args import DatasetArguments
+# Dataset type names known to this module.
+DATASET_TYPES = [
+    "text_only",
+    "text2text",
+]
+# Top-level key of a dataset dict / JSON file that holds the dataset type.
+KEY_TYPE = "type"
+# Top-level key of a dataset dict / JSON file that holds the list of instances.
+KEY_INSTANCES = "instances"
+class Dataset:
+    r"""
+    Initializes the Dataset object with the given parameters.
+    Parameters
+    ------------
+    data_args : DatasetArguments object.
+        Contains the arguments required to load the dataset.
+    backend : str,  default="huggingface"
+        A string representing the dataset backend. Defaults to "huggingface".
+    args : Optional.
+        Positional arguments.
+    kwargs : Optional.
+        Keyword arguments.
+    """
+    def __init__(self, data_args=None, backend: str="huggingface", *args, **kwargs):
+        self.data_args = data_args
+        self.backend = backend
+        self.backend_dataset = None
+        self.type = None        # Original type of the dataset
+        self.dataset_path = data_args.dataset_path
+        if data_args.dataset_path is None:
+            return
+        if backend == "huggingface":
+            data_files = [
+                x.absolute().as_posix()
+                 for x in Path(self.dataset_path).glob("*.json")
+            ]
+            # Iterate through all the files and ensure they have the same data type
+            for single_file in data_files:
+                with open(single_file) as fin:
+                    json_data = json.load(fin)
+                    if KEY_TYPE not in json_data.keys():
+                        raise ValueError(
+                            f'"{KEY_TYPE}" field must be specified for data, e.g.'
+                            '{\n'
+                            f'   "{KEY_TYPE}: "text_only",\n'
+                            f'   "{KEY_INSTANCES}": [\n'
+                            '       { "text": "Sentence 1: This is a sentence." }\n'
+                            '       { "text": "Sentence 2: This is another sentence." }\n'
+                            f'   ]\n'
+                            '}'
+                        )
+                    if self.type is None:
+                        self.type = json_data[KEY_TYPE]
+                    elif self.type != json_data[KEY_TYPE]:
+                        raise ValueError(
+                            'All task files must have same data types. Previous'
+                            f' files have type "{self.type}", but in file'
+                            f' {single_file}, it has type "{self.type}".'
+                        )
+            # Load the dataset using the HuggingFace dataset library
+            extensions = "json"
+            raw_dataset = load_dataset(
+                extensions,
+                data_files=data_files,
+                field=KEY_INSTANCES,
+                split="train",
+                use_auth_token=None,
+            )
+            self.backend_dataset = raw_dataset
+        elif backend == "json":
+            # TODO (@Jiachun)
+            pass
+        else:
+            raise NotImplementedError(f'Unsupported dataset backend "{backend}"')
+    def _check_data_type(self):
+        """Placeholder for data type validation; currently does nothing."""
+        # TODO: check if data type and data structure matches, raise messages
+        # with hints
+        pass
+    def from_dict(self, dict_obj: dict, *args, **kwargs):
+        r"""
+        Create a Dataset object from a dictionary.
+        Return a Dataset given a dict with format:
+            {
+                "type": TYPE,
+                "instances": [
+                    {
+                        "key_1": VALUE_1.1,
+                        "key_2": VALUE_1.2,
+                        ...
+                    },
+                    {
+                        "key_1": VALUE_2.1,
+                        "key_2": VALUE_2.2,
+                        ...
+                    },
+                    ...
+                ]
+            }
+        Parameters
+        -----------
+        dict_obj : dict.
+            A dictionary containing the dataset information.
+        args : Optional.
+            Positional arguments.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ---------
+        self : Dataset object.
+        """
+        if self.backend == "huggingface":
+            if KEY_TYPE not in dict_obj:
+                raise ValueError(
+                    f'"{KEY_TYPE}" must be provided to initialize a dataset'
+                )
+            if KEY_INSTANCES not in dict_obj:
+                raise ValueError(
+                    f'"{KEY_INSTANCES}" must be provided to initialize a dataset'
+                )
+            self.type = dict_obj[KEY_TYPE]
+            hf_dict = {}
+            if len(dict_obj[KEY_INSTANCES]) > 0:
+                for key in dict_obj[KEY_INSTANCES][0].keys():
+                    hf_dict[key] = [ instance[key] for instance in dict_obj[KEY_INSTANCES] ]
+            self.backend_dataset = HFDataset.from_dict(hf_dict, *args, **kwargs)
+            return self
+        else:
+            raise NotImplementedError(
+                f'Currently .from_dict is not supported for backend "{backend}"'
+            )
+    @classmethod
+    def create_from_dict(cls, dict_obj, *args, **kwargs):
+        r"""
+        Returns
+        --------
+        Returns a Dataset object given a dict.
+        """
+        empty_data_args = DatasetArguments(dataset_path=None)
+        dataset = Dataset(empty_data_args)
+        return dataset.from_dict(dict_obj)
+    def to_dict(self):
+        r"""
+        Returns
+        ---------
+        Return a dict represents the dataset:
+            {
+                "type": TYPE,
+                "instances": [
+                    {
+                        "key_1": VALUE_1.1,
+                        "key_2": VALUE_1.2,
+                        ...
+                    },
+                    {
+                        "key_1": VALUE_2.1,
+                        "key_2": VALUE_2.2,
+                        ...
+                    },
+                    ...
+                ]
+            }
+        A python dict object represents the content of this dataset.
+        """
+        if self.backend == "huggingface":
+            dict_obj = {}
+            dict_obj[KEY_TYPE] = self.get_type()
+            hf_dict = self.backend_dataset.to_dict()
+            dict_obj[KEY_INSTANCES] = []
+            first_key = None
+            for key in hf_dict.keys():
+                first_key = key
+                break
+            if first_key is not None:
+                num_instances = len(hf_dict[first_key])
+                dict_obj[KEY_INSTANCES] = [
+                    {
+                        key: hf_dict[key][i] for key in hf_dict.keys()
+                    }
+                    for i in range(num_instances)
+                ]
+            return dict_obj
+        else:
+            raise NotImplementedError(
+                f'Current .to_dict is not supported for backend "{backend}"'
+            )
+    def map(self, *args, **kwargs):
+        r"""
+        Parameters
+        ------------
+        args : Optional.
+            Positional arguments.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ---------
+        self : Dataset object.
+        """
+        # If the dataset uses Hugging Face as the backend,
+        # call the `map()` function of the Hugging Face backend dataset
+        if self.backend == "huggingface":
+            # Set the mapped dataset as the backend dataset of the current dataset
+            mapped_backend_dataset = self.backend_dataset.map(*args, **kwargs)
+            self.backend_dataset = mapped_backend_dataset
+            return self
+        else:
+            # If the backend is not Hugging Face, raise a NotImplementedError
+            raise NotImplementedError(
+                f'Currently .map is not supported for backend "{backend}"'
+            )
+    def get_backend(self) -> Optional[str]:
+        r"""
+        Returns
+        ---------
+        self.backend
+        """
+        return self.backend
+    def get_backend_dataset(self):
+        r"""
+        Returns
+        ---------
+        self.backend_dataset
+        """
+        return self.backend_dataset
+    def get_data_args(self):
+        r"""
+        Returns
+        ---------
+        self.data_args
+        """
+        return self.data_args
+    def get_type(self):
+        r"""
+        Returns
+        ---------
+        self.type
+        """
+        return self.type

lmflow/models/__init__.py ADDED Viewed

File without changes

lmflow/models/auto_model.py ADDED Viewed

	@@ -0,0 +1,24 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""Automatically get correct model type.
+"""
+from lmflow.models.hf_decoder_model import HFDecoderModel
+from lmflow.models.text_regression_model import TextRegressionModel
+from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel
+class AutoModel:
+    @classmethod
+    def get_model(self, model_args, *args, **kwargs):
+        arch_type = model_args.arch_type
+        if arch_type == "decoder_only":
+            return HFDecoderModel(model_args, *args, **kwargs)
+        elif arch_type == "text_regression":
+            return TextRegressionModel(model_args, *args, **kwargs)
+        elif arch_type == "encoder_decoder":
+            return HFEncoderDecoderModel(model_args, *args, **kwargs)
+        else:
+            raise NotImplementedError(
+                f"model architecture type \"{arch_type}\" is not supported"
+            )

lmflow/models/base_model.py ADDED Viewed

	@@ -0,0 +1,12 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""Base model class.
+"""
+from abc import ABC
+class BaseModel(ABC):
+    def __init__(self, *args, **kwargs):
+        pass

lmflow/models/decoder_model.py ADDED Viewed

	@@ -0,0 +1,22 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""Base class for decoder-only models.
+"""
+from lmflow.models.base_model import BaseModel
+class DecoderModel(BaseModel):
+    def __init__(self, *args, **kwargs):
+        pass

lmflow/models/encoder_decoder_model.py ADDED Viewed

	@@ -0,0 +1,22 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""Base class for encoder-decoder models.
+"""
+from lmflow.models.base_model import BaseModel
+class EncoderDecoderModel(BaseModel):
+    def __init__(self, *args, **kwargs):
+        pass

lmflow/models/hf_decoder_model.py ADDED Viewed

	@@ -0,0 +1,537 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""This is a class called HFDecoderModel which is a wrapper around transformers model and
+tokenizer classes. It has several methods such as __init__, tokenize, and train that are
+used for training and fine-tuning the model. The __init__ method takes in several arguments
+such as model_args, tune_strategy, and ds_config, which are used to load the pretrained
+model and tokenizer, and initialize the training settings.
+The tokenize method is used to tokenize the input text and return the input IDs and attention
+masks that can be fed to the model for training or inference.
+This class supports different tune_strategy options such as 'normal', 'none', 'lora', and
+'adapter', which allow for different fine-tuning settings of the model. However, the 'lora'
+and 'adapter' strategies are not yet implemented.
+Overall, this class provides a convenient interface for loading and fine-tuning transformer
+models and can be used for various NLP tasks such as language modeling, text classification,
+and question answering.
+"""
+import logging
+from typing import List, Union
+import deepspeed
+from peft import (
+    LoraConfig,
+    PeftModel,
+    TaskType,
+    get_peft_config,
+    get_peft_model,
+)
+import torch
+import transformers
+from transformers.deepspeed import HfDeepSpeedConfig
+from transformers.testing_utils import CaptureLogger
+from transformers import (
+    CONFIG_MAPPING,
+    AutoConfig,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+)
+from lmflow.datasets.dataset import Dataset
+from lmflow.models.decoder_model import DecoderModel
+from lmflow.models.interfaces.tunable import Tunable
+from lmflow.utils.constants import (
+    TEXT_ONLY_DATASET_DESCRIPTION,
+    TEXT2TEXT_DATASET_DESCRIPTION,
+)
+logger = logging.getLogger(__name__)
+class HFDecoderModel(DecoderModel, Tunable):
+    r"""
+    Initializes a HFDecoderModel instance.
+    Parameters
+    ------------
+    model_args :
+        Model arguments such as model name, path, revision, etc.
+    tune_strategy : str or none,  default="normal".
+        A string representing the tuning strategy: "normal", "none", "lora" or "adapter".
+    ds_config :
+        Deepspeed configurations.
+    args : Optional.
+        Positional arguments.
+    kwargs : Optional.
+        Keyword arguments.
+    """
+    def __init__(
+        self,
+        model_args,
+        tune_strategy='normal',
+        ds_config=None,
+        device="gpu",
+        *args,
+        **kwargs
+    ):
+        """
+        Initializes a HFDecoderModel instance.
+        :param model_args: dictionary with model arguments such as model name, path, revision, etc.
+        :param tune_strategy: tuning strategy: normal, none, lora or adapter
+        :param ds_config: deepspeed configuration for distributed training
+        """
+        # See more about loading any type of standard or custom dataset (from
+        # files, python dict, pandas DataFrame, etc) at
+        # https://huggingface.co/docs/datasets/loading_datasets.html.
+        # Load pretrained model and tokenizer
+        #
+        # Distributed training: The .from_pretrained methods guarantee that
+        # only one local process can concurrently download model & vocab.
+        self.device = device
+        self.model_args = model_args
+        torch_dtype = (
+            model_args.torch_dtype
+            if model_args.torch_dtype in ["auto", None]
+            else getattr(torch, model_args.torch_dtype)
+        )
+        if tune_strategy == 'normal':
+            config_kwargs = {
+                "cache_dir": model_args.cache_dir,
+                "revision": model_args.model_revision,
+                "use_auth_token": True if model_args.use_auth_token else None,
+            }
+            if model_args.config_name:
+                config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
+            elif model_args.model_name_or_path:
+                config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
+            else:
+                config = CONFIG_MAPPING[model_args.model_type]()
+                logger.warning("You are instantiating a new config instance from scratch.")
+                if model_args.config_overrides is not None:
+                    logger.info(f"Overriding config: {model_args.config_overrides}")
+                    config.update_from_string(model_args.config_overrides)
+                    logger.info(f"New config: {config}")
+            tokenizer_kwargs = {
+                "cache_dir": model_args.cache_dir,
+                "use_fast": model_args.use_fast_tokenizer,
+                "revision": model_args.model_revision,
+                "use_auth_token": True if model_args.use_auth_token else None,
+            }
+            if model_args.tokenizer_name:
+                tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
+            elif model_args.model_name_or_path:
+                tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
+            else:
+                raise ValueError(
+                    "You are instantiating a new tokenizer from scratch. This is"
+                    " not supported by this script. You can do it from another"
+                    " script, save it, and load it from here, using"
+                    " --tokenizer_name."
+                )
+            if model_args.model_name_or_path:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_args.model_name_or_path,
+                    from_tf=bool(".ckpt" in model_args.model_name_or_path),
+                    config=config,
+                    cache_dir=model_args.cache_dir,
+                    revision=model_args.model_revision,
+                    use_auth_token=True if model_args.use_auth_token else None,
+                    torch_dtype=torch_dtype,
+                )
+            else:
+                model = AutoModelForCausalLM.from_config(config)
+                n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
+                logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
+            self.backend_model_full = model
+            if model_args.use_lora:
+                if model_args.lora_target_modules:
+                    lora_target_modules = model_args.lora_target_modules
+                else:
+                    lora_target_modules = None
+                peft_config = LoraConfig(
+                    task_type=TaskType.CAUSAL_LM,
+                    inference_mode=False,
+                    r=model_args.lora_r,
+                    lora_alpha=model_args.lora_alpha,
+                    lora_dropout=model_args.lora_dropout,
+                    target_modules=lora_target_modules,
+                )
+                model = get_peft_model(model, peft_config)
+                model.print_trainable_parameters()
+            # We resize the embeddings only when necessary to avoid index errors.
+            # If you are creating a model from scratch on a small vocab and want a
+            # smaller embedding size, remove this test.
+            embedding_size = model.get_input_embeddings().weight.shape[0]
+            if len(tokenizer) > embedding_size:
+                model.resize_token_embeddings(len(tokenizer))
+            self.config = config
+            self.backend_model = model
+            self.tokenizer = tokenizer
+            self.tune_strategy = tune_strategy
+        elif tune_strategy == 'none':
+            peft_model_id = model_args.lora_model_path
+            # NOTE: Currently offload is not supported by llama
+            if "llama" in model_args.model_name_or_path and model_args.use_ram_optimized_load:
+                logger.warning(
+                    "llama does not support RAM optimized load. Automatically"
+                    " use original load instead."
+                )
+                model_args.use_ram_optimized_load = False
+            if model_args.use_ram_optimized_load and peft_model_id is None:
+                try:
+                    # RAM-optimized load
+                    self.backend_model = AutoModelForCausalLM.from_pretrained(
+                        model_args.model_name_or_path,
+                        device_map="auto",
+                        offload_folder="offload",
+                        offload_state_dict=True,
+                        torch_dtype=torch_dtype,
+                    )
+                except:
+                    logger.warning(
+                        "Failed to use RAM optimized load. Automatically"
+                        " use original load instead."
+                    )
+                    # Normal load
+                    self.backend_model = AutoModelForCausalLM.from_pretrained(
+                        model_args.model_name_or_path,
+                        torch_dtype=torch_dtype,
+                    )
+            else:
+                if peft_model_id is not None:
+                    logger.warning(
+                        "LoRA does not support RAM optimized load currently."
+                        " Automatically use original load instead."
+                    )
+                self.backend_model = AutoModelForCausalLM.from_pretrained(
+                    model_args.model_name_or_path,
+                    torch_dtype=torch_dtype,
+                )
+            self.tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+            self.backend_model_full = self.backend_model
+            if peft_model_id is not None:
+                self.backend_model = PeftModel.from_pretrained(
+                    self.backend_model, peft_model_id
+                )
+            if device == "gpu":
+                deepspeed.init_distributed()
+                self.ds_engine = deepspeed.initialize(model=self.backend_model, config_params=ds_config)[0]
+                self.ds_engine.module.eval()
+        elif tune_strategy == 'adapter':
+            raise NotImplementedError('adapter tune strategy not implemented')
+    def tokenize(self, dataset, add_special_tokens=True, *args, **kwargs):
+        """
+        Tokenize the full dataset.
+        Parameters
+        ------------
+        dataset : lmflow.datasets.Dataset.
+        args : Optional.
+            Positional arguments.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ------------
+        tokenized_datasets :
+            The tokenized dataset, without any leading or trailing special
+            tokens (normally they are Begin-Of-Sentence or End-Of-Sentence
+            tokens).
+        """
+        # Preprocessing the datasets.
+        # First we tokenize all the texts.
+        if dataset.get_backend() != "huggingface":
+            raise NotImplementedError(
+                "tokenization of datasets with non-huggingface backend are"
+                "not supported yet"
+            )
+        dataset_type = dataset.get_type()
+        # Requires three types of information for tokenizing different datasets
+        #   1) Which fields require tokenization, e.g.
+        #        "text2float": "text", but not "float"
+        #        "text2text": both "input" and "output"
+        #   2) How will there tokenized sequence concatenated together, e.g.
+        #        "text_only": "text" -> "text"
+        #        "text2text": "input", "output" -> "input" + "output"
+        #   3) Which fields require loss in final computation, e.g.
+        #        "text_only": "text"
+        #        "text2text": "output" only
+        tokenized_column_order = None       # Handles 1) and 2)
+        label_columns = None                # Handles 3)
+        if dataset_type == "text_only":
+            tokenized_column_order = ["text"]
+            label_columns = ["text"]
+        elif dataset_type == "text2text":
+            tokenized_column_order = ["input", "output"]
+            label_columns = ["output"]
+        else:
+            raise NotImplementedError(
+                f"dataset type \"{dataset_type}\" is not supported, currently"
+                " only support following data types:\n"
+                f"    1) {TEXT_ONLY_DATASET_DESCRIPTION}\n"
+                f"    2) {TEXT2TEXT_DATASET_DESCRIPTION}\n"
+            )
+        model_args = self.model_args
+        raw_datasets = dataset
+        hf_raw_datasets = dataset.get_backend_dataset()
+        column_names = list(hf_raw_datasets.features)
+        # since this will be pickled to avoid _LazyModule error in Hasher force
+        # logger loading before tokenize_function
+        tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
+        def tokenize_function(examples):
+            num_example = len(examples[column_names[0]])
+            token_dict = {
+                "input_ids": [[] for _ in range(num_example)],
+                "attention_mask": [[] for _ in range(num_example)],
+                "labels": [[] for _ in range(num_example)],
+            }
+            with CaptureLogger(tok_logger) as cl:
+                for column_name in tokenized_column_order:
+                    encoding = self.tokenizer(
+                        examples[column_name],
+                        add_special_tokens=add_special_tokens,
+                        truncation=True if model_args.use_lora else None,
+                    )
+                    if column_name in label_columns:
+                        labels = encoding["input_ids"].copy()
+                    else:
+                        labels = [
+                            [-100] * len(encoding["input_ids"][i])
+                             for i in range(num_example)
+                        ]
+                    for i in range(num_example):
+                        token_dict["input_ids"][i].extend(
+                            encoding["input_ids"][i]
+                        )
+                        token_dict["attention_mask"][i].extend(
+                            encoding["attention_mask"][i]
+                        )
+                        token_dict["labels"][i].extend(labels[i])
+            # clm input could be much much longer than block_size
+            if "Token indices sequence length is longer than the" in cl.out:
+                tok_logger.warning(
+                    "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
+                    " before being passed to the model."
+                )
+            return token_dict
+        data_args = raw_datasets.get_data_args()
+        if not data_args.streaming:
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on dataset",
+            )
+        else:
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                remove_columns=column_names,
+            )
+        return tokenized_datasets
+    def encode(self, input: Union[str, List[str]], *args, **kwargs ) -> Union[List[int], List[List[int]]]:
+        """
+        Perform encoding process of the tokenizer.
+        Parameters
+        ------------
+        inputs : str or list.
+            The text sequence.
+        args : Optional.
+            Positional arguments.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ------------
+        outputs :
+            The tokenized inputs.
+        """
+        if isinstance(input, list):
+            output = []
+            for single_input in input:
+                single_output = self.encode(single_input, *args, **kwargs)
+                output.append(single_output)
+            return output
+        elif isinstance(input, str):
+            return self.tokenizer.encode(text=input, *args, **kwargs)
+        else:
+            raise NotImplementedError(f'type "{type(input)}" cannot be encoded')
+    def decode(self, input, *args, **kwargs ) -> Union[str, List[str]]:
+        """
+        Perform decoding process of the tokenizer.
+        Parameters
+        ------------
+        inputs : list.
+            The token sequence.
+        args : Optional.
+            Positional arguments.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ------------
+        outputs :
+            The text decoded from the token inputs.
+        """
+        if isinstance(input, list) and input and isinstance(input[0], list):
+            output = []
+            for single_input in input:
+                single_output = self.decode(single_input, *args, **kwargs)
+                output.append(single_output)
+            return output
+        else:
+            # Can be list of ints or a Tensor
+            return self.tokenizer.decode(input, *args, **kwargs)
+    def inference(self, inputs, *args, **kwargs):
+        """
+        Perform generation process of the model.
+        Parameters
+        ------------
+        inputs :
+            The sequence used as a prompt for the generation or as model inputs to the model.
+        args : Optional.
+            Positional arguments.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ------------
+        outputs :
+            The generated sequence output
+        """
+        with torch.no_grad():
+            if self.device == "gpu":
+                outputs = self.ds_engine.module.generate(
+                    input_ids=inputs,
+                    synced_gpus=True,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    *args,
+                    **kwargs
+                )
+            elif self.device == "cpu":
+                outputs = self.backend_model.generate(
+                    input_ids=inputs,
+                    synced_gpus=True,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    *args,
+                    **kwargs
+                )
+            else:
+                raise NotImplementedError(
+                    f"device \"{self.device}\" is not supported"
+                )
+        return outputs
+    def merge_lora_weights(self):
+        if self.model_args.use_lora:
+            self.get_backend_model().merge_and_unload()
+        else:
+            logger.warning("LoRA training is NOT enabled. Merging LoRA weights is not applicable.")
+    def save(self, dir, save_full_model=False, *args, **kwargs):
+        """
+        Perform generation process of the model.
+        Parameters
+        ------------
+        dir :
+            The directory to save model and tokenizer
+        save_full_model : Optional.
+            Whether to save full model.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ------------
+        outputs :
+            The generated sequence output
+        """
+        self.get_tokenizer().save_pretrained(dir)
+        if save_full_model and self.model_args.use_lora:
+            self.backend_model_full.save_pretrained(dir)
+        else:
+            self.get_backend_model().save_pretrained(dir)
+    def get_max_length(self):
+        """
+        Return max acceptable input length in terms of tokens.
+        """
+        return self.tokenizer.model_max_length
+    def get_tokenizer(self):
+        """
+        Return the tokenizer of the model.
+        """
+        return self.tokenizer
+    def get_backend_model(self):
+        """
+        Return the backend model.
+        """
+        return self.backend_model

lmflow/models/hf_encoder_decoder_model.py ADDED Viewed

	@@ -0,0 +1,352 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""This is a class called HFEncoderDecoderModel which is a wrapper around transformers model and
+tokenizer classes. It has several methods such as __init__, tokenize, and train that are
+used for training and fine-tuning the model. The __init__ method takes in several arguments
+such as model_args, tune_strategy, and ds_config, which are used to load the pretrained
+model and tokenizer, and initialize the training settings.
+The tokenize method is used to tokenize the input text and return the input IDs and attention
+masks that can be fed to the model for training or inference.
+This class supports different tune_strategy options such as 'normal', 'none', 'lora', and
+'adapter', which allow for different fine-tuning settings of the model. However, the 'lora'
+and 'adapter' strategies are not yet implemented.
+Overall, this class provides a convenient interface for loading and fine-tuning transformer
+models and can be used for various NLP tasks such as language modeling, text classification,
+and question answering.
+"""
+import logging
+from typing import List, Union
+import deepspeed
+from peft import (
+    LoraConfig,
+    PeftModel,
+    TaskType,
+    get_peft_config,
+    get_peft_model,
+)
+import torch
+import transformers
+from transformers.deepspeed import HfDeepSpeedConfig
+from transformers.testing_utils import CaptureLogger
+from transformers import (
+    CONFIG_MAPPING,
+    AutoConfig,
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    AutoModel,
+)
+from lmflow.datasets.dataset import Dataset
+from lmflow.models.encoder_decoder_model import EncoderDecoderModel
+from lmflow.models.interfaces.tunable import Tunable
+logger = logging.getLogger(__name__)
+class HFEncoderDecoderModel(EncoderDecoderModel, Tunable):
+    r"""
+    Initializes a HFEncoderDecoderModel instance.
+    Parameters
+    ------------
+    model_args :
+        Model arguments such as model name, path, revision, etc.
+    tune_strategy : str or none,  default="normal".
+        A string representing the tuning strategy: "normal", "none", "lora" or "adapter".
+    ds_config :
+        Deepspeed configurations.
+    args : Optional.
+        Positional arguments.
+    kwargs : Optional.
+        Keyword arguments.
+    """
+    def __init__(
+        self,
+        model_args,
+        tune_strategy='normal',
+        ds_config=None,
+        device="gpu",
+        *args,
+        **kwargs
+    ):
+        """
+        Initializes a HFDecoderModel instance.
+        :param model_args: dictionary with model arguments such as model name, path, revision, etc.
+        :param tune_strategy: tuning strategy: normal, none, lora or adapter
+        :param ds_config: deepspeed configuration for distributed training
+        """
+        # See more about loading any type of standard or custom dataset (from
+        # files, python dict, pandas DataFrame, etc) at
+        # https://huggingface.co/docs/datasets/loading_datasets.html.
+        # Load pretrained model and tokenizer
+        #
+        # Distributed training: The .from_pretrained methods guarantee that
+        # only one local process can concurrently download model & vocab.
+        self.device = device
+        if tune_strategy == 'normal':
+            raise NotImplementedError(
+                f"tune_strategy \"{tune_strategy}\" is not supported"
+            )
+        elif tune_strategy == 'none':
+            dschf = HfDeepSpeedConfig(ds_config)
+            peft_model_id = model_args.lora_model_path
+            # NOTE: Currently offload is not supported by llama
+            if "llama" in model_args.model_name_or_path and model_args.use_ram_optimized_load:
+                logger.warning(
+                    "llama does not support RAM optimized load. Automatically"
+                    " use original load instead."
+                )
+                model_args.use_ram_optimized_load = False
+            if model_args.model_name_or_path == 'THUDM/chatglm-6b':
+                self.backend_model = AutoModel.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
+            elif model_args.use_ram_optimized_load and peft_model_id is None:
+                try:
+                    # RAM-optimized load
+                    self.backend_model = AutoModelForSeq2SeqLM.from_pretrained(
+                        model_args.model_name_or_path,
+                        device_map="auto",
+                        offload_folder="offload",
+                        offload_state_dict=True,
+                    )
+                except:
+                    logger.warning(
+                        "Failed to use RAM optimized load. Automatically"
+                        " use original load instead."
+                    )
+                    # Normal load
+                    self.backend_model = AutoModelForSeq2SeqLM.from_pretrained(
+                        model_args.model_name_or_path,
+                    )
+            else:
+                if peft_model_id is not None:
+                    logger.warning(
+                        "LoRA does not support RAM optimized load currently."
+                        " Automatically use original load instead."
+                    )
+                self.backend_model = AutoModelForSeq2SeqLM.from_pretrained(
+                    model_args.model_name_or_path,
+                )
+            self.tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
+            self.backend_model_full = self.backend_model
+            if peft_model_id is not None:
+                self.backend_model = PeftModel.from_pretrained(
+                    self.backend_model, peft_model_id
+                )
+            if device == "gpu":
+                deepspeed.init_distributed()
+                self.ds_engine = deepspeed.initialize(model=self.backend_model, config_params=ds_config)[0]
+                self.ds_engine.module.eval()
+        elif tune_strategy == 'adapter':
+            raise NotImplementedError('adapter tune strategy not implemented')
+    def tokenize(self, dataset, *args, **kwargs):
+        """
+        Tokenize the full dataset.
+        Parameters
+        ------------
+        dataset :
+            Text dataset.
+        args : Optional.
+            Positional arguments.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ------------
+        tokenized_datasets :
+            The tokenized dataset.
+        """
+        raise NotImplementedError('tokenize not implemented')
+    def encode(self, input: Union[str, List[str]], *args, **kwargs ) -> Union[List[int], List[List[int]]]:
+        """
+        Perform encoding process of the tokenizer.
+        Parameters
+        ------------
+        inputs : str or list.
+            The text sequence.
+        args : Optional.
+            Positional arguments.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ------------
+        outputs :
+            The tokenized inputs.
+        """
+        if isinstance(input, list):
+            output = []
+            for single_input in input:
+                single_output = self.encode(single_input, *args, **kwargs)
+                output.append(single_output)
+            return output
+        elif isinstance(input, str):
+            return self.tokenizer.encode(text=input, *args, **kwargs)
+        else:
+            raise NotImplementedError(f'type "{type(input)}" cannot be encoded')
+    def decode(self, input, *args, **kwargs ) -> Union[str, List[str]]:
+        """
+        Perform decoding process of the tokenizer.
+        Parameters
+        ------------
+        inputs : list.
+            The token sequence.
+        args : Optional.
+            Positional arguments.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ------------
+        outputs :
+            The text decoded from the token inputs.
+        """
+        if isinstance(input, list) and input and isinstance(input[0], list):
+            output = []
+            for single_input in input:
+                single_output = self.decode(single_input, *args, **kwargs)
+                output.append(single_output)
+            return output
+        else:
+            # Can be list of ints or a Tensor
+            return self.tokenizer.decode(input, *args, **kwargs)
+    def inference(self, inputs, *args, **kwargs):
+        """
+        Perform generation process of the model.
+        Parameters
+        ------------
+        inputs :
+            The sequence used as a prompt for the generation or as model inputs to the model.
+        args : Optional.
+            Positional arguments.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ------------
+        outputs :
+            The generated sequence output
+        """
+        with torch.no_grad():
+            if self.device == "gpu":
+                outputs = self.ds_engine.module.generate(
+                    input_ids=inputs,
+                    synced_gpus=True,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    *args,
+                    **kwargs
+                )
+            elif self.device == "cpu":
+                outputs = self.backend_model.generate(
+                    input_ids=inputs,
+                    synced_gpus=True,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    *args,
+                    **kwargs
+                )
+            else:
+                raise NotImplementedError(
+                    f"device \"{self.device}\" is not supported"
+                )
+        return outputs
+    def merge_lora_weights(self):
+        if self.model_args.use_lora:
+            self.get_backend_model().merge_and_unload()
+        else:
+            logger.warning("LoRA training is NOT enabled. Merging LoRA weights is not applicable.")
+    def save(self, dir, save_full_model=False, *args, **kwargs):
+        """
+        Perform generation process of the model.
+        Parameters
+        ------------
+        dir :
+            The directory to save model and tokenizer
+        save_full_model : Optional.
+            Whether to save full model.
+        kwargs : Optional.
+            Keyword arguments.
+        Returns
+        ------------
+        outputs :
+            The generated sequence output
+        """
+        self.get_tokenizer().save_pretrained(dir)
+        if save_full_model and self.model_args.use_lora:
+            self.backend_model_full.save_pretrained(dir)
+        else:
+            self.get_backend_model().save_pretrained(dir)
+    def get_max_length(self):
+        """
+        Return max acceptable input length in terms of tokens.
+        The value is read from the tokenizer's `model_max_length` attribute.
+        """
+        return self.tokenizer.model_max_length
+    def get_tokenizer(self):
+        """
+        Return the tokenizer of the model (`self.tokenizer`).
+        """
+        return self.tokenizer
+    def get_backend_model(self):
+        """
+        Return the backend model (`self.backend_model`).
+        """
+        return self.backend_model

lmflow/models/interfaces/__init__.py ADDED Viewed

File without changes

lmflow/models/interfaces/tunable.py ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""Tunable class
+"""
+from abc import ABC
+class Tunable(ABC):
+    """Abstract marker base class; it declares no members of its own."""
+    pass

lmflow/models/regression_model.py ADDED Viewed

	@@ -0,0 +1,11 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""General regression model."""
+from lmflow.models.base_model import BaseModel
+class RegressionModel(BaseModel):
+    """General regression model built on `BaseModel`."""
+    def __init__(self, *args, **kwargs):
+        """Accept and ignore all arguments; no state is initialized here."""
+        pass

lmflow/models/text_regression_model.py ADDED Viewed

	@@ -0,0 +1,57 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+A model maps "text_only" data to float.
+"""
+from lmflow.models.regression_model import RegressionModel
+from lmflow.datasets.dataset import Dataset
+class TextRegressionModel(RegressionModel):
+    r"""
+    Initializes a TextRegressionModel instance.
+    Parameters
+    ------------
+    model_args :
+        Model arguments such as model name, path, revision, etc.
+    args : Optional.
+        Positional arguments.
+    kwargs : Optional.
+        Keyword arguments.
+    """
+    def __init__(
+        self,
+        model_args,
+        *args,
+        **kwargs
+    ):
+        """
+        Initializes a TextRegressionModel instance.
+        :param model_args: dictionary with model arguments such as model name, path, revision, etc.
+        """
+        self.inference_func = None
+    def register_inference_function(self, inference_func):
+        """
+        Registers a regression function.
+        """
+        self.inference_func = inference_func
+    def inference(self, inputs: Dataset):
+        """
+        Gets regression results of a given dataset.
+        :inputs: Dataset object, only accept type "text_only".
+        """
+        if self.inference_func is not None:
+            return self.inference_func(inputs)
+        else:
+            pass

lmflow/pipeline/__init__.py ADDED Viewed

File without changes

lmflow/pipeline/auto_pipeline.py ADDED Viewed

	@@ -0,0 +1,45 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""Return a pipeline automatically based on its name.
+"""
+from lmflow.pipeline.evaluator import Evaluator
+from lmflow.pipeline.finetuner import Finetuner
+from lmflow.pipeline.inferencer import Inferencer
+from lmflow.pipeline.raft_aligner import RaftAligner
+# Maps a pipeline name to the pipeline class that
+# `AutoPipeline.get_pipeline` instantiates for it.
+PIPELINE_MAPPING = {
+    "evaluator": Evaluator,
+    "finetuner": Finetuner,
+    "inferencer": Inferencer,
+    "raft_aligner": RaftAligner,
+}
+class AutoPipeline:
+    """
+    The class designed to return a pipeline automatically based on its name.
+    """
+    @classmethod
+    def get_pipeline(self,
+        pipeline_name,
+        model_args,
+        data_args,
+        pipeline_args,
+        *args,
+        **kwargs
+    ):
+        if pipeline_name not in PIPELINE_MAPPING:
+            raise NotImplementedError(
+                f'Pipeline "{pipeline_name}" is not supported'
+            )
+        pipeline = PIPELINE_MAPPING[pipeline_name](
+            model_args,
+            data_args,
+            pipeline_args,
+            *args,
+            **kwargs
+        )
+        return pipeline

lmflow/pipeline/base_aligner.py ADDED Viewed

	@@ -0,0 +1,21 @@

+#!/usr/bin/env python
+# coding=utf-8
+""" BaseAligner: a subclass of BasePipeline.
+"""
+from lmflow.pipeline.base_pipeline import BasePipeline
+class BaseAligner(BasePipeline):
+    """ A subclass of BasePipeline which is alignable.
+    """
+    def __init__(self, *args, **kwargs):
+        """Accept and ignore all arguments; no state is initialized here."""
+        pass
+    def _check_if_alignable(self, model, dataset, reward_model):
+        """Placeholder for compatibility checks; currently does nothing."""
+        # TODO: check if the model is alignable and dataset is compatible
+        # TODO: add reward_model
+        pass
+    def align(self, model, dataset, reward_model):
+        """Align `model`; this base version always raises NotImplementedError."""
+        raise NotImplementedError(".align is not implemented")

lmflow/pipeline/base_pipeline.py ADDED Viewed

	@@ -0,0 +1,9 @@

+#!/usr/bin/env python
+# coding=utf-8
+""" BasePipeline.
+"""
+from abc import ABC         # abstract class
+class BasePipeline(ABC):
+    """Abstract base class for pipelines; it declares no members."""
+    pass

lmflow/pipeline/base_tuner.py ADDED Viewed

	@@ -0,0 +1,20 @@

+#!/usr/bin/env python
+# coding=utf-8
+""" BaseTuner: a subclass of BasePipeline.
+"""
+from lmflow.pipeline.base_pipeline import BasePipeline
+class BaseTuner(BasePipeline):
+    """ A subclass of BasePipeline which is tunable.
+    """
+    def __init__(self, *args, **kwargs):
+        """Accept and ignore all arguments; no state is initialized here."""
+        pass
+    def _check_if_tunable(self, model, dataset):
+        """Placeholder for compatibility checks; currently does nothing."""
+        # TODO: check if the model is tunable and dataset is compatible
+        pass
+    def tune(self, model, dataset):
+        """Tune `model`; this base version always raises NotImplementedError."""
+        raise NotImplementedError(".tune is not implemented")

lmflow/pipeline/evaluator.py ADDED Viewed

	@@ -0,0 +1,387 @@

+"""The Evaluator class simplifies the process of running evaluation on a language model provided by a HFDecoderModel instance imported from the lmflow package. The class constructor takes three dictionaries as arguments: model_args containing arguments related to the language model, data_args containing arguments related to the data used for evaluation, and evaluator_args containing other arguments for the evaluation process.
+The class has two main public methods: create_dataloader(), which turns a Dataset into mini-batches and returns the dataloader together with the size of the data, and evaluate(), which generates output text given input text. evaluate() uses the create_dataloader() method to load the data, iterates over the data in mini-batches, and encodes the input text with the encode() method of the model. Then, it generates output text using the inference() method of the model, decodes the generated output text using the decode() method of the model, and writes the output to a file in the output directory. The method also logs some information to the console, and to Weights and Biases if the use_wandb argument is True.
+"""
+import os
+# import deepspeed
+import torch
+import wandb
+import deepspeed
+import sys
+import numpy as np
+import datetime
+import json
+# TODO: remove later
+from transformers import AutoConfig
+import torch.distributed as dist
+from lmflow.datasets.dataset import Dataset
+from lmflow.pipeline.base_pipeline import BasePipeline
+from lmflow.models.hf_decoder_model import HFDecoderModel
+from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction
+# Set at import time, so it applies to the whole process.
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # To avoid warnings about parallelism in tokenizers
+class Evaluator(BasePipeline):
+    """
+    Initializes the `Evaluator` class with given arguments.
+    Parameters
+    ------------
+    model_args : ModelArguments object.
+        Contains the arguments required to load the model.
+    data_args : DatasetArguments object.
+        Contains the arguments required to load the dataset.
+    evaluator_args : EvaluatorArguments object.
+        Contains the arguments required to perform evaluation.
+    """
+    def __init__(self, model_args, data_args, evaluator_args):
+    # our method
+        self.data_args = data_args
+        self.evaluator_args = evaluator_args
+        self.model_args = model_args
+        print("--------Begin Evaluator Arguments----------")
+        print(f"model_args : {self.model_args}")
+        print(f"data_args : {self.data_args}")
+        print(f"evaluator_args : {self.evaluator_args}")
+        print("--------End Evaluator Arguments----------")
+        # logger
+        if(self.evaluator_args.use_wandb == True):
+            wandb.init(project="lmflow_evaluation")
+        # random seed
+        set_random_seed(self.evaluator_args.random_seed)
+        self.local_rank = int(os.getenv("LOCAL_RANK", "0"))
+        self.world_size = int(os.getenv("WORLD_SIZE", "1"))
+        torch.cuda.set_device(self.local_rank)  # NOTE: cpu-only machine will have error
+        deepspeed.init_distributed()
+        self.config = AutoConfig.from_pretrained(model_args.model_name_or_path)
+        try:
+            self.model_hidden_size = self.config.hidden_size
+        except:
+            print("Error in setting hidden size, use the default size 1024")
+            self.model_hidden_size = 1024 # gpt2 seems do not have hidden_size in config
+        print(f"model_hidden_size = {self.model_hidden_size}")
+        # batch size has to be divisible by world_size, but can be bigger than world_size
+        train_batch_size = 1 * self.world_size
+        self.evaluator_args.minibatch_size = train_batch_size
+        self.block_size = evaluator_args.evaluate_block_size
+        # dataloader, data_size = create_dataloader(args)    # load dataset
+    def create_dataloader(self, dataset: Dataset):
+        data_dict = dataset.to_dict()
+        inputs = [ instance["input"] for instance in data_dict["instances"] ]
+        outputs = [ instance["output"] for instance in data_dict["instances"] ]
+        dataset_size = len(outputs)
+        dataset_buf = []
+        for idx in range(dataset_size):
+            dataset_buf.append({
+                "input": inputs[idx],
+                "output": outputs[idx],
+                "input_idx": idx
+            })
+        dataloader = batchlize(
+            dataset_buf,
+            self.evaluator_args.minibatch_size,
+            self.evaluator_args.random_shuffle
+        )
+        print(f"Successfully create dataloader with size {len(dataloader)}.")
+        return dataloader, dataset_size
+    # TODO: Split for better unittest
+    def _match(self, predicted_answer, groundtruth, answer_type=None):
+        case_insensitive_types = [
+            "strategyqa",
+            "coin_flip",
+            "pubmedqa",
+            "binary_choice",
+            "medmcqa",
+            "usmle",
+        ]
+        if answer_type in case_insensitive_types:
+            return predicted_answer.lower() == groundtruth.lower()
+        else:
+            return predicted_answer == groundtruth
+        return False
+    def evaluate(self, model, dataset: Dataset, metric = "accuracy"):
+        """
+        Perform Evaluation for a model
+        Parameters
+        ------------
+        model : TunableModel object.
+            TunableModel to perform inference
+        dataset : Dataset object.
+        metric : str.
+            "acc"/"accuracy" runs generation and writes one JSON line per
+            prediction to `<output_dir>/evaluation.json`; "ppl"/"perplexity"
+            and "nll"/"neg_log_likelihood" print the respective score.
+        Returns
+        ------------
+        None. Results are printed (and written to file for accuracy).
+        Raises
+        ------------
+        NotImplementedError :
+            If `metric` is none of the names above.
+        """
+        if metric in ["acc", "accuracy"]:
+            dataloader, data_size = self.create_dataloader(dataset)
+            # Only rank 0 (or a non-distributed run) owns the output file.
+            if not dist.is_initialized() or dist.get_rank() == 0:
+                if not os.path.exists(self.evaluator_args.output_dir):
+                    os.makedirs(self.evaluator_args.output_dir)
+                output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w")
+            acc_list = []
+            total = 0
+            # ds_engine = deepspeed.initialize(model=model.get_model(), config_params=self.ds_config)[0]
+            # ds_engine.module.eval()
+            for batch_index, batch in enumerate(dataloader):
+                # NOTE(review): assumes data_args.max_eval_samples is a number;
+                # a None value would fail this comparison — confirm with callers.
+                if batch_index * self.world_size >= self.data_args.max_eval_samples:
+                    break
+                if self.local_rank >= len(batch):
+                    # Short (last) batch: this rank has no example of its own,
+                    # so it reuses the first one as padding.
+                    current_batch = batch[0]
+                else:
+                    # the batch in current process
+                    current_batch = batch[self.local_rank]
+                prompt_structure = self.evaluator_args.prompt_structure
+                input = prompt_structure.format(input=current_batch['input'])
+                output = current_batch['output']
+                input_idx = current_batch['input_idx']
+                inputs = model.encode(input, return_tensors="pt").to(device=self.local_rank)
+                # with torch.no_grad():
+                    # outputs = ds_engine.module.generate(inputs, synced_gpus=True, pad_token_id=model.get_tokenizer().eos_token_id, min_length=5, max_length=100,temperature=0.0, do_sample=False)
+                outputs = model.inference(inputs, max_new_tokens=100, temperature=0.0)
+                text_out = model.decode(outputs[0], skip_special_tokens=True)
+                # # only return the generation, trucating the input
+                # The prompt is removed by character count of its decoded form.
+                prompt_length = len(model.decode(inputs[0], skip_special_tokens=True,))
+                text_out = text_out[prompt_length:]
+                answer_type = self.evaluator_args.answer_type
+                pred_answer = answer_extraction(
+                    text_out,
+                    answer_type=answer_type,
+                )
+                print(f"batch_index{batch_index} rank{self.local_rank}:\n   question={input}\n  prediction={text_out}\n")
+                print(f"predicted answer: {pred_answer} \n")
+                print(f"groundtruth answer: {output} \n")
+                if self.local_rank >= len(batch): # for last batch, the padding examples are ignored and donot contribute to the accuracy
+                    correct_ = 0
+                    total_ = 0
+                else:
+                    correct_ = 0
+                    total_ = 1
+                    if self._match(pred_answer, output, answer_type):
+                        correct_ = 1
+                # collect accuracy from all gpus
+                # NOTE(review): all_reduce and the dist.get_rank() call in
+                # gather_object below are not guarded by dist.is_initialized(),
+                # unlike the rank-0 checks in this method.
+                all_process = torch.tensor([correct_, total_], dtype=torch.float32, device=self.local_rank)
+                dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False)
+                correct_, total_ = all_process.tolist()
+                # Per-batch accuracy; the reported value is the mean of these.
+                avg = correct_ / total_
+                acc_list.append(avg)
+                total += total_
+                # collect predictions from all gpus
+                output_dict = {"question": input,
+                            "prediction": text_out,
+                            "pred_answer": pred_answer,
+                            "answer": output}
+                all_process_list = [{}] * self.world_size
+                dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0)
+                if not dist.is_initialized() or dist.get_rank() == 0:
+                    current_accuracy = np.mean(acc_list)
+                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy))
+                    if(self.evaluator_args.use_wandb == True):
+                        wandb.log({"Accuracy": current_accuracy})
+                    # `output` is rebound here; it is reassigned from the next
+                    # batch before being read again.
+                    for index, output in enumerate(all_process_list):
+                        output_json = json.dumps(output)
+                        output_writer.write(output_json + '\n')
+            if not dist.is_initialized() or dist.get_rank() == 0:
+                current_accuracy = np.mean(acc_list)
+                print("Final accuracy = ", current_accuracy)
+                output_writer.close()
+        elif metric in ["ppl", "perplexity"]:
+            ppl =  self._evaluate_ppl(model, dataset)
+            print(f"Evaluating final ppl: {ppl}")
+        elif metric in ["nll", "neg_log_likelihood"]:
+            neg_log_likelihood = self._evaluate_neg_log_likelihood(model, dataset)
+            print(f"Evaluating final negative log likelihood: {neg_log_likelihood}")
+        else:
+            raise NotImplementedError(f"{metric} is not implemented or not match with our defined metrics")
+    def _evaluate_ppl(self, model, dataset: Dataset):
+        data_dict = dataset.to_dict()
+        if data_dict['type'] == 'text2text':
+            raise NotImplementedError("ppl evaluation is currently not supported for text2text dataset, please use text_only dataset.")
+        texts = [ instance["text"] for instance in data_dict["instances"] ]
+        encodings = model.get_tokenizer()("\n\n".join(texts), return_tensors="pt")
+        # Define some constant
+        try:
+            max_length = min(model.get_backend_model().config.n_positions, model.get_max_length())
+        except:
+            max_length = min(1024, model.get_max_length())
+        print(f"The maximum sequence length : {max_length}")
+        seq_len = encodings.input_ids.size(1)
+        nlls = []
+        prev_end_loc = 0
+        for begin_loc in range(0, seq_len, self.block_size):
+            end_loc = min(begin_loc + max_length, seq_len)
+            trg_len = end_loc - prev_end_loc  # may be different from block_size on last loop
+            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device=self.local_rank)
+            target_ids = input_ids.clone()
+            target_ids[:, :-trg_len] = -100
+            with torch.no_grad():
+                outputs = model.get_backend_model()(input_ids, labels=target_ids)
+                # loss is calculated using CrossEntropyLoss which averages over valid labels
+                # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
+                # to the left by 1.
+                neg_log_likelihood = outputs.loss
+            nlls.append(neg_log_likelihood)
+            prev_end_loc = end_loc
+            print(f"Evaluating PPL: {int(begin_loc/self.block_size) + 1} / {int(seq_len/self.block_size)} Complete, current ppl : {torch.exp(torch.stack(nlls).mean())}")
+            if end_loc == seq_len:
+                break
+        ppl = torch.exp(torch.stack(nlls).mean())
+        return ppl
+    def _evaluate_neg_log_likelihood(self, model, dataset: Dataset):
+        """
+        Evaluates negative log likelihood of the model over a dataset.
+        NLL = -1/N sum_{i=1}^N sum_{j=1}^|w_i| ln(p(w_{i,j}|context_window)),
+        where N is the number of data samples, w_{i,j} is the j-th token in
+        i-th sample. Here "context_window" = p(w_{i,start}, w_{i,start+1}, ...,
+        p_{i,j-1} with start = max(0, j - window_length + 1). "window_length"
+        is normally the maximum length accepted by the model.
+        Returns:
+            A float which represents the negative log likelihood.
+        """
+        data_dict = dataset.to_dict()
+        # Handles prompt structure
+        if dataset.get_type() == "text2text":
+            prompt = self.evaluator_args.prompt_structure
+            data_dict["instances"] = [
+                {
+                    "input": prompt.format(input=instance["input"]),
+                    "output": instance["output"]
+                }
+                for instance in data_dict["instances"]
+            ]
+        dataset = dataset.from_dict(data_dict)
+        tokenized_dataset = model.tokenize(dataset, add_special_tokens=False)
+        tokenized_dataset = tokenized_dataset.get_backend_dataset()
+        encoding_list = [
+            {
+                "input_ids": torch.tensor([input_ids]),
+                "labels": torch.tensor([labels]),
+            }
+            for input_ids, labels in zip(tokenized_dataset["input_ids"],
+                                         tokenized_dataset["labels"])
+        ]
+        # Gets context window length
+        try:
+            max_length = min(model.get_backend_model().config.n_positions,
+                             model.get_max_length())
+        except:
+            max_length = min(1024, model.get_max_length())
+        nlls = []
+        full_nlls = []
+        num_samples = len(encoding_list)
+        for sample_idx, encodings in enumerate(encoding_list):
+            seq_len = encodings["input_ids"].size(1)
+            prev_end_loc = 0
+            for begin_loc in range(0, seq_len, self.block_size):
+                end_loc = min(begin_loc + max_length, seq_len)
+                # may be different from block_size on last loop
+                trg_len = end_loc - prev_end_loc
+                input_ids = encodings["input_ids"][:, begin_loc:end_loc]
+                input_ids = input_ids.to(device=self.local_rank)
+                labels = encodings["labels"][:, begin_loc:end_loc]
+                target_ids = labels.clone()
+                full_target_ids = input_ids.clone()
+                def get_nll(label_ids, nll_list):
+                    label_ids[:, :-trg_len] = -100
+                    label_ids = label_ids.to(device=self.local_rank)
+                    # Valid labels are from 0 to `vocab_size`
+                    num_valid_labels = torch.count_nonzero(label_ids >= 0)
+                    if label_ids[0, 0] != -100:
+                        num_valid_labels -= 1
+                    if not torch.all(label_ids == -100):
+                        with torch.no_grad():
+                            outputs = model.get_backend_model()(
+                                input_ids, labels=label_ids
+                            )
+                            # loss is calculated using CrossEntropyLoss which
+                            # sums over valid labels N.B. the model only
+                            # calculates loss over trg_len - 1 labels, because
+                            # it internally shifts the labels to the left by 1.
+                            neg_log_likelihood = outputs.loss * num_valid_labels
+                    else:
+                        neg_log_likelihood = torch.zeros([]).to(
+                            device=self.local_rank
+                        )
+                    nll_list.append(neg_log_likelihood)
+                get_nll(target_ids, nlls)
+                get_nll(full_target_ids, full_nlls)
+                current_output_nll = torch.stack(nlls).sum() / (sample_idx + 1)
+                current_full_nll = torch.stack(full_nlls).sum() / (sample_idx + 1)
+                prev_end_loc = end_loc
+                if dataset.get_type() == "text_only":
+                    print(
+                        f"Evaluating negative log likelihood:"
+                        f" {sample_idx + 1} / {num_samples} Complete,"
+                        f" current nll: {current_full_nll}"
+                    )
+                elif dataset.get_type() == "text2text":
+                    print(
+                        f"Evaluating negative log likelihood:"
+                        f" {sample_idx + 1} / {num_samples} Complete,"
+                        f" current full nll / input nll / output nll:"
+                        f" {current_full_nll} /"
+                        f" {current_full_nll - current_output_nll} /"
+                        f" {current_output_nll}"
+                    )
+                else:
+                    raise NotImplementedError(
+                        "f{dataset.get_type()} typed datasets are not supported"
+                    )
+                if end_loc == seq_len:
+                    break
+        mean_nll = torch.stack(nlls).sum() / num_samples
+        return mean_nll

lmflow/pipeline/finetuner.py ADDED Viewed

	@@ -0,0 +1,273 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""The Finetuner class simplifies the process of running finetuning process on a language model for a TunableModel instance with given dataset.
+"""
+import logging
+import os
+import sys
+import datasets
+import transformers
+from itertools import chain
+from transformers import (
+    Trainer,
+    default_data_collator,
+    set_seed,
+)
+from transformers.utils import send_example_telemetry
+from lmflow.datasets.dataset import Dataset
+from lmflow.pipeline.base_tuner import BaseTuner
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+class Finetuner(BaseTuner):
+    """
+    Initializes the `Finetuner` class with given arguments.
+    Parameters
+    ------------
+    model_args : ModelArguments object.
+        Contains the arguments required to load the model.
+    data_args : DatasetArguments object.
+        Contains the arguments required to load the dataset.
+    finetuner_args : FinetunerArguments object.
+        Contains the arguments required to perform finetuning.
+    args : Optional.
+        Positional arguments.
+    kwargs : Optional.
+        Keyword arguments.
+    """
+    def __init__(self, model_args, data_args, finetuner_args, *args, **kwargs):
+        self.model_args = model_args
+        self.data_args = data_args
+        self.finetuner_args = finetuner_args
+        # Sending telemetry. Tracking the example usage helps us better
+        # allocate resources to maintain them. The information sent is the one
+        # passed as arguments along with your Python/PyTorch versions.
+        send_example_telemetry("run_clm", model_args, data_args)
+        # Setup logging
+        logging.basicConfig(
+            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+            datefmt="%m/%d/%Y %H:%M:%S",
+            handlers=[logging.StreamHandler(sys.stdout)],
+        )
+        log_level = finetuner_args.get_process_log_level()
+        logger.setLevel(log_level)
+        datasets.utils.logging.set_verbosity(log_level)
+        transformers.utils.logging.set_verbosity(log_level)
+        transformers.utils.logging.enable_default_handler()
+        transformers.utils.logging.enable_explicit_format()
+        # Log on each process the small summary:
+        logger.warning(
+            f"Process rank: {finetuner_args.local_rank},"
+            f" device: {finetuner_args.device},"
+            f" n_gpu: {finetuner_args.n_gpu}"
+            f"distributed training: {bool(finetuner_args.local_rank != -1)},"
+            f" 16-bits training: {finetuner_args.fp16}"
+        )
+        logger.info(f"Training/evaluation parameters {finetuner_args}")
+        # Detecting last checkpoint.
+        last_checkpoint = None
+        if os.path.isdir(finetuner_args.output_dir) and finetuner_args.do_train and not finetuner_args.overwrite_output_dir:
+            last_checkpoint = get_last_checkpoint(finetuner_args.output_dir)
+            if last_checkpoint is None and len(os.listdir(finetuner_args.output_dir)) > 0:
+                raise ValueError(
+                    f"Output directory ({finetuner_args.output_dir}) already"
+                    " exists and is not empty. "
+                    "Use --overwrite_output_dir to overcome."
+                )
+            elif last_checkpoint is not None and finetuner_args.resume_from_checkpoint is None:
+                logger.info(
+                    f"Checkpoint detected, resuming training at"
+                    f" {last_checkpoint}. To avoid this behavior, change"
+                    " the `--output_dir` or add `--overwrite_output_dir` to"
+                    " train from scratch."
+                )
+        self.last_checkpoint = last_checkpoint
+        # Set seed before initializing model.
+        set_seed(finetuner_args.seed)
+    def group_text(self, tokenized_datasets, model_max_length):
+        """
+        Groups texts together to form blocks of maximum length `model_max_length` and returns the processed data as
+        a dictionary.
+        """
+        data_args = self.data_args
+        finetuner_args = self.finetuner_args
+        if data_args.block_size is None:
+            block_size = model_max_length
+            if block_size > 1024:
+                logger.warning(
+	    			"The chosen tokenizer supports a `model_max_length` that is"
+	    			" longer than the default `block_size` value"
+	    			" of 1024. If you would like to use a longer `block_size`"
+	    			" up to `tokenizer.model_max_length` you can override this "
+	    			" default with `--block_size xxx`."
+                )
+                block_size = 1024
+        else:
+            if data_args.block_size > model_max_length:
+                logger.warning(
+                    f"The block_size passed ({data_args.block_size}) is larger"
+	    			f" than the maximum length for the model"
+                    f"({model_max_length})."
+                    f" Using block_size={model_max_length}."
+                )
+            block_size = min(data_args.block_size, model_max_length)
+        # Main data processing function that will concatenate all texts from
+        # our dataset and generate chunks of block_size.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model
+            # supported it instead of this drop, you can customize this part to
+            # your needs.
+            total_length = (total_length // block_size) * block_size
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+        # Note that with `batched=True`, this map processes 1,000 texts
+        # together, so group_texts throws away a remainder for each of those
+        # groups of 1,000 texts. You can adjust that batch_size here but a
+        # higher value might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation
+        # of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        with finetuner_args.main_process_first(desc="grouping texts together"):
+            group_batch_size = 1000
+            if data_args.disable_group_texts:
+                group_batch_size = 1
+            if not data_args.streaming:
+                lm_datasets = tokenized_datasets.map(
+                    group_texts,
+                    batched=True,
+                    batch_size=group_batch_size,
+                    num_proc=data_args.preprocessing_num_workers,
+                    load_from_cache_file=not data_args.overwrite_cache,
+                    desc=f"Grouping texts in chunks of {block_size}",
+                )
+            else:
+                lm_datasets = tokenized_datasets.map(
+                    group_texts,
+                    batched=True,
+                    batch_size=group_batch_size,
+                )
+        return lm_datasets
+    def tune(self, model, dataset):
+        """
+        Perform tuning for a model
+        Parameters
+        ------------
+        model : TunableModel object.
+            TunableModel to perform tuning.
+        dataset:
+            dataset to train model.
+        """
+        model_args = self.model_args
+        data_args = self.data_args
+        finetuner_args = self.finetuner_args
+        # Tokenization and text grouping must be done in the main process
+        with finetuner_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_dataset = model.tokenize(dataset)
+            lm_dataset = self.group_text(
+                tokenized_dataset,
+                model_max_length=model.get_max_length(),
+            )
+        train_dataset = lm_dataset.get_backend_dataset()
+        if finetuner_args.do_train:
+            if data_args.max_train_samples is not None:
+                max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+                train_dataset = train_dataset.select(range(max_train_samples))
+        # Initialize our Trainer
+        training_args = finetuner_args
+        trainer = Trainer(
+            model=model.get_backend_model(),
+            args=training_args,
+            train_dataset=train_dataset if training_args.do_train else None,
+            eval_dataset=None,
+            tokenizer=model.get_tokenizer(),
+            # Data collator will default to DataCollatorWithPadding, so we change it.
+            data_collator=default_data_collator,
+            compute_metrics=None,
+            preprocess_logits_for_metrics=None,
+        )
+        # Training
+        if training_args.do_train:
+            checkpoint = None
+            last_checkpoint = self.last_checkpoint
+            if training_args.resume_from_checkpoint is not None:
+                checkpoint = training_args.resume_from_checkpoint
+            elif last_checkpoint is not None:
+                checkpoint = last_checkpoint
+            train_result = trainer.train(resume_from_checkpoint=checkpoint)
+            if not model_args.use_lora:
+                trainer.save_model()  # Saves the tokenizer too for easy upload
+            else:
+                if model_args.save_aggregated_lora:
+                    model.merge_lora_weights()
+                model.save(finetuner_args.output_dir,model_args.save_aggregated_lora)
+            metrics = train_result.metrics
+            max_train_samples = (
+                data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+            )
+            metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+            trainer.log_metrics("train", metrics)
+            trainer.save_metrics("train", metrics)
+            trainer.save_state()
+        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
+        if data_args.dataset_name is not None:
+            kwargs["dataset_tags"] = data_args.dataset_name
+            if data_args.dataset_config_name is not None:
+                kwargs["dataset_args"] = data_args.dataset_config_name
+                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            else:
+                kwargs["dataset"] = data_args.dataset_name
+        if training_args.push_to_hub:
+            trainer.push_to_hub(**kwargs)
+        else:
+            trainer.create_model_card(**kwargs)
+        return model

lmflow/pipeline/inferencer.py ADDED Viewed

	@@ -0,0 +1,194 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""The Inferencer class simplifies the process of model inferencing."""
+import os
+import torch
+import wandb
+import deepspeed
+import sys
+import numpy as np
+import datetime
+import json
+from transformers import AutoConfig
+import torch.distributed as dist
+from lmflow.args import DatasetArguments
+from lmflow.datasets.dataset import Dataset
+from lmflow.pipeline.base_pipeline import BasePipeline
+from lmflow.models.hf_decoder_model import HFDecoderModel
+from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # To avoid warnings about parallelism in tokenizers
+def rstrip_partial_utf8(string):
+    return string.replace("\ufffd", "")
+class Inferencer(BasePipeline):
+    """
+    Initializes the `Inferencer` class with given arguments.
+    Parameters
+    ------------
+    model_args : ModelArguments object.
+        Contains the arguments required to load the model.
+    data_args : DatasetArguments object.
+        Contains the arguments required to load the dataset.
+    inferencer_args : InferencerArguments object.
+        Contains the arguments required to perform inference.
+    """
+    def __init__(self, model_args, data_args, inferencer_args):
+        self.data_args = data_args
+        self.inferencer_args = inferencer_args
+        self.model_args = model_args
+        set_random_seed(self.inferencer_args.random_seed)
+        self.local_rank = int(os.getenv("LOCAL_RANK", "0"))
+        self.world_size = int(os.getenv("WORLD_SIZE", "1"))
+        if inferencer_args.device == "gpu":
+            torch.cuda.set_device(self.local_rank)  # NOTE: cpu-only machine will have error
+            deepspeed.init_distributed()
+        else:
+            os.environ["MASTER_ADDR"] = "localhost"
+            os.environ["MASTER_PORT"] = "15000"
+            dist.init_process_group(
+                "gloo", rank=self.local_rank, world_size=self.world_size
+            )
+        self.config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
+        try:
+            self.model_hidden_size = self.config.hidden_size
+        except:
+            print("Error in setting hidden size, use the default size 1024")
+            self.model_hidden_size = 1024 # gpt2 seems do not have hidden_size in config
+    def create_dataloader(self, dataset: Dataset):
+        data_dict = dataset.to_dict()
+        inputs = [ instance["text"] for instance in data_dict["instances"] ]
+        dataset_size = len(inputs)
+        dataset_buf = []
+        for idx in range(dataset_size):
+            dataset_buf.append({
+                "input": inputs[idx],
+                "input_idx": idx
+            })
+        dataloader = batchlize(
+            dataset_buf,
+            batch_size=1,
+            random_shuffle=False,
+        )
+        return dataloader, dataset_size
+    def inference(
+        self,
+        model,
+        dataset: Dataset,
+        max_new_tokens: int=100,
+        temperature: float=0.0,
+        prompt_structure: str='{input}',
+    ):
+        """
+        Perform inference for a model
+        Parameters
+        ------------
+        model : TunableModel object.
+            TunableModel to perform inference
+        dataset : Dataset object.
+        Returns:
+        output_dataset: Dataset object.
+        """
+        if dataset.get_type() != "text_only":
+            raise NotImplementedError(
+                'input dataset should have type "text_only"'
+            )
+        dataloader, data_size = self.create_dataloader(dataset)
+        # The output dataset
+        output_dict = {
+            "type": "text_only",
+            "instances": [
+            ]
+        }
+        for batch_index, batch in enumerate(dataloader):
+            current_batch = batch[0]        # batch size is 1
+            input = prompt_structure.format(input=current_batch['input'])
+            if self.inferencer_args.device == "gpu":
+                inputs = model.encode(input, return_tensors="pt").to(device=self.local_rank)
+            elif self.inferencer_args.device == "cpu":
+                inputs = model.encode(input, return_tensors="pt").to(device='cpu')
+            else:
+                raise NotImplementedError(
+                    f"device \"{self.inferencer_args.device}\" is not supported"
+                )
+            outputs = model.inference(
+                inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                repetition_penalty=1.0,
+            )
+            text_out = model.decode(outputs[0], skip_special_tokens=True)
+            # only return the generation, trucating the input
+            prompt_length = len(model.decode(inputs[0], skip_special_tokens=True,))
+            text_out = text_out[prompt_length:]
+            output_dict["instances"].append({ "text": text_out })
+        output_dataset = Dataset(DatasetArguments(dataset_path = None))
+        output_dataset = output_dataset.from_dict(output_dict)
+        return output_dataset
+    def stream_inference(self, context, model, max_new_tokens, token_per_step, temperature, end_string, input_dataset):
+        response = ""
+        history = []
+        if "ChatGLMModel" in self.config.architectures:
+            for response, history in model.get_backend_model().stream_chat(model.get_tokenizer(), context, history=history):
+                response = rstrip_partial_utf8(response)
+                yield response, False
+        else:
+            for _ in range(0, max_new_tokens // token_per_step):
+                output_dataset = self.inference(
+                    model=model,
+                    dataset=input_dataset,
+                    max_new_tokens=token_per_step,
+                    temperature=temperature,
+                )
+                new_append_text = output_dataset.to_dict()["instances"][0]["text"]
+                new_append_text = rstrip_partial_utf8(new_append_text)
+                response += new_append_text
+                input_dict = input_dataset.to_dict()
+                input_dict["instances"][0]["text"] += new_append_text
+                input_dataset = input_dataset.from_dict(input_dict)
+                flag_break = False
+                try:
+                    index = response.index(end_string)
+                    flag_break = True
+                except ValueError:
+                    response += end_string
+                    index = response.index(end_string)
+                response = response[:index]
+                yield response, flag_break

lmflow/pipeline/raft_aligner.py ADDED Viewed

	@@ -0,0 +1,456 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+The Aligner class simplifies the process of running alignment.
+"""
+import logging
+import numpy as np
+import os
+import sys
+import time
+from itertools import chain
+import torch
+import torch.distributed as dist
+import transformers
+from datasets import (
+    set_caching_enabled,
+    Dataset,
+    DatasetDict,
+)
+from transformers import (
+    default_data_collator,
+    pipeline,
+    set_seed,
+)
+from transformers.testing_utils import CaptureLogger
+from lmflow.args import DatasetArguments
+from lmflow.datasets.dataset import Dataset as LMFlowDataset
+from lmflow.pipeline.base_aligner import BaseAligner
+from lmflow.pipeline.utils.raft_trainer import RaftTrainer
+logger = logging.getLogger(__name__)
+class RaftAligner(BaseAligner):
+    """
+    Initializes the `RaftAligner` class with given arguments.
+    Parameters
+    ------------
+    model_args : ModelArguments object.
+        Contains the arguments required to load the model.
+    data_args : DatasetArguments object.
+        Contains the arguments required to load the dataset.
+    raft_aligner_args : RaftAlignerArguments object.
+        Contains the arguments required to perform alignment.
+    args : Optional.
+        Positional arguments.
+    kwargs : Optional.
+        Keyword arguments.
+    """
+    def __init__(self, model_args, data_args, aligner_args, *args, **kwargs):
+        self.model_args = model_args
+        self.data_args = data_args
+        self.aligner_args = aligner_args
+        logging.basicConfig(
+            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+            datefmt="%m/%d/%Y %H:%M:%S",
+            handlers=[logging.StreamHandler(sys.stdout)],
+        )
+        logger.setLevel(logging.INFO)
+        output_reward_path = aligner_args.output_reward_path
+        if output_reward_path is not None:
+            os.makedirs(os.path.dirname(output_reward_path), exist_ok=True)
+            # Deletes a maybe-exist file
+            try:
+                os.remove(output_reward_path)
+            except OSError:
+                pass
+    def _initialize_trainer(self, model, tokenizer, training_args):
+        """
+        This function takes the model and tokenizer as the input and initialize the trainer.
+        """
+        trainer = RaftTrainer(
+            model=model,
+            args=training_args,
+            train_dataset=Dataset.from_dict({"text": [ " " ] }),
+            eval_dataset=Dataset.from_dict({}),
+            tokenizer=tokenizer,
+            data_collator=default_data_collator,
+            compute_metrics=None,
+            preprocess_logits_for_metrics=None,
+        )
+        return trainer
+    def _load_dataset(
+        self,
+        selected_dataset,
+        model,
+        tokenizer,
+        model_args,
+        data_args,
+        training_args,
+    ):
+        '''
+        This function prepares the dataset for every iteration.
+        '''
+        raw_datasets = selected_dataset
+        if training_args.do_train:
+            column_names = list(raw_datasets["train"].features)
+        else:
+            column_names = list(raw_datasets["validation"].features)
+        text_column_name = "text" if "text" in column_names else column_names[0]
+        # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
+        tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
+        def tokenize_function(examples):
+            with CaptureLogger(tok_logger) as cl:
+                output = tokenizer(examples[text_column_name])
+            # clm input could be much much longer than block_size
+            if "Token indices sequence length is longer than the" in cl.out:
+                tok_logger.warning(
+                    "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
+                    " before being passed to the model."
+                )
+            return output
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            if not data_args.streaming:
+                tokenized_datasets = raw_datasets.map(
+                    tokenize_function,
+                    batched=True,
+                    num_proc=data_args.preprocessing_num_workers,
+                    remove_columns=column_names,
+                    load_from_cache_file=not data_args.overwrite_cache,
+                    desc="Running tokenizer on dataset",
+                )
+            else:
+                tokenized_datasets = raw_datasets.map(
+                    tokenize_function,
+                    batched=True,
+                    remove_columns=column_names,
+                )
+        if data_args.block_size is None:
+            block_size = tokenizer.model_max_length
+            if block_size > 1024:
+                logger.warning(
+                    "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
+                    " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
+                    " override this default with `--block_size xxx`."
+                )
+                block_size = 512
+        else:
+            if data_args.block_size > tokenizer.model_max_length:
+                logger.warning(
+                    f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
+                    f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
+                )
+            block_size = min(data_args.block_size, tokenizer.model_max_length)
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            if total_length >= block_size:
+                total_length = (total_length // block_size) * block_size
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+                for k, t in concatenated_examples.items()
+            }
+            result["labels"] = result["input_ids"].copy()
+            return result
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
+        # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
+        # to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        with training_args.main_process_first(desc="grouping texts together"):
+            group_batch_size = 1000
+            if data_args.disable_group_texts:
+                group_batch_size = 1
+            if not data_args.streaming:
+                lm_datasets = tokenized_datasets.map(
+                    group_texts,
+                    batched=True,
+                    batch_size=group_batch_size,
+                    num_proc=data_args.preprocessing_num_workers,
+                    load_from_cache_file=not data_args.overwrite_cache,
+                    desc=f"Grouping texts in chunks of {block_size}",
+                )
+            else:
+                lm_datasets = tokenized_datasets.map(
+                    group_texts,
+                    batched=True,
+                    batch_size=group_batch_size,
+                )
+        if training_args.do_train:
+            if "train" not in tokenized_datasets:
+                raise ValueError("--do_train requires a train dataset")
+            train_dataset = lm_datasets["train"]
+            if data_args.max_train_samples is not None:
+                max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+                train_dataset = train_dataset.select(range(max_train_samples))
+        return train_dataset
+    def _load_input_dataset(self, dataset, tokenizer):
+        """
+        Load input dataset (i.e. prompt/question dataset) for training.
+        Args:
+            dataset: A Dataset object.
+                The dataset to be loaded.
+        Returns:
+            dataloader (`torch.utils.data.DataLoader`):
+                The dataloader for the dataset.
+        """
+        ds = dataset.get_backend_dataset()
+        def tokenize(sample):
+            input_size = 16
+            review_encode = tokenizer.encode(sample["text"])
+            sample["input_ids"] = review_encode[:input_size]
+            sample['input'] = tokenizer.decode(sample["input_ids"])
+            return sample
+        ds = ds.map(tokenize, batched=False)
+        ds.set_format(type='torch')
+        return ds
+    def _get_batch_dataset_top(
+        self,
+        model,
+        batch_input,
+        alpha=0.2,
+        iter_id=0,
+        local_rank=0,
+        output_min_length=16,
+        output_max_length=48,
+        infer_batch_size=8,
+        generation_kwargs={},
+        tokenizer=None,
+        training_args=None,
+        reward_model=None,
+        output_reward_path=None,
+    ):
+        """
+        :param batch_input: input prompts
+        """
+        # we will get the batch dataset via Dataset.from_dict
+        start_time = time.time()
+        output_data = []
+        query_tensors = batch_input['input_ids']
+        querys = batch_input['input']
+        data_size = len(querys)
+        cnt = 0
+        reward_eva = []
+        reward_train = []
+        out_put_dataset_eval = {}
+        data_eval = []
+        input_texts = []
+        responses = []
+        for i, query_tensor in enumerate(query_tensors):
+            query = querys[i]
+            input_texts.append(query)
+            if (i + 1) % infer_batch_size == 0:
+                gen_len = np.random.randint(output_min_length, output_max_length)
+                generation_kwargs["max_new_tokens"] = gen_len
+                inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(training_args.device)
+                with torch.no_grad():
+                    outputs = model.generate(**inputs, **generation_kwargs)
+                generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+                generated_texts = [
+                    generated_text.replace(input_texts[i], "") for i, generated_text in enumerate(generated_texts)
+                ]
+                texts_for_rewards = [q + r for q, r in zip(input_texts, generated_texts)]
+                texts_for_reward_dataset = LMFlowDataset.create_from_dict({
+                    "type": "text_only",
+                    "instances": [
+                        { "text": text } for text in texts_for_rewards
+                    ],
+                })
+                reward_dataset = reward_model.inference(texts_for_reward_dataset)
+                rewards = [ sample["value"] for sample in reward_dataset.to_dict()["instances"] ]
+                reward_eva.extend(rewards)
+                responses.extend(generated_texts)
+                input_texts = []
+        data = []
+        idx = np.argsort(reward_eva)[::-1][:int(data_size * alpha)]
+        for j in range(len(reward_eva)):
+            sample = {}
+            sample["input"] = querys[j]
+            sample["output"] = [responses[j]]
+            data.append(sample)
+        output_data = [data[j] for j in idx]
+        logger.info(f"collected data of {len(output_data)}")
+        world_size = int(os.getenv("WORLD_SIZE", "1"))
+        all_process_list =[{}] * world_size
+        dist.all_gather_object(all_process_list, output_data)
+        gathered_data = []
+        for i in range(world_size):
+            gathered_data.extend(all_process_list[i])
+        reward_train = [reward_eva[j] for j in idx]
+        reward_to_send = [np.mean(reward_eva), np.mean(reward_train)]
+        all_process_rewards = [{}] * world_size
+        dist.all_gather_object(all_process_rewards, reward_to_send)
+        logger.info(all_process_rewards)
+        if training_args.local_rank == 0 and output_reward_path is not None:
+            with open(output_reward_path, mode='a') as fout:
+                fout.write('mean reward: ' + str(np.mean([all_process_rewards[i][0] for i in range(world_size)])) + 'mean reward in training set: ' + str([all_process_rewards[i][1] for i in range(world_size)]))
+                fout.write("\n")
+        prompt_structure = "{definition}{input}{output}"
+        output_dataset = {
+            "text": [ prompt_structure.format(
+                          definition="", input=sample["input"], output=sample["output"][0]
+                      ) for sample in gathered_data
+            ]
+        }
+        return DatasetDict({ "train": Dataset.from_dict(output_dataset) })
+    def align(self, model, dataset, reward_model):
+        """
+        Perform alignment for a model
+        Parameters
+        ------------
+        model : BaseModel object.
+        dataset: Dataset object.
+            Input dataset for model to generate outputs. The input and output
+                will then be feed into reward model to get the reward for
+                alignment.
+        reward_model: RegressionModel object.
+        """
+        tokenizer = model.get_tokenizer()
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+        tokenizer.padding_side = "left"
+        dataset = self._load_input_dataset(dataset, tokenizer)
+        set_caching_enabled(False)
+        wrapped_model = model
+        model = model.get_backend_model()
+        generation_kwargs = {
+            "min_length": -1,
+            "top_k": 0.0,
+            "top_p": 1.0,
+            "do_sample": True,
+            "pad_token_id": tokenizer.eos_token_id,
+            "temperature":0.7
+        }
+        aligner_args = self.aligner_args
+        training_args = aligner_args
+        model_args = self.model_args
+        data_args = self.data_args
+        set_seed(42 + training_args.local_rank)
+        ITERATION = aligner_args.num_raft_iteration
+        M = aligner_args.raft_batch_size
+        alpha = aligner_args.top_reward_percentage
+        data_size = len(dataset['input'])
+        reward_seq = []
+        lr = training_args.learning_rate
+        raft_trainer = self._initialize_trainer(model, tokenizer, training_args)
+        raft_trainer.train(resume_from_checkpoint=False, is_first_time=True)
+        ##############
+        for iteration in range(ITERATION):
+            set_seed(88 + training_args.local_rank + 4 * (iteration+1))
+            batch_input = dataset.select(np.random.randint(low=0, high=data_size, size=M))
+            selected_dataset = self._get_batch_dataset_top(
+                raft_trainer.tmp_model,
+                batch_input,
+                alpha,
+                iteration,
+                training_args.local_rank,
+                output_min_length=aligner_args.output_min_length,
+                output_max_length=aligner_args.output_max_length,
+                infer_batch_size=aligner_args.inference_batch_size_per_device,
+                generation_kwargs=generation_kwargs,
+                tokenizer=tokenizer,
+                training_args=training_args,
+                reward_model=reward_model,
+                output_reward_path=aligner_args.output_reward_path,
+            )
+            raft_trainer.train_dataset = self._load_dataset(
+                selected_dataset,
+                raft_trainer.tmp_model,
+                tokenizer,
+                model_args,
+                data_args,
+                training_args,
+            )
+            logger.info(f"iter {iteration}")
+            start_time = time.time()
+            train_result = raft_trainer.train(resume_from_checkpoint=False)
+            end_time = time.time()
+            logger.info("It takes %.2f s to train one stage", end_time - start_time)
+        self._get_batch_dataset_top(
+            raft_trainer.tmp_model,
+            batch_input, alpha,
+            iteration,
+            training_args.local_rank,
+            output_min_length=aligner_args.output_min_length,
+            output_max_length=aligner_args.output_max_length,
+            infer_batch_size=aligner_args.inference_batch_size_per_device,
+            generation_kwargs=generation_kwargs,
+            tokenizer=tokenizer,
+            training_args=training_args,
+            reward_model=reward_model,
+            output_reward_path=aligner_args.output_reward_path,
+        )
+        if aligner_args.output_dir is not None:
+            wrapped_model.save(aligner_args.output_dir)
+        return wrapped_model

lmflow/pipeline/utils/__init__.py ADDED Viewed

File without changes

lmflow/pipeline/utils/raft_trainer.py ADDED Viewed

The diff for this file is too large to render. See raw diff

lmflow/utils/__init__.py ADDED Viewed

File without changes

lmflow/utils/constants.py ADDED Viewed

	@@ -0,0 +1,141 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+Commonly used constants.
+"""
+# Short description of the "text_only" dataset format. The literal's leading
+# newline is stripped so the text starts at its first line.
+TEXT_ONLY_DATASET_DESCRIPTION = (
+"""
+"text_only": a dataset with only raw text instances, with following format:
+    {
+        "type": "text_only",
+        "instances": [
+            { "text": "TEXT_1" },
+            { "text": "TEXT_2" },
+            ...
+        ]
+    }
+"""
+).lstrip("\n")
+TEXT_ONLY_DATASET_DETAILS = (
+"""
+    For example,
+    ```python
+    from lmflow.datasets import Dataset
+    data_dict = {
+        "type": "text_only",
+        "instances": [
+            { "text": "Human: Hello. Bot: Hi!" },
+            { "text": "Human: How are you today? Bot: Fine, thank you!" },
+        ]
+    }
+    dataset = Dataset.create_from_dict(data_dict)
+    ```
+    You may also save the corresponding format to json,
+    ```python
+    import json
+    from lmflow.args import DatasetArguments
+    from lmflow.datasets import Dataset
+    data_dict = {
+        "type": "text_only",
+        "instances": [
+            { "text": "Human: Hello. Bot: Hi!" },
+            { "text": "Human: How are you today? Bot: Fine, thank you!" },
+        ]
+    }
+    with open("data.json", "w") as fout:
+        json.dump(data_dict, fout)
+    data_args = DatasetArgument(dataset_path="data.json")
+    dataset = Dataset(data_args)
+    new_data_dict = dataset.to_dict()
+    # `new_data_dict` Should have the same content as `data_dict`
+    ```
+"""
+).lstrip("\n")
+# Short description of the "text2text" dataset format. The literal's leading
+# newline is stripped so the text starts at its first line.
+TEXT2TEXT_DATASET_DESCRIPTION = (
+"""
+"text2text": a dataset with input & output instances, with following format:
+    {
+        "type": "text2text",
+        "instances": [
+            { "input": "INPUT_1", "output": "OUTPUT_1" },
+            { "input": "INPUT_2", "output": "OUTPUT_2" },
+            ...
+        ]
+    }
+"""
+).lstrip("\n")
+TEXT2TEXT_DATASET_DETAILS = (
+"""
+    For example,
+    ```python
+    from lmflow.datasets import Dataset
+    data_dict = {
+        "type": "text2text",
+        "instances": [
+            {
+                "input": "Human: Hello.",
+                "output": "Bot: Hi!",
+            },
+            {
+                "input": "Human: How are you today?",
+                "output": "Bot: Fine, thank you! And you?",
+            }
+        ]
+    }
+    dataset = Dataset.create_from_dict(data_dict)
+    ```
+    You may also save the corresponding format to json,
+    ```python
+    import json
+    from lmflow.args import DatasetArguments
+    from lmflow.datasets import Dataset
+    data_dict = {
+        "type": "text2text",
+        "instances": [
+            {
+                "input": "Human: Hello.",
+                "output": "Bot: Hi!",
+            },
+            {
+                "input": "Human: How are you today?",
+                "output": "Bot: Fine, thank you! And you?",
+            }
+        ]
+    }
+    with open("data.json", "w") as fout:
+        json.dump(data_dict, fout)
+    data_args = DatasetArgument(dataset_path="data.json")
+    dataset = Dataset(data_args)
+    new_data_dict = dataset.to_dict()
+    # `new_data_dict` Should have the same content as `data_dict`
+    ```
+"""
+).lstrip("\n")
+TEXT_ONLY_DATASET_LONG_DESCRITION = (
+    TEXT_ONLY_DATASET_DESCRIPTION + TEXT_ONLY_DATASET_DETAILS
+)
+TEXT2TEXT_DATASET_LONG_DESCRITION = (
+    TEXT2TEXT_DATASET_DESCRIPTION + TEXT2TEXT_DATASET_DETAILS
+)

lmflow/utils/data_utils.py ADDED Viewed

	@@ -0,0 +1,212 @@

+"""The program includes several functions: setting a random seed,
+loading data from a JSON file, batching data, and extracting answers from generated text.
+"""
+import random
+import numpy as np
+import torch
+import json
+import re
+def set_random_seed(seed: int):
+    """
+    Set the random seed for `random`, `numpy`, `torch`, `torch.cuda`.
+    Parameters
+    ------------
+    seed : int
+        The default seed.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+def load_data(file_name: str):
+    """
+    Load data with file name.
+    Parameters
+    ------------
+    file_name : str.
+        The dataset file name.
+    Returns
+    ------------
+    inputs : list.
+        The input texts of the dataset.
+    outputs : list.
+        The output texts file datasets.
+    len : int.
+        The length of the dataset.
+    """
+    inputs = []
+    outputs = []
+    type = ""
+    with open(file_name, encoding='utf-8') as f:
+        json_data = json.load(f)
+        type = json_data["type"]
+        for line in json_data["instances"]:
+            inputs.append(line["input"])
+            outputs.append(line["output"])
+    print(f"load dataset {file_name} success.\n")
+    print(f"Type : {type}, datasize : {len(outputs)}")
+    return inputs, outputs, len(outputs)
+def batchlize(examples: list, batch_size: int, random_shuffle: bool):
+    """
+    Convert examples to a dataloader.
+    Parameters
+    ------------
+    examples : list.
+        Data list.
+    batch_size : int.
+    random_shuffle : bool
+        If true, the dataloader shuffle the training data.
+    Returns
+    ------------
+    dataloader:
+        Dataloader with batch generator.
+    """
+    size = 0
+    dataloader = []
+    length = len(examples)
+    if (random_shuffle):
+        random.shuffle(examples)
+    while size < length:
+        if length - size > batch_size:
+            dataloader.append(examples[size : size+batch_size])
+            size += batch_size
+        else:
+            dataloader.append(examples[size : size+(length-size)])
+            size += (length - size)
+    return dataloader
+def answer_extraction(response, answer_type=None):   #use this funtion to extract answers from generated text
+    """
+    Use this funtion to extract answers from generated text
+    Parameters
+    ------------
+    args :
+        Arguments.
+    response : str
+        plain string response.
+    Returns
+    ------------
+    answer:
+        Decoded answer (such as A, B, C, D, E for mutiple-choice QA).
+    """
+    # temp = response["generated_text"]
+    temp = response
+    if answer_type in ("gsm8k", "svamp", "asdiv", "addsub", "singleeq", "multiarith", "math"):
+        temp = temp.replace(",", "")
+        temp = [s for s in re.findall(r'-?\d+\.?\d*', temp)]
+    elif answer_type in ("aqua", "csqa", "multiple_choice"):
+        temp = re.findall(r'A|B|C|D|E', temp)
+    elif answer_type in ("strategyqa", "coin_flip"):
+        temp = temp.lower()
+        temp = re.sub("\"|\'|\n|\.|\s|\:|\,"," ", temp)
+        temp = temp.split(" ")
+        temp = [i for i in temp if i in ("yes", "no")]
+    elif answer_type in ("last_letters"):
+        temp = re.sub("\"|\'|\n|\.|\s","", temp)
+        temp = [temp]
+    elif answer_type in ("pubmedqa", "binary_choice"):
+        # pattern = "Output: (yes|no|maybe)"
+        # sttr = re.search(pattern, temp)
+        # answer = sttr.group(0)[8:] if sttr is not None else "N/A"
+        pattern = "(answer|Answer|ANSWER|output|Output|OUTPUT|A): \(*(yes|Yes|YES|no|No|NO|maybe|Maybe|MAYBE)"
+        sttr = re.search(pattern, temp)
+        if sttr is not None:
+            mid_answer = sttr.group(0)
+            mid_answer = mid_answer.split(":")[-1].strip()
+            answer = mid_answer.lower()
+        else:
+            pattern = "(yes|Yes|YES|no|No|NO|maybe|Maybe|MAYBE)(\.|\s)"
+            sttr = re.search(pattern, temp)
+            if sttr is not None:
+                answer = sttr.group(0)[:-1].lower()
+            else:
+                answer = "N/A"
+        return answer
+    elif answer_type == "medmcqa":
+        # pattern = "Output: (A|B|C|D)."
+        # sttr = re.search(pattern, temp)
+        # answer = sttr.group(0)[8:-1].lower() if sttr is not None else "N/A"
+        pattern = "(answer|Answer|ANSWER|output|Output|OUTPUT|A): \(*(A|B|C|D|a|b|c|d)"
+        sttr = re.search(pattern, temp)
+        if sttr is not None:
+            mid_answer = sttr.group(0)
+            answer = mid_answer[-1].lower()
+        else:
+            pattern = "\(*(A|B|C|D|a|b|c|d)\)*(\.|\s)"
+            sttr = re.search(pattern, temp)
+            if sttr is not None:
+                if '(' in sttr.group(0):
+                    answer = sttr.group(0)[1].lower()
+                else:
+                    answer = sttr.group(0)[0].lower()
+            else:
+                answer = "N/A"
+        return answer
+    elif answer_type == "usmle":
+        # pattern = "Output: (A|B|C|D)."
+        # sttr = re.search(pattern, temp)
+        # answer = sttr.group(0)[8:-1].lower() if sttr is not None else "N/A"
+        pattern = "(Answer|Output|A): \(*(A|B|C|D|a|b|c|d)"
+        sttr = re.search(pattern, temp)
+        if sttr is not None:
+            mid_answer = sttr.group(0)
+            answer = mid_answer[-1].lower()
+        else:
+            pattern = "\(*(A|B|C|D|a|b|c|d)\)*(\.|\s)"
+            sttr = re.search(pattern, temp)
+            if sttr is not None:
+                if '(' in sttr.group(0):
+                    answer = sttr.group(0)[1].lower()
+                else:
+                    answer = sttr.group(0)[0].lower()
+            else:
+                answer = "N/A"
+        return answer
+    elif answer_type == "text":
+        return response
+    else:
+        raise NotImplementedError(f"Unsupported answer type: {answer_type}")
+    if len(temp) != 0:
+        answer = temp[-1]
+        # if there is . at the end of answer, remove it
+        # e.g. answer = 64.
+        if answer != "":
+            if answer[-1] == ".":
+                answer = answer[:-1]
+            # round the answer to nearest integer
+        if answer_type in ("gsm8k", "svamp"):
+            try:
+                answer = str(round(float(answer)))
+            except:
+                answer = "" # no sol or sol doesn't have valid format
+        elif answer_type in ("last_letters"):
+            try:
+                answer = answer[-args.concat_length:]
+            except:
+                answer = ""
+    else:
+        answer = ""
+    return answer

lmflow/version.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __version__ = "0.0.1"

requirements.txt CHANGED Viewed

	@@ -1 +1,13 @@
1	- ~~lmflow @ git+https://github.com/OptimalScale/LMFlow.git@c21a511c8abcb2bc9fba3ae4de847806688bdd3d~~

+numpy==1.24.2
+datasets==2.10.1
+peft @ git+https://github.com/huggingface/peft.git@deff03f2c251534fffd2511fc2d440e84cc54b1b
+torch==2.0.0
+wandb==0.14.0
+deepspeed==0.8.3
+trl @ git+https://github.com/lvwerra/trl.git#egg=trl-0.4.1
+sentencepiece
+transformers @ git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda
+flask
+flask_cors
+icetk
+cpm_kernels==1.0.11