from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal, Optional, Union

from peft_mora.config import PeftConfig
from peft_mora.utils import PeftType


@dataclass
class LoftQConfig:
    """
    This is the sub-configuration class to store the LoftQ configuration used by a [`LoraModel`].

    Args:
        loftq_bits (`int`): Quantization bits for LoftQ. Defaults to 4.
        loftq_iter (`int`): Number of alternating iterations for LoftQ. Defaults to 1.
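    Example (a minimal sketch; it assumes `LoftQConfig` and `LoraConfig` are importable from the package root, as
    in upstream PEFT, that the base model is passed unquantized since LoftQ performs the quantization itself, and
    that `scipy` is installed):

        ```py
        >>> from peft_mora import LoftQConfig, LoraConfig

        >>> loftq_config = LoftQConfig(loftq_bits=4, loftq_iter=1)
        >>> lora_config = LoraConfig(
        ...     target_modules=["q_proj", "v_proj"],  # hypothetical module names; depends on the base model
        ...     init_lora_weights="loftq",
        ...     loftq_config=loftq_config,
        ... )
        ```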
    """

    loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"})
    loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"})


@dataclass
class LoraConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`LoraModel`].

    Args:
        r (`int`):
            Lora attention dimension (the "rank").
        target_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to apply the adapter to. If this is specified, only the modules with the
            specified names will be replaced. When passing a string, a regex match will be performed. When passing
            a list of strings, either an exact match will be performed or it is checked if the name of the module
            ends with any of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D
            modules are chosen, excluding the output layer. If this is not specified, modules will be chosen
            according to the model architecture. If the architecture is not known, an error will be raised -- in
            this case, you should specify the target modules manually.
        lora_alpha (`int`):
            The alpha parameter for Lora scaling.
        lora_dropout (`float`):
            The dropout probability for Lora layers.
        fan_in_fan_out (`bool`):
            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
        bias (`str`):
            Bias type for LoRA. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the corresponding
            biases will be updated during training. Be aware that this means that, even when disabling the
            adapters, the model will not produce the same output as the base model would have without adaptation.
        use_rslora (`bool`):
            When set to True, uses <a href='https://doi.org/10.48550/arXiv.2312.03732'>Rank-Stabilized LoRA</a>
            which sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it was proven to work
            better. Otherwise, it will use the original default value of `lora_alpha/r`.
        modules_to_save (`List[str]`):
            List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint.
        init_lora_weights (`bool` | `Literal["gaussian", "loftq"]`):
            How to initialize the weights of the adapter layers. Passing True (default) results in the default
            initialization from the reference implementation from Microsoft. Passing 'gaussian' results in
            Gaussian initialization scaled by the LoRA rank for linear layers. Setting the initialization to False
            leads to completely random initialization and is discouraged. Pass `'loftq'` to use LoftQ
            initialization.
        layers_to_transform (`Union[List[int], int]`):
            The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer
            indices that are specified in this list. If a single integer is passed, it will apply the
            transformations on the layer at this index.
        layers_pattern (`str`):
            The layer pattern name, used only if `layers_to_transform` is different from `None`.
        rank_pattern (`dict`):
            The mapping from layer names or regexp expression to ranks which are different from the default rank
            specified by `r`.
        alpha_pattern (`dict`):
            The mapping from layer names or regexp expression to alphas which are different from the default alpha
            specified by `lora_alpha`.
        megatron_config (`Optional[dict]`):
            The TransformerConfig arguments for Megatron. It is used to create LoRA's parallel linear layer. You
            can get it like this, `core_transformer_config_from_args(get_args())`, these two functions being from
            Megatron. The arguments will be used to initialize the TransformerConfig of Megatron. You need to
            specify this parameter when you want to apply LoRA to the ColumnParallelLinear and RowParallelLinear
            layers of megatron.
        megatron_core (`Optional[str]`):
            The core module from Megatron to use, defaults to `"megatron.core"`.
        loftq_config (`Optional[LoftQConfig]`):
            The configuration of LoftQ. If this is not None, then LoftQ will be used to quantize the backbone
            weights and initialize Lora layers. Also pass `init_lora_weights='loftq'`. Note that you should not
            pass a quantized model in this case, as LoftQ will quantize the model itself.
        use_dora (`bool`):
            Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the
            weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the
            magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA,
            especially at low ranks. Right now, DoRA only supports non-quantized linear layers. DoRA introduces a
            bigger overhead than pure LoRA, so it is recommended to merge weights for inference. For more
            information, see https://arxiv.org/abs/2402.09353.
        use_mora (`bool`):
            Enable MoRA (high-rank updating), the adaptation method this fork adds on top of LoRA.
        mora_type (`int`):
            Which MoRA variant to use. Only relevant when `use_mora=True`.
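    Example (a minimal sketch; it assumes `LoraConfig` and `get_peft_model` are importable from the package root,
    as in upstream PEFT, and uses a hypothetical causal LM checkpoint):

        ```py
        >>> from transformers import AutoModelForCausalLM
        >>> from peft_mora import LoraConfig, get_peft_model

        >>> config = LoraConfig(
        ...     r=8,
        ...     lora_alpha=16,
        ...     target_modules=["q_proj", "v_proj"],  # hypothetical module names; depends on the architecture
        ...     lora_dropout=0.05,
        ...     bias="none",
        ...     task_type="CAUSAL_LM",
        ... )
        >>> base_model = AutoModelForCausalLM.from_pretrained("path/to/causal-lm")  # hypothetical checkpoint
        >>> model = get_peft_model(base_model, config)
        ```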
    """

    r: int = field(default=8, metadata={"help": "Lora attention dimension"})
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "List of module names or regex expression of the module names to replace with LoRA. "
                "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
                "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the "
                "output layer. If not specified, modules will be chosen according to the model architecture. If "
                "the architecture is not known, an error will be raised -- in this case, you should specify the "
                "target modules manually."
            ),
        },
    )
    lora_alpha: int = field(default=8, metadata={"help": "Lora alpha"})
    lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout"})
    fan_in_fan_out: bool = field(
        default=False,
        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
    )
    bias: Literal["none", "all", "lora_only"] = field(
        default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"}
    )
    use_rslora: bool = field(
        default=False,
        metadata={
            "help": (
                "When set to True, uses Rank-Stabilized LoRA doi.org/10.48550/arXiv.2312.03732"
                " which sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it"
                " was proven to work better. Otherwise, it will use the original default"
                " value of `lora_alpha/r`."
            )
        },
    )
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": (
                "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. "
                "For example, in Sequence Classification or Token Classification tasks, "
                "the final layer `classifier/score` is randomly initialized and as such needs to be trainable and saved."
            )
        },
    )
    init_lora_weights: bool | Literal["gaussian", "loftq"] = field(
        default=True,
        metadata={
            "help": (
                "How to initialize the weights of the LoRA layers. Passing True (default) results in the default "
                "initialization from the reference implementation from Microsoft. Passing 'gaussian' results "
                "in Gaussian initialization scaled by the LoRA rank for linear layers. Setting the initialization "
                "to False leads to completely random initialization and is discouraged. "
                "Pass `'loftq'` to use LoftQ initialization."
            ),
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": (
                "The layer indices to transform. If this argument is specified, PEFT will transform only the "
                "layers at the indices given in this list. If a single integer is passed, PEFT will transform "
                "only the layer at this index. "
                "This only works when target_modules is a list of str."
            )
        },
    )
    layers_pattern: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "The layer pattern name, used only if `layers_to_transform` is different from None and if the "
                "layer pattern is not in the common layers pattern. "
                "This only works when target_modules is a list of str."
            )
        },
    )
    rank_pattern: Optional[dict] = field(
        default_factory=dict,
        metadata={
            "help": (
                "The mapping from layer names or regexp expression to ranks which are different from the default "
                "rank specified by `r`. "
                "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8}`"
            )
        },
    )
    alpha_pattern: Optional[dict] = field(
        default_factory=dict,
        metadata={
            "help": (
                "The mapping from layer names or regexp expression to alphas which are different from the default "
                "alpha specified by `lora_alpha`. "
                "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32}`"
            )
        },
    )
    megatron_config: Optional[dict] = field(
        default=None,
        metadata={
            "help": (
                "The TransformerConfig from Megatron. It is used to create LoRA's parallel linear layer. "
                "You can get it like this, `core_transformer_config_from_args(get_args())`, "
                "these two functions being from Megatron. "
                "You need to specify this parameter when you want to apply LoRA to the ColumnParallelLinear and "
                "RowParallelLinear layers of megatron. "
                "Note that `save_pretrained` and `from_pretrained` may not work in this case, because "
                "TransformerConfig may not necessarily be serializable. "
                "When using megatron, you can instead rely on the `get_peft_model_state_dict` function and "
                "megatron's own framework to save and load models and configurations."
            )
        },
    )
    megatron_core: Optional[str] = field(
        default="megatron.core",
        metadata={
            "help": (
                "The core module from Megatron, it is used to create LoRA's parallel linear layer. "
                "It only needs to be passed in when you need to use your own modified megatron core module. "
                "Otherwise, it will use the default value `megatron.core`."
            )
        },
    )
    loftq_config: Union[LoftQConfig, dict] = field(
        default_factory=dict,
        metadata={
            "help": (
                "The configuration of LoftQ. If this is passed, then LoftQ will be used to quantize the backbone "
                "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` in this case."
            )
        },
    )
    use_dora: bool = field(
        default=False,
        metadata={
            "help": (
                "Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of "
                "the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, "
                "whereas the magnitude is handled by a separate learnable parameter. This can improve the "
                "performance of LoRA, especially at low ranks. Right now, DoRA only supports non-quantized linear "
                "layers. DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights "
                "for inference. For more information, see https://arxiv.org/abs/2402.09353."
            )
        },
    )
    use_mora: bool = field(
        default=False,
        metadata={"help": "Enable MoRA (high-rank updating)."},
    )
    mora_type: int = field(
        default=1,
        metadata={"help": "Which MoRA variant to use. Only relevant when `use_mora=True`."},
    )

    def __post_init__(self):
        self.peft_type = PeftType.LORA
        # Convert target_modules to a set to deduplicate entries and speed up membership checks.
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )

        if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
            raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")

        if isinstance(self.target_modules, str) and self.layers_pattern is not None:
            raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.")

        if self.use_dora and (self.megatron_config or self.init_lora_weights == "loftq"):
            raise ValueError("DoRA does not support megatron_core or LoftQ. Please set `use_dora=False`.")

        if self.init_lora_weights == "loftq":
            import importlib.util

            if not importlib.util.find_spec("scipy"):
                raise ImportError("The required package 'scipy' is not installed. Please install it to continue.")
            # The default is an empty dict, so check for falsiness rather than None.
            if not self.loftq_config:
                raise ValueError("`loftq_config` must be specified when `init_lora_weights` is 'loftq'.")

        # Normalize a LoftQConfig instance into a plain dict so the config stays serializable.
        if self.loftq_config and not isinstance(self.loftq_config, dict):
            self.loftq_config = vars(self.loftq_config)