jupyterjazz committed on
Commit
79c3c93
·
1 Parent(s): c380b5a

feat: initialize models with or without adapters

Browse files

Signed-off-by: jupyterjazz <saba.sturua@jina.ai>

Files changed (2) hide show
  1. configuration_xlm_roberta.py +2 -0
  2. modeling_lora.py +13 -15
configuration_xlm_roberta.py CHANGED
@@ -22,6 +22,7 @@ class XLMRobertaFlashConfig(PretrainedConfig):
22
  use_cache=True,
23
  classifier_dropout=None,
24
  num_loras=5,
 
25
  **kwargs,
26
  ):
27
  super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -42,3 +43,4 @@ class XLMRobertaFlashConfig(PretrainedConfig):
42
  self.use_cache = use_cache
43
  self.classifier_dropout = classifier_dropout
44
  self.num_loras = num_loras
 
 
22
  use_cache=True,
23
  classifier_dropout=None,
24
  num_loras=5,
25
+ load_trained_adapters=False,
26
  **kwargs,
27
  ):
28
  super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
 
43
  self.use_cache = use_cache
44
  self.classifier_dropout = classifier_dropout
45
  self.num_loras = num_loras
46
+ self.load_trained_adapters = load_trained_adapters
modeling_lora.py CHANGED
@@ -61,8 +61,6 @@ class LoRAParametrization(nn.Module):
61
  fan_in_fan_out = layer_type == "embedding"
62
  self.swap = (lambda x: (x[1], x[0])) if fan_in_fan_out else (lambda x: x)
63
 
64
- # For the officially "correct" LoRA initialization, check here: https://github.com/microsoft/LoRA
65
- # TODO: Ensure that the initialization here is correct
66
  if layer_type == "linear":
67
  self.lora_A = nn.Parameter(
68
  initialized_weights((rank, fan_in), num_adaptions, init="kaiming")
@@ -207,13 +205,16 @@ class LoRAParametrization(nn.Module):
207
  class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
208
  def __init__(self, config: XLMRobertaFlashConfig, roberta: Optional[XLMRobertaModel] = None, add_pooling_layer=True):
209
  super().__init__(config)
 
210
  if roberta is None:
211
  self.roberta = XLMRobertaModel(config, add_pooling_layer=add_pooling_layer)
212
  else:
213
  self.roberta = roberta
 
214
  self._is_merged = False
215
  self._num_adaptions = config.num_loras
216
  self._register_lora(self._num_adaptions)
 
217
  self.main_params_trainable = False
218
  self._task_idx = None
219
  # By default, we select the first LoRA
@@ -236,12 +237,6 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
236
  if "lora" not in name:
237
  param.requires_grad_(val)
238
 
239
- @classmethod
240
- def from_roberta(cls, *args, **kwargs):
241
- roberta = XLMRobertaModel.from_pretrained(*args, **kwargs)
242
- config = XLMRobertaFlashConfig.from_pretrained(*args, **kwargs)
243
- return cls(config, roberta=roberta)
244
-
245
  def merge_lora(self):
246
  """Merges currently selected LoRA into main weights."""
247
  if self._is_merged:
@@ -264,13 +259,16 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
264
  use_safetensors: bool = None,
265
  **kwargs,
266
  ):
267
- """
268
- TODO: choose between from_roberta and super().from_pretrained
269
- We want to be able to load both a pretrained XLMRoBertaModel, and a trained
270
- XLMRobertaLoRA via this method. To this end, we need to check which of these
271
- models we are expected to load.
272
- """
273
- return cls.from_roberta(pretrained_model_name_or_path)
 
 
 
274
 
275
  def _register_lora(self, num_adaptions=1, rank=4, lora_dropout_p=0.0, lora_alpha=1):
276
  self.apply(
 
61
  fan_in_fan_out = layer_type == "embedding"
62
  self.swap = (lambda x: (x[1], x[0])) if fan_in_fan_out else (lambda x: x)
63
 
 
 
64
  if layer_type == "linear":
65
  self.lora_A = nn.Parameter(
66
  initialized_weights((rank, fan_in), num_adaptions, init="kaiming")
 
205
  class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
206
  def __init__(self, config: XLMRobertaFlashConfig, roberta: Optional[XLMRobertaModel] = None, add_pooling_layer=True):
207
  super().__init__(config)
208
+
209
  if roberta is None:
210
  self.roberta = XLMRobertaModel(config, add_pooling_layer=add_pooling_layer)
211
  else:
212
  self.roberta = roberta
213
+
214
  self._is_merged = False
215
  self._num_adaptions = config.num_loras
216
  self._register_lora(self._num_adaptions)
217
+
218
  self.main_params_trainable = False
219
  self._task_idx = None
220
  # By default, we select the first LoRA
 
237
  if "lora" not in name:
238
  param.requires_grad_(val)
239
 
 
 
 
 
 
 
240
  def merge_lora(self):
241
  """Merges currently selected LoRA into main weights."""
242
  if self._is_merged:
 
259
  use_safetensors: bool = None,
260
  **kwargs,
261
  ):
262
+ config = XLMRobertaFlashConfig.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
263
+ if config.load_trained_adapters:
264
+ return super().from_pretrained(
265
+ pretrained_model_name_or_path,
266
+ *model_args,
267
+ **kwargs
268
+ )
269
+ else:
270
+ roberta = XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
271
+ return cls(config, roberta=roberta)
272
 
273
  def _register_lora(self, num_adaptions=1, rank=4, lora_dropout_p=0.0, lora_alpha=1):
274
  self.apply(