Matt committed
Commit eddf7c4
1 Parent(s): 50ec422

Revert to Falcon naming

config.json CHANGED
@@ -6,12 +6,12 @@
   ],
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_RW.RWConfig",
-    "AutoModel": "modeling_RW.RWModel",
-    "AutoModelForSequenceClassification": "modeling_RW.RWForSequenceClassification",
-    "AutoModelForTokenClassification": "modeling_RW.RWForTokenClassification",
-    "AutoModelForQuestionAnswering": "modeling_RW.RWForQuestionAnswering",
-    "AutoModelForCausalLM": "modeling_RW.RWForCausalLM"
+    "AutoConfig": "configuration_falcon.FalconConfig",
+    "AutoModel": "modeling_falcon.FalconModel",
+    "AutoModelForSequenceClassification": "modeling_falcon.FalconForSequenceClassification",
+    "AutoModelForTokenClassification": "modeling_falcon.FalconForTokenClassification",
+    "AutoModelForQuestionAnswering": "modeling_falcon.FalconForQuestionAnswering",
+    "AutoModelForCausalLM": "modeling_falcon.FalconForCausalLM"
   },
   "bias": false,
   "bos_token_id": 11,
configuration_RW.py → configuration_falcon.py RENAMED
@@ -25,7 +25,7 @@ FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 }
 
 
-class RWConfig(PretrainedConfig):
+class FalconConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`FalconModel`]. It is used to instantiate a Falcon
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -80,10 +80,10 @@ class RWConfig(PretrainedConfig):
     Example:
 
     ```python
-    >>> from transformers import FalconModel, RWConfig
+    >>> from transformers import FalconModel, FalconConfig
 
     >>> # Initializing a small (2-layer) Falcon configuration
-    >>> configuration = RWConfig(num_hidden_layers=2)
+    >>> configuration = FalconConfig(num_hidden_layers=2)
 
     >>> # Initializing a model from the small configuration
     >>> model = FalconModel(configuration)
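
For completeness, the updated docstring example runs end to end once `FalconConfig`/`FalconModel` are importable (via a transformers release that includes Falcon, or through this repo's `auto_map` with `trust_remote_code=True`); the final line, reading the configuration back off the model, is an illustrative addition here, not part of the diff:

```python
from transformers import FalconConfig, FalconModel

# Initializing a small (2-layer) Falcon configuration
configuration = FalconConfig(num_hidden_layers=2)

# Initializing a randomly weighted model from that configuration
model = FalconModel(configuration)

# Accessing the model configuration again
configuration = model.config
```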
modeling_RW.py → modeling_falcon.py RENAMED
@@ -32,7 +32,7 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
-from .configuration_RW import RWConfig
+from .configuration_falcon import FalconConfig
 
 
 logger = logging.get_logger(__name__)
@@ -46,7 +46,7 @@ FALCON_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "tiiuae/falcon-rw-1b",
 ]
 _CHECKPOINT_FOR_DOC = "Rocketknight1/falcon-rw-1b"
-_CONFIG_FOR_DOC = "RWConfig"
+_CONFIG_FOR_DOC = "FalconConfig"
 
 
 # NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during training, this means that there's one additional quantization to bfloat16 between the operations.
@@ -188,7 +188,7 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
 
 
 class FalconAttention(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
 
         self.hidden_size = config.hidden_size
@@ -396,7 +396,7 @@ class FalconAttention(nn.Module):
 
 
 class FalconMLP(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
         hidden_size = config.hidden_size
 
@@ -412,7 +412,7 @@ class FalconMLP(nn.Module):
 
 
 class FalconDecoderLayer(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
         hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -499,7 +499,7 @@ FALCON_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`RWConfig`]): Model configuration class with all the parameters of the model.
+        config ([`FalconConfig`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
@@ -559,13 +559,13 @@ FALCON_INPUTS_DOCSTRING = r"""
 """
 
 
-class RWPreTrainedModel(PreTrainedModel):
+class FalconPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
     models.
     """
 
-    config_class = RWConfig
+    config_class = FalconConfig
     base_model_prefix = "transformer"
     supports_gradient_checkpointing = True
     _no_split_modules = ["FalconDecoderLayer"]
@@ -589,9 +589,9 @@ class RWPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
-    # Copied from transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._set_gradient_checkpointing with BloomModel->RWModel
+    # Copied from transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._set_gradient_checkpointing with BloomModel->FalconModel
     def _set_gradient_checkpointing(self, module: nn.Module, value: bool = False):
-        if isinstance(module, RWModel):
+        if isinstance(module, FalconModel):
             module.gradient_checkpointing = value
 
     @staticmethod
@@ -635,8 +635,8 @@ class RWPreTrainedModel(PreTrainedModel):
     "The bare Falcon Model transformer outputting raw hidden-states without any specific head on top.",
     FALCON_START_DOCSTRING,
 )
-class RWModel(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconModel(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
 
         self.embed_dim = config.hidden_size
@@ -835,12 +835,12 @@ class RWModel(RWPreTrainedModel):
     "The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).",
     FALCON_START_DOCSTRING,
 )
-class RWForCausalLM(RWPreTrainedModel):
+class FalconForCausalLM(FalconPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
         # Initialize weights and apply final processing
@@ -965,7 +965,7 @@ class RWForCausalLM(RWPreTrainedModel):
    """
    The Falcon Model transformer with a sequence classification head on top (linear layer).
 
-    [`RWForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.
 
    Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -976,11 +976,11 @@ class RWForCausalLM(RWPreTrainedModel):
    """,
    FALCON_START_DOCSTRING,
 )
-class RWForSequenceClassification(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconForSequenceClassification(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1092,12 +1092,12 @@ class RWForSequenceClassification(RWPreTrainedModel):
    """,
    FALCON_START_DOCSTRING,
 )
-class RWForTokenClassification(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconForTokenClassification(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
 
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         if getattr(config, "classifier_dropout", None) is not None:
             classifier_dropout = config.classifier_dropout
         elif getattr(config, "hidden_dropout", None) is not None:
@@ -1181,10 +1181,10 @@ class RWForTokenClassification(RWPreTrainedModel):
    """,
    FALCON_START_DOCSTRING,
 )
-class RWForQuestionAnswering(RWPreTrainedModel):
+class FalconForQuestionAnswering(FalconPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
 
         # Initialize weights and apply final processing
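
A practical note on the rename, with an illustrative sketch (not part of this commit): every head class still stores the base model as `self.transformer = FalconModel(config)` under the unchanged `base_model_prefix = "transformer"`, so state-dict keys are derived from attribute names rather than class names and existing checkpoints load unchanged.

```python
from transformers import FalconConfig, FalconForCausalLM

# Tiny random-weight model just to inspect parameter names (sizes are illustrative).
config = FalconConfig(vocab_size=256, hidden_size=64, num_hidden_layers=2, num_attention_heads=4)
model = FalconForCausalLM(config)

# Keys come from attribute names ("transformer.", "lm_head."), not from the class
# names, so RWForCausalLM -> FalconForCausalLM leaves them untouched.
print(list(model.state_dict())[:2])
# e.g. ['transformer.word_embeddings.weight', 'transformer.h.0.input_layernorm.weight']
```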