Matt committed
Commit: f875998 • 1 Parent(s): d1fb690

Revert to Falcon naming
config.json
CHANGED
@@ -5,12 +5,12 @@
     "FalconForCausalLM"
   ],
   "auto_map": {
-    "AutoConfig": "configuration_RW.RWConfig",
-    "AutoModel": "modeling_RW.RWModel",
-    "AutoModelForSequenceClassification": "modeling_RW.RWForSequenceClassification",
-    "AutoModelForTokenClassification": "modeling_RW.RWForTokenClassification",
-    "AutoModelForQuestionAnswering": "modeling_RW.RWForQuestionAnswering",
-    "AutoModelForCausalLM": "modeling_RW.RWForCausalLM"
+    "AutoConfig": "configuration_falcon.FalconConfig",
+    "AutoModel": "modeling_falcon.FalconModel",
+    "AutoModelForSequenceClassification": "modeling_falcon.FalconForSequenceClassification",
+    "AutoModelForTokenClassification": "modeling_falcon.FalconForTokenClassification",
+    "AutoModelForQuestionAnswering": "modeling_falcon.FalconForQuestionAnswering",
+    "AutoModelForCausalLM": "modeling_falcon.FalconForCausalLM"
   },
   "attention_dropout": 0.0,
   "bias": true,
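The `auto_map` block above is what lets the `transformers` `Auto*` classes locate this repository's custom code when a user passes `trust_remote_code=True`. Below is a minimal sketch of loading through the updated mapping; the repo id is taken from the `_CHECKPOINT_FOR_DOC` constant further down and is used purely for illustration, not as the definitive checkpoint.

```python
# Sketch: resolve the custom Falcon classes via the auto_map in config.json.
# Assumes the repository ships configuration_falcon.py and modeling_falcon.py
# alongside this config; the repo id below is illustrative.
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "Rocketknight1/falcon-rw-1b"

# trust_remote_code=True imports the classes named in "auto_map"
# (e.g. modeling_falcon.FalconForCausalLM) from the repository itself.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
print(type(config).__name__, type(model).__name__)
```

Before this revert, the same call would have resolved to the `RW*`-named classes in `configuration_RW.py`/`modeling_RW.py` instead.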
configuration_RW.py → configuration_falcon.py
RENAMED
@@ -25,7 +25,7 @@ FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 }
 
 
-class RWConfig(PretrainedConfig):
+class FalconConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`FalconModel`]. It is used to instantiate a Falcon
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -80,10 +80,10 @@ class RWConfig(PretrainedConfig):
     Example:
 
     ```python
-    >>> from transformers import FalconModel, RWConfig
+    >>> from transformers import FalconModel, FalconConfig
 
     >>> # Initializing a small (2-layer) Falcon configuration
-    >>> configuration = RWConfig(num_hidden_layers=2)
+    >>> configuration = FalconConfig(num_hidden_layers=2)
 
     >>> # Initializing a model from the small configuration
     >>> model = FalconModel(configuration)
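Since `FalconConfig` remains a `PretrainedConfig` subclass, the usual save/load round trip still applies under the new name. A minimal sketch, assuming `configuration_falcon.py` from this repository is importable from the working directory:

```python
# Sketch: the renamed class still behaves like any PretrainedConfig.
# Assumes configuration_falcon.py from this repo is on the Python path.
from configuration_falcon import FalconConfig

configuration = FalconConfig(num_hidden_layers=2)    # small config, as in the docstring example
configuration.save_pretrained("falcon-tiny-config")  # writes a config.json into that directory

reloaded = FalconConfig.from_pretrained("falcon-tiny-config")
assert reloaded.num_hidden_layers == 2
```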
modeling_RW.py → modeling_falcon.py
RENAMED
@@ -32,7 +32,7 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
-from .configuration_RW import RWConfig
+from .configuration_falcon import FalconConfig
 
 
 logger = logging.get_logger(__name__)
@@ -46,7 +46,7 @@ FALCON_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "tiiuae/falcon-rw-1b",
 ]
 _CHECKPOINT_FOR_DOC = "Rocketknight1/falcon-rw-1b"
-_CONFIG_FOR_DOC = "RWConfig"
+_CONFIG_FOR_DOC = "FalconConfig"
 
 
 # NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during training, this means that there's one additional quantization to bfloat16 between the operations.
@@ -188,7 +188,7 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
 
 
 class FalconAttention(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
 
         self.hidden_size = config.hidden_size
@@ -396,7 +396,7 @@ class FalconAttention(nn.Module):
 
 
 class FalconMLP(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
         hidden_size = config.hidden_size
 
@@ -412,7 +412,7 @@ class FalconMLP(nn.Module):
 
 
 class FalconDecoderLayer(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
         hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -499,7 +499,7 @@ FALCON_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`RWConfig`]): Model configuration class with all the parameters of the model.
+        config ([`FalconConfig`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
@@ -559,13 +559,13 @@ FALCON_INPUTS_DOCSTRING = r"""
 """
 
 
-class RWPreTrainedModel(PreTrainedModel):
+class FalconPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
     models.
     """
 
-    config_class = RWConfig
+    config_class = FalconConfig
     base_model_prefix = "transformer"
     supports_gradient_checkpointing = True
     _no_split_modules = ["FalconDecoderLayer"]
@@ -589,9 +589,9 @@ class RWPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
-    # Copied from transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._set_gradient_checkpointing with BloomModel->RWModel
+    # Copied from transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._set_gradient_checkpointing with BloomModel->FalconModel
     def _set_gradient_checkpointing(self, module: nn.Module, value: bool = False):
-        if isinstance(module, RWModel):
+        if isinstance(module, FalconModel):
             module.gradient_checkpointing = value
 
     @staticmethod
@@ -635,8 +635,8 @@ class RWPreTrainedModel(PreTrainedModel):
     "The bare Falcon Model transformer outputting raw hidden-states without any specific head on top.",
     FALCON_START_DOCSTRING,
 )
-class RWModel(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconModel(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
 
         self.embed_dim = config.hidden_size
@@ -835,12 +835,12 @@ class RWModel(RWPreTrainedModel):
     "The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).",
     FALCON_START_DOCSTRING,
 )
-class RWForCausalLM(RWPreTrainedModel):
+class FalconForCausalLM(FalconPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
         # Initialize weights and apply final processing
@@ -965,7 +965,7 @@ class RWForCausalLM(RWPreTrainedModel):
     """
     The Falcon Model transformer with a sequence classification head on top (linear layer).
 
-    [`RWForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-1) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -976,11 +976,11 @@ class RWForCausalLM(RWPreTrainedModel):
     """,
     FALCON_START_DOCSTRING,
 )
-class RWForSequenceClassification(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconForSequenceClassification(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1092,12 +1092,12 @@ class RWForSequenceClassification(RWPreTrainedModel):
     """,
    FALCON_START_DOCSTRING,
 )
-class RWForTokenClassification(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconForTokenClassification(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
 
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         if getattr(config, "classifier_dropout", None) is not None:
             classifier_dropout = config.classifier_dropout
         elif getattr(config, "hidden_dropout", None) is not None:
@@ -1181,10 +1181,10 @@ class RWForTokenClassification(RWPreTrainedModel):
     """,
     FALCON_START_DOCSTRING,
 )
-class RWForQuestionAnswering(RWPreTrainedModel):
+class FalconForQuestionAnswering(FalconPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
 
         # Initialize weights and apply final processing
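Because the modeling file is renamed in lockstep with the configuration, the wiring set up in the `__init__` methods above is unchanged; only the class names move from `RW*` to `Falcon*`. A minimal sketch of that structure follows, assuming the repo-local modules are importable; passing `hidden_size`, `num_attention_heads`, and `vocab_size` to the constructor is an assumption here (only `num_hidden_layers` appears in the docstring example), used solely to keep random initialization cheap.

```python
# Sketch of the renamed class hierarchy; repo-local imports are assumed.
from configuration_falcon import FalconConfig
from modeling_falcon import FalconForCausalLM, FalconModel, FalconPreTrainedModel

# Tiny config so random initialization is cheap; the size kwargs below are
# assumed to match the standard Falcon config parameter names.
config = FalconConfig(
    num_hidden_layers=2,
    hidden_size=64,
    num_attention_heads=4,
    vocab_size=128,
)

model = FalconForCausalLM(config)

# The causal-LM head wraps a FalconModel under base_model_prefix "transformer",
# exactly as in the __init__ shown in the diff above.
assert isinstance(model, FalconPreTrainedModel)
assert isinstance(model.transformer, FalconModel)
assert model.lm_head.out_features == config.vocab_size
```

The same `self.transformer = FalconModel(config)` pattern is shared by the sequence-classification, token-classification, and question-answering heads, so checkpoints saved under the `transformer.` prefix keep loading across all of them.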