LoneStriker committed on
Commit
8d146e9
1 Parent(s): 627bc8d

Spicyboros-3.1 + Yi-34B-Llama

Browse files
README.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - generated_from_trainer
4
+ model-index:
5
+ - name: airo-lora-out
6
+ results: []
7
+ ---
8
+
9
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
10
+ should probably proofread and complete it, then remove this comment. -->
11
+
12
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
13
+ # airo-lora-out
14
+
15
+ This model is a LoRA adapter fine-tuned from Yi-34B-Llama; the training dataset is not specified.
16
+
17
+ ## Model description
18
+
19
+ More information needed
20
+
21
+ ## Intended uses & limitations
22
+
23
+ More information needed
24
+
25
+ ## Training and evaluation data
26
+
27
+ More information needed
28
+
29
+ ## Training procedure
30
+
31
+ ### Training hyperparameters
32
+
33
+ The following hyperparameters were used during training:
34
+ - learning_rate: 0.0001
35
+ - train_batch_size: 2
36
+ - eval_batch_size: 2
37
+ - seed: 42
38
+ - distributed_type: multi-GPU
39
+ - num_devices: 2
40
+ - total_train_batch_size: 4
41
+ - total_eval_batch_size: 4
42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
43
+ - lr_scheduler_type: constant
44
+ - num_epochs: 1
45
+
46
+ ### Training results
47
+
48
+
49
+
50
+ ### Framework versions
51
+
52
+ - Transformers 4.34.1
53
+ - Pytorch 2.0.1+cu118
54
+ - Datasets 2.14.6
55
+ - Tokenizers 0.14.1
adapter_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/users/ubuntu/models/Yi-34B-Llama",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "lora_alpha": 16,
12
+ "lora_dropout": 0.07,
13
+ "modules_to_save": null,
14
+ "peft_type": "LORA",
15
+ "r": 64,
16
+ "rank_pattern": {},
17
+ "revision": null,
18
+ "target_modules": [
19
+ "down_proj",
20
+ "up_proj",
21
+ "o_proj",
22
+ "gate_proj",
23
+ "k_proj",
24
+ "q_proj",
25
+ "v_proj"
26
+ ],
27
+ "task_type": "CAUSAL_LM"
28
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38e1a702e8e2ecb0690c1b3d373c8da2ec30cb70a5a41ef3080f783897807e92
3
+ size 1966383405
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</s>": 64001,
3
+ "<s>": 64000
4
+ }
config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/users/ubuntu/models/Yi-34B-Llama",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 7168,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 20480,
13
+ "max_position_embeddings": 4096,
14
+ "model_type": "llama",
15
+ "num_attention_heads": 56,
16
+ "num_hidden_layers": 60,
17
+ "num_key_value_heads": 8,
18
+ "pad_token_id": 0,
19
+ "pretraining_tp": 1,
20
+ "quantization_config": {
21
+ "bnb_4bit_compute_dtype": "bfloat16",
22
+ "bnb_4bit_quant_type": "nf4",
23
+ "bnb_4bit_use_double_quant": true,
24
+ "llm_int8_enable_fp32_cpu_offload": false,
25
+ "llm_int8_has_fp16_weight": false,
26
+ "llm_int8_skip_modules": null,
27
+ "llm_int8_threshold": 6.0,
28
+ "load_in_4bit": true,
29
+ "load_in_8bit": false,
30
+ "quant_method": "bitsandbytes"
31
+ },
32
+ "rms_norm_eps": 1e-05,
33
+ "rope_scaling": null,
34
+ "rope_theta": 5000000.0,
35
+ "tie_word_embeddings": false,
36
+ "torch_dtype": "bfloat16",
37
+ "transformers_version": "4.34.1",
38
+ "use_cache": false,
39
+ "vocab_size": 64002
40
+ }
configuration_yi.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Yi model configuration"""
2
+ from transformers.configuration_utils import PretrainedConfig
3
+ from transformers.utils import logging
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
+ Yi_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
8
+
9
+
10
+ class YiConfig(PretrainedConfig):
11
+ r"""
12
+ This is the configuration class to store the configuration of a [`YiModel`]. It is used to instantiate a Yi
13
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
14
+ defaults will yield a similar configuration to that of the Yi model.
15
+
16
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
17
+ documentation from [`PretrainedConfig`] for more information.
18
+
19
+
20
+ Args:
21
+ vocab_size (`int`, *optional*, defaults to 64000):
22
+ Vocabulary size of the Yi model. Defines the number of different tokens that can be represented by the
23
+ `inputs_ids` passed when calling [`YiModel`]
24
+ hidden_size (`int`, *optional*, defaults to 4096):
25
+ Dimension of the hidden representations.
26
+ intermediate_size (`int`, *optional*, defaults to 11008):
27
+ Dimension of the MLP representations.
28
+ num_hidden_layers (`int`, *optional*, defaults to 32):
29
+ Number of hidden layers in the Transformer decoder.
30
+ num_attention_heads (`int`, *optional*, defaults to 32):
31
+ Number of attention heads for each attention layer in the Transformer decoder.
32
+ num_key_value_heads (`int`, *optional*, defaults to 4):
33
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
34
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
35
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA); otherwise GQA is used. When
36
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
37
+ by meanpooling all the original heads within that group. For more details, check out [this
38
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If explicitly set to `None`, it will fall back to
39
+ `num_attention_heads`.
40
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
41
+ The non-linear activation function (function or string) in the decoder.
42
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
43
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
44
+ just in case (e.g., 512 or 1024 or 2048 or 4096).
45
+ initializer_range (`float`, *optional*, defaults to 0.02):
46
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
47
+ rms_norm_eps (`float`, *optional*, defaults to 1e-5):
48
+ The epsilon used by the rms normalization layers.
49
+ use_cache (`bool`, *optional*, defaults to `True`):
50
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
51
+ relevant if `config.is_decoder=True`.
52
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
53
+ Whether to tie the input and output word embeddings.
54
+ output_attentions (`bool`, *optional*, defaults to `False`):
55
+ Whether or not to output attentions.
56
+ rope_theta (`float`, *optional*, defaults to 5000000.0):
57
+ The base period of the RoPE embeddings.
58
+ Example:
59
+
60
+ ```python
61
+ >>> from transformers import YiModel, YiConfig
62
+
63
+ >>> # Initializing a Yi style configuration
64
+ >>> configuration = YiConfig()
65
+
66
+ >>> # Initializing a model from the Yi style configuration
67
+ >>> model = YiModel(configuration)
68
+
69
+ >>> # Accessing the model configuration
70
+ >>> configuration = model.config
71
+ ```"""
72
+ model_type = "Yi"
73
+ keys_to_ignore_at_inference = ["past_key_values"]
74
+
75
+ def __init__(
76
+ self,
77
+ vocab_size=64000,
78
+ hidden_size=4096,
79
+ intermediate_size=11008,
80
+ num_hidden_layers=32,
81
+ num_attention_heads=32,
82
+ num_key_value_heads=4,
83
+ hidden_act="silu",
84
+ max_position_embeddings=4096,
85
+ initializer_range=0.02,
86
+ rms_norm_eps=1e-5,
87
+ use_cache=True,
88
+ pad_token_id=0,
89
+ bos_token_id=1,
90
+ eos_token_id=2,
91
+ tie_word_embeddings=False,
92
+ output_attentions=False,
93
+ rope_theta=5000000.0,
94
+ **kwargs,
95
+ ):
96
+ self.vocab_size = vocab_size
97
+ self.max_position_embeddings = max_position_embeddings
98
+ self.hidden_size = hidden_size
99
+ self.intermediate_size = intermediate_size
100
+ self.num_hidden_layers = num_hidden_layers
101
+ self.num_attention_heads = num_attention_heads
102
+
103
+ # for backward compatibility
104
+ if num_key_value_heads is None:
105
+ num_key_value_heads = num_attention_heads
106
+
107
+ self.num_key_value_heads = num_key_value_heads
108
+ self.hidden_act = hidden_act
109
+ self.initializer_range = initializer_range
110
+ self.rms_norm_eps = rms_norm_eps
111
+ self.use_cache = use_cache
112
+ self.output_attentions = output_attentions
113
+ self.rope_theta = rope_theta
114
+
115
+ super().__init__(
116
+ pad_token_id=pad_token_id,
117
+ bos_token_id=bos_token_id,
118
+ eos_token_id=eos_token_id,
119
+ tie_word_embeddings=tie_word_embeddings,
120
+ **kwargs,
121
+ )
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386c49cf943d71aa110361135338c50e38beeff0a66593480421f37b319e1a39
3
+ size 1033105
tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<|startoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "64000": {
30
+ "content": "<s>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "64001": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ }
45
+ },
46
+ "auto_map": {
47
+ "AutoTokenizer": [
48
+ "tokenization_yi.YiTokenizer",
49
+ null
50
+ ]
51
+ },
52
+ "bos_token": "<|startoftext|>",
53
+ "clean_up_tokenization_spaces": false,
54
+ "eos_token": "<|endoftext|>",
55
+ "legacy": true,
56
+ "model_max_length": 4096,
57
+ "pad_token": "</s>",
58
+ "sp_model_kwargs": {},
59
+ "spaces_between_special_tokens": false,
60
+ "tokenizer_class": "LlamaTokenizer",
61
+ "trust_remote_code": false,
62
+ "unk_token": "<unk>",
63
+ "use_default_system_prompt": true,
64
+ "use_fast": true
65
+ }