CleverShovel committed on
Commit d3ccaab
1 parent: c66175e

Upload 7 files

Files changed (4)
  1. README.md +6 -9
  2. adapter_config.json +5 -4
  3. adapter_model.safetensors +1 -1
  4. config.json +3 -1
README.md CHANGED
@@ -7,8 +7,6 @@ base_model: mistralai/Mistral-7B-v0.1
 model-index:
 - name: llm_train/test_out
   results: []
-datasets:
-- CleverShovel/paper_reviews
 ---
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -30,7 +28,6 @@ strict: false
 
 bnb_config_kwargs:
   llm_int8_has_fp16_weight: true
-  bnb_4bit_compute_dtype: float16
   bnb_4bit_quant_type: nf4
   bnb_4bit_use_double_quant: false
 
@@ -44,7 +41,7 @@ output_dir: ./llm_train/test_out
 #using lora for lower cost
 adapter: qlora
 lora_r: 8
-lora_alpha: 16
+lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
   - q_proj
@@ -107,7 +104,7 @@ special_tokens:
 
 This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.0924
+- Loss: 2.0276
 
 ## Model description
 
@@ -141,13 +138,13 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 2.075 | 0.13 | 300 | 2.0924 |
+| 2.0121 | 0.13 | 300 | 2.0276 |
 
 
 ### Framework versions
 
-- PEFT 0.7.1
-- Transformers 4.37.0
-- Pytorch 2.1.2+cu121
+- PEFT 0.8.2
+- Transformers 4.38.0.dev0
+- Pytorch 2.1.2+cu118
 - Datasets 2.16.1
 - Tokenizers 0.15.0
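The substantive training change in this commit is the lora_alpha bump from 16 to 32 at a fixed lora_r of 8, which doubles the effective LoRA scaling factor (alpha / r) from 2 to 4. For reference, a minimal sketch of the equivalent peft.LoraConfig, using only the values visible in the diff (the surrounding axolotl training setup is not shown here):

```python
from peft import LoraConfig

# Adapter hyperparameters as they stand after this commit.
# The LoRA update is scaled by lora_alpha / r = 32 / 8 = 4
# (previously 16 / 8 = 2), so the adapter's contribution is doubled.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
```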
adapter_config.json CHANGED
@@ -9,7 +9,7 @@
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
-  "lora_alpha": 16,
+  "lora_alpha": 32,
   "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
@@ -19,8 +19,9 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
-    "v_proj"
+    "v_proj",
+    "q_proj"
   ],
-  "task_type": "CAUSAL_LM"
+  "task_type": "CAUSAL_LM",
+  "use_rslora": false
 }
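The new use_rslora: false key (written by PEFT 0.8+) keeps the classic alpha / r scaling rather than the rank-stabilized alpha / sqrt(r) variant. A hedged sketch of attaching an adapter described by such an adapter_config.json to the base model; the adapter repo id below is a hypothetical placeholder, not a repo named in this commit:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the base model, then attach the LoRA adapter described by
# adapter_config.json. "your-username/your-adapter-repo" is a
# hypothetical placeholder for wherever the adapter is published.
base = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.bfloat16,
)
model = PeftModel.from_pretrained(base, "your-username/your-adapter-repo")
```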
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:de8e409acca30ca66c0a950881e63eb5dd09be5416afd84b23b62440c853caf9
+oid sha256:0f45a27de191344315748c5f9764a27df9eeec0b9462340f82616f139e43d900
 size 13648432
config.json CHANGED
@@ -16,6 +16,8 @@
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
     "bnb_4bit_compute_dtype": "float16",
     "bnb_4bit_quant_type": "nf4",
     "bnb_4bit_use_double_quant": false,
@@ -32,7 +34,7 @@
   "sliding_window": 4096,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.37.0",
+  "transformers_version": "4.38.0.dev0",
   "use_cache": false,
   "vocab_size": 32000
 }
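The quantization_config block recorded in config.json describes a 4-bit NF4 bitsandbytes setup with float16 compute and double quantization disabled; the underscore-prefixed _load_in_4bit / _load_in_8bit keys appear to be the serialized form of the load_in_4bit / load_in_8bit flags as written by newer transformers versions. A minimal sketch of reloading the base model under the same settings (assuming bitsandbytes is installed and a CUDA device is available):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Mirrors the quantization_config recorded in config.json:
# 4-bit NF4 weights, float16 compute dtype, no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    device_map="auto",
)
```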