davidjbarnes committed on
Commit
db6efa8
·
verified ·
1 Parent(s): 5d52c29

Upload QLoRA adapter - DeepSeek-Coder 6.7B fine-tuned on Java/Spring Boot + React/TypeScript

Browse files
README.md ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: deepseek-ai/deepseek-coder-6.7b-instruct
5
+ tags:
6
+ - axolotl
7
+ - base_model:adapter:deepseek-ai/deepseek-coder-6.7b-instruct
8
+ - lora
9
+ - transformers
10
+ datasets:
11
+ - custom
12
+ pipeline_tag: text-generation
13
+ model-index:
14
+ - name: bridge-cli
15
+ results: []
16
+ ---
17
+
18
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
19
+ should probably proofread and complete it, then remove this comment. -->
20
+
21
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
22
+ <details><summary>See axolotl config</summary>
23
+
24
+ axolotl version: `0.15.0`
25
+ ```yaml
26
+ # Bridge CLI - Spring Boot Fine-Tuning Configuration
27
+ # Optimized for RunPod with budget GPU (RTX 4090/A5000 24GB)
28
+ # Using QLoRA for memory efficiency
29
+
30
+ base_model: deepseek-ai/deepseek-coder-6.7b-instruct
31
+ model_type: AutoModelForCausalLM
32
+ tokenizer_type: AutoTokenizer
33
+ trust_remote_code: true
34
+
35
+ # QLoRA Configuration (enables training on 24GB GPU)
36
+ load_in_4bit: true
37
+ adapter: qlora
38
+ lora_r: 16
39
+ lora_alpha: 32
40
+ lora_dropout: 0.05
41
+ lora_target_linear: true
42
+ lora_target_modules:
43
+ - q_proj
44
+ - v_proj
45
+ - k_proj
46
+ - o_proj
47
+ - gate_proj
48
+ - up_proj
49
+ - down_proj
50
+
51
+ # Dataset Configuration
52
+ datasets:
53
+ - path: /workspace/datasets/spring-boot-dataset.jsonl
54
+ type: alpaca
55
+ - path: /workspace/datasets/react-dataset.jsonl
56
+ type: alpaca
57
+
58
+ dataset_prepared_path: /workspace/prepared_data
59
+ val_set_size: 0.05
60
+ output_dir: /workspace/outputs/bridge-cli
61
+
62
+ # Training Parameters
63
+ sequence_len: 2048
64
+ sample_packing: true
65
+ pad_to_sequence_len: true
66
+
67
+ micro_batch_size: 4
68
+ gradient_accumulation_steps: 4
69
+ num_epochs: 3
70
+ learning_rate: 0.0002
71
+ lr_scheduler: cosine
72
+ warmup_ratio: 0.03
73
+ optimizer: adamw_8bit
74
+
75
+ # Memory Optimization
76
+ gradient_checkpointing: true
77
+ flash_attention: false
78
+ bf16: auto
79
+ tf32: false
80
+
81
+ # Training Settings
82
+ train_on_inputs: false
83
+ group_by_length: false
84
+ logging_steps: 10
85
+ save_strategy: steps
86
+ save_steps: 100
87
+ eval_steps: 100
88
+
89
+ # Weights & Biases (optional - remove if not using)
90
+ # wandb_project: bridge-cli
91
+ # wandb_run_id: spring-boot-finetune
92
+
93
+ # Early stopping
94
+ early_stopping_patience: 3
95
+
96
+ # For debugging - set to true to test config
97
+ debug: false
98
+
99
+ # Special tokens
100
+ special_tokens:
101
+ pad_token: "<|pad|>"
102
+
103
+ ```
104
+
105
+ </details><br>
106
+
107
+ # Bridge CLI - Fine-tuned Code Generation Model
108
+
109
+ This model is a fine-tuned version of [deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct) on custom Java/Spring Boot (2,307 examples) and React/TypeScript (4,041 examples) datasets in Alpaca instruction format.
110
+ It achieves the following results on the evaluation set:
111
+ - Loss: 0.4579
112
+ - Ppl: 1.5808
113
+ - Memory/max Active (gib): 6.45
114
+ - Memory/max Allocated (gib): 6.45
115
+ - Memory/device Reserved (gib): 10.47
116
+
117
+ ## Model description
118
+
119
+ More information needed
120
+
121
+ ## Intended uses & limitations
122
+
123
+ More information needed
124
+
125
+ ## Training and evaluation data
126
+
127
+ More information needed
128
+
129
+ ## Training procedure
130
+
131
+ ### Training hyperparameters
132
+
133
+ The following hyperparameters were used during training:
134
+ - learning_rate: 0.0002
135
+ - train_batch_size: 4
136
+ - eval_batch_size: 4
137
+ - seed: 42
138
+ - gradient_accumulation_steps: 4
139
+ - total_train_batch_size: 16
140
+ - optimizer: AdamW (8-bit) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
141
+ - lr_scheduler_type: cosine
142
+ - lr_scheduler_warmup_steps: 6
143
+ - training_steps: 222
144
+
145
+ ### Training results
146
+
147
+ | Training Loss | Epoch | Step | Validation Loss | Ppl | Active (gib) | Allocated (gib) | Reserved (gib) |
148
+ |:-------------:|:------:|:----:|:---------------:|:-------:|:------------:|:---------------:|:--------------:|
149
+ | No log | 0 | 0 | 3.3726 | 29.1542 | 6.36 | 6.36 | 12.58 |
150
+ | 0.4787 | 1.3356 | 100 | 0.5115 | 1.6679 | 6.45 | 6.45 | 10.47 |
151
+ | 0.4037 | 2.6711 | 200 | 0.4579 | 1.5808 | 6.45 | 6.45 | 10.47 |
152
+
153
+
154
+ ### Framework versions
155
+
156
+ - PEFT 0.18.1
157
+ - Transformers 5.3.0
158
+ - Pytorch 2.10.0+cu128
159
+ - Datasets 4.5.0
160
+ - Tokenizers 0.22.2
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-6.7b-instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "up_proj",
34
+ "o_proj",
35
+ "k_proj",
36
+ "down_proj",
37
+ "v_proj",
38
+ "gate_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae7c90f5adffd21f676487c3ed53eaa4eb29d63e63a69b489a4891091c9fffdd
3
+ size 159967880
chat_template.jinja ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}
2
+ {% set add_generation_prompt = false %}
3
+ {% endif %}
4
+ {%- set ns = namespace(found=false) -%}
5
+ {%- for message in messages -%}
6
+ {%- if message['role'] == 'system' -%}
7
+ {%- set ns.found = true -%}
8
+ {%- endif -%}
9
+ {%- endfor -%}
10
+ {{bos_token}}{%- if not ns.found -%}
11
+ {{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}
12
+ {%- endif %}
13
+ {%- for message in messages %}
14
+ {%- if message['role'] == 'system' %}
15
+ {{ message['content'] }}
16
+ {%- else %}
17
+ {%- if message['role'] == 'user' %}
18
+ {{'### Instruction:\n' + message['content'] + '\n'}}
19
+ {%- else %}
20
+ {{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}
21
+ {%- endif %}
22
+ {%- endif %}
23
+ {%- endfor %}
24
+ {% if add_generation_prompt %}
25
+ {{'### Response:'}}
26
+ {% endif %}
config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 32013,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 32021,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "max_position_embeddings": 16384,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 32,
20
+ "num_key_value_heads": 32,
21
+ "pad_token_id": null,
22
+ "pretraining_tp": 1,
23
+ "quantization_config": {
24
+ "_load_in_4bit": true,
25
+ "_load_in_8bit": false,
26
+ "bnb_4bit_compute_dtype": "bfloat16",
27
+ "bnb_4bit_quant_storage": "bfloat16",
28
+ "bnb_4bit_quant_type": "nf4",
29
+ "bnb_4bit_use_double_quant": true,
30
+ "llm_int8_enable_fp32_cpu_offload": false,
31
+ "llm_int8_has_fp16_weight": false,
32
+ "llm_int8_skip_modules": null,
33
+ "llm_int8_threshold": 6.0,
34
+ "load_in_4bit": true,
35
+ "load_in_8bit": false,
36
+ "quant_method": "bitsandbytes"
37
+ },
38
+ "rms_norm_eps": 1e-06,
39
+ "rope_parameters": {
40
+ "factor": 4.0,
41
+ "rope_theta": 100000,
42
+ "rope_type": "linear",
43
+ "type": "linear"
44
+ },
45
+ "tie_word_embeddings": false,
46
+ "transformers_version": "5.3.0",
47
+ "use_cache": false,
48
+ "vocab_size": 32256
49
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|begin▁of▁sentence|>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|EOT|>",
7
+ "is_local": false,
8
+ "model_max_length": 16384,
9
+ "pad_token": "<|pad|>",
10
+ "sp_model_kwargs": {},
11
+ "tokenizer_class": "LlamaTokenizer",
12
+ "unk_token": null,
13
+ "use_default_system_prompt": false
14
+ }