Upload QLoRA adapter - DeepSeek-Coder 6.7B fine-tuned on Java/Spring Boot + React/TypeScript
Browse files- README.md +160 -0
- adapter_config.json +46 -0
- adapter_model.safetensors +3 -0
- chat_template.jinja +26 -0
- config.json +49 -0
- tokenizer.json +0 -0
- tokenizer_config.json +14 -0
README.md
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: peft
|
| 3 |
+
license: other
|
| 4 |
+
base_model: deepseek-ai/deepseek-coder-6.7b-instruct
|
| 5 |
+
tags:
|
| 6 |
+
- axolotl
|
| 7 |
+
- base_model:adapter:deepseek-ai/deepseek-coder-6.7b-instruct
|
| 8 |
+
- lora
|
| 9 |
+
- transformers
|
| 10 |
+
datasets:
|
| 11 |
+
- custom
|
| 12 |
+
pipeline_tag: text-generation
|
| 13 |
+
model-index:
|
| 14 |
+
- name: bridge-cli
|
| 15 |
+
results: []
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
| 19 |
+
should probably proofread and complete it, then remove this comment. -->
|
| 20 |
+
|
| 21 |
+
[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
|
| 22 |
+
<details><summary>See axolotl config</summary>
|
| 23 |
+
|
| 24 |
+
axolotl version: `0.15.0`
|
| 25 |
+
```yaml
|
| 26 |
+
# Bridge CLI - Spring Boot Fine-Tuning Configuration
|
| 27 |
+
# Optimized for RunPod with budget GPU (RTX 4090/A5000 24GB)
|
| 28 |
+
# Using QLoRA for memory efficiency
|
| 29 |
+
|
| 30 |
+
base_model: deepseek-ai/deepseek-coder-6.7b-instruct
|
| 31 |
+
model_type: AutoModelForCausalLM
|
| 32 |
+
tokenizer_type: AutoTokenizer
|
| 33 |
+
trust_remote_code: true
|
| 34 |
+
|
| 35 |
+
# QLoRA Configuration (enables training on 24GB GPU)
|
| 36 |
+
load_in_4bit: true
|
| 37 |
+
adapter: qlora
|
| 38 |
+
lora_r: 16
|
| 39 |
+
lora_alpha: 32
|
| 40 |
+
lora_dropout: 0.05
|
| 41 |
+
lora_target_linear: true
|
| 42 |
+
lora_target_modules:
|
| 43 |
+
- q_proj
|
| 44 |
+
- v_proj
|
| 45 |
+
- k_proj
|
| 46 |
+
- o_proj
|
| 47 |
+
- gate_proj
|
| 48 |
+
- up_proj
|
| 49 |
+
- down_proj
|
| 50 |
+
|
| 51 |
+
# Dataset Configuration
|
| 52 |
+
datasets:
|
| 53 |
+
- path: /workspace/datasets/spring-boot-dataset.jsonl
|
| 54 |
+
type: alpaca
|
| 55 |
+
- path: /workspace/datasets/react-dataset.jsonl
|
| 56 |
+
type: alpaca
|
| 57 |
+
|
| 58 |
+
dataset_prepared_path: /workspace/prepared_data
|
| 59 |
+
val_set_size: 0.05
|
| 60 |
+
output_dir: /workspace/outputs/bridge-cli
|
| 61 |
+
|
| 62 |
+
# Training Parameters
|
| 63 |
+
sequence_len: 2048
|
| 64 |
+
sample_packing: true
|
| 65 |
+
pad_to_sequence_len: true
|
| 66 |
+
|
| 67 |
+
micro_batch_size: 4
|
| 68 |
+
gradient_accumulation_steps: 4
|
| 69 |
+
num_epochs: 3
|
| 70 |
+
learning_rate: 0.0002
|
| 71 |
+
lr_scheduler: cosine
|
| 72 |
+
warmup_ratio: 0.03
|
| 73 |
+
optimizer: adamw_8bit
|
| 74 |
+
|
| 75 |
+
# Memory Optimization
|
| 76 |
+
gradient_checkpointing: true
|
| 77 |
+
flash_attention: false
|
| 78 |
+
bf16: auto
|
| 79 |
+
tf32: false
|
| 80 |
+
|
| 81 |
+
# Training Settings
|
| 82 |
+
train_on_inputs: false
|
| 83 |
+
group_by_length: false
|
| 84 |
+
logging_steps: 10
|
| 85 |
+
save_strategy: steps
|
| 86 |
+
save_steps: 100
|
| 87 |
+
eval_steps: 100
|
| 88 |
+
|
| 89 |
+
# Weights & Biases (optional - remove if not using)
|
| 90 |
+
# wandb_project: bridge-cli
|
| 91 |
+
# wandb_run_id: spring-boot-finetune
|
| 92 |
+
|
| 93 |
+
# Early stopping
|
| 94 |
+
early_stopping_patience: 3
|
| 95 |
+
|
| 96 |
+
# For debugging - set to true to test config
|
| 97 |
+
debug: false
|
| 98 |
+
|
| 99 |
+
# Special tokens
|
| 100 |
+
special_tokens:
|
| 101 |
+
pad_token: "<|pad|>"
|
| 102 |
+
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
</details><br>
|
| 106 |
+
|
| 107 |
+
# Bridge CLI - Fine-tuned Code Generation Model
|
| 108 |
+
|
| 109 |
+
This model is a fine-tuned version of [deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct) on custom Java/Spring Boot (2,307 examples) and React/TypeScript (4,041 examples) datasets in Alpaca instruction format.
|
| 110 |
+
It achieves the following results on the evaluation set:
|
| 111 |
+
- Loss: 0.4579
|
| 112 |
+
- Ppl: 1.5808
|
| 113 |
+
- Memory/max Active (GiB): 6.45
|
| 114 |
+
- Memory/max Allocated (GiB): 6.45
|
| 115 |
+
- Memory/device Reserved (GiB): 10.47
|
| 116 |
+
|
| 117 |
+
## Model description
|
| 118 |
+
|
| 119 |
+
More information needed
|
| 120 |
+
|
| 121 |
+
## Intended uses & limitations
|
| 122 |
+
|
| 123 |
+
More information needed
|
| 124 |
+
|
| 125 |
+
## Training and evaluation data
|
| 126 |
+
|
| 127 |
+
More information needed
|
| 128 |
+
|
| 129 |
+
## Training procedure
|
| 130 |
+
|
| 131 |
+
### Training hyperparameters
|
| 132 |
+
|
| 133 |
+
The following hyperparameters were used during training:
|
| 134 |
+
- learning_rate: 0.0002
|
| 135 |
+
- train_batch_size: 4
|
| 136 |
+
- eval_batch_size: 4
|
| 137 |
+
- seed: 42
|
| 138 |
+
- gradient_accumulation_steps: 4
|
| 139 |
+
- total_train_batch_size: 16
|
| 140 |
+
- optimizer: AdamW 8-bit (OptimizerNames.ADAMW_8BIT) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
|
| 141 |
+
- lr_scheduler_type: cosine
|
| 142 |
+
- lr_scheduler_warmup_steps: 6
|
| 143 |
+
- training_steps: 222
|
| 144 |
+
|
| 145 |
+
### Training results
|
| 146 |
+
|
| 147 |
+
| Training Loss | Epoch | Step | Validation Loss | Ppl | Active (GiB) | Allocated (GiB) | Reserved (GiB) |
|
| 148 |
+
|:-------------:|:------:|:----:|:---------------:|:-------:|:------------:|:---------------:|:--------------:|
|
| 149 |
+
| No log | 0 | 0 | 3.3726 | 29.1542 | 6.36 | 6.36 | 12.58 |
|
| 150 |
+
| 0.4787 | 1.3356 | 100 | 0.5115 | 1.6679 | 6.45 | 6.45 | 10.47 |
|
| 151 |
+
| 0.4037 | 2.6711 | 200 | 0.4579 | 1.5808 | 6.45 | 6.45 | 10.47 |
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
### Framework versions
|
| 155 |
+
|
| 156 |
+
- PEFT 0.18.1
|
| 157 |
+
- Transformers 5.3.0
|
| 158 |
+
- Pytorch 2.10.0+cu128
|
| 159 |
+
- Datasets 4.5.0
|
| 160 |
+
- Tokenizers 0.22.2
|
adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "deepseek-ai/deepseek-coder-6.7b-instruct",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": null,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 32,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 16,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"up_proj",
|
| 34 |
+
"o_proj",
|
| 35 |
+
"k_proj",
|
| 36 |
+
"down_proj",
|
| 37 |
+
"v_proj",
|
| 38 |
+
"gate_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": [],
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae7c90f5adffd21f676487c3ed53eaa4eb29d63e63a69b489a4891091c9fffdd
|
| 3 |
+
size 159967880
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% if not add_generation_prompt is defined %}
|
| 2 |
+
{% set add_generation_prompt = false %}
|
| 3 |
+
{% endif %}
|
| 4 |
+
{%- set ns = namespace(found=false) -%}
|
| 5 |
+
{%- for message in messages -%}
|
| 6 |
+
{%- if message['role'] == 'system' -%}
|
| 7 |
+
{%- set ns.found = true -%}
|
| 8 |
+
{%- endif -%}
|
| 9 |
+
{%- endfor -%}
|
| 10 |
+
{{bos_token}}{%- if not ns.found -%}
|
| 11 |
+
{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}
|
| 12 |
+
{%- endif %}
|
| 13 |
+
{%- for message in messages %}
|
| 14 |
+
{%- if message['role'] == 'system' %}
|
| 15 |
+
{{ message['content'] }}
|
| 16 |
+
{%- else %}
|
| 17 |
+
{%- if message['role'] == 'user' %}
|
| 18 |
+
{{'### Instruction:\n' + message['content'] + '\n'}}
|
| 19 |
+
{%- else %}
|
| 20 |
+
{{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}
|
| 21 |
+
{%- endif %}
|
| 22 |
+
{%- endif %}
|
| 23 |
+
{%- endfor %}
|
| 24 |
+
{% if add_generation_prompt %}
|
| 25 |
+
{{'### Response:'}}
|
| 26 |
+
{% endif %}
|
config.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 32013,
|
| 8 |
+
"dtype": "bfloat16",
|
| 9 |
+
"eos_token_id": 32021,
|
| 10 |
+
"head_dim": 128,
|
| 11 |
+
"hidden_act": "silu",
|
| 12 |
+
"hidden_size": 4096,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 11008,
|
| 15 |
+
"max_position_embeddings": 16384,
|
| 16 |
+
"mlp_bias": false,
|
| 17 |
+
"model_type": "llama",
|
| 18 |
+
"num_attention_heads": 32,
|
| 19 |
+
"num_hidden_layers": 32,
|
| 20 |
+
"num_key_value_heads": 32,
|
| 21 |
+
"pad_token_id": null,
|
| 22 |
+
"pretraining_tp": 1,
|
| 23 |
+
"quantization_config": {
|
| 24 |
+
"_load_in_4bit": true,
|
| 25 |
+
"_load_in_8bit": false,
|
| 26 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 27 |
+
"bnb_4bit_quant_storage": "bfloat16",
|
| 28 |
+
"bnb_4bit_quant_type": "nf4",
|
| 29 |
+
"bnb_4bit_use_double_quant": true,
|
| 30 |
+
"llm_int8_enable_fp32_cpu_offload": false,
|
| 31 |
+
"llm_int8_has_fp16_weight": false,
|
| 32 |
+
"llm_int8_skip_modules": null,
|
| 33 |
+
"llm_int8_threshold": 6.0,
|
| 34 |
+
"load_in_4bit": true,
|
| 35 |
+
"load_in_8bit": false,
|
| 36 |
+
"quant_method": "bitsandbytes"
|
| 37 |
+
},
|
| 38 |
+
"rms_norm_eps": 1e-06,
|
| 39 |
+
"rope_parameters": {
|
| 40 |
+
"factor": 4.0,
|
| 41 |
+
"rope_theta": 100000,
|
| 42 |
+
"rope_type": "linear",
|
| 43 |
+
"type": "linear"
|
| 44 |
+
},
|
| 45 |
+
"tie_word_embeddings": false,
|
| 46 |
+
"transformers_version": "5.3.0",
|
| 47 |
+
"use_cache": false,
|
| 48 |
+
"vocab_size": 32256
|
| 49 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": null,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": "<|begin▁of▁sentence|>",
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|EOT|>",
|
| 7 |
+
"is_local": false,
|
| 8 |
+
"model_max_length": 16384,
|
| 9 |
+
"pad_token": "<|pad|>",
|
| 10 |
+
"sp_model_kwargs": {},
|
| 11 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 12 |
+
"unk_token": null,
|
| 13 |
+
"use_default_system_prompt": false
|
| 14 |
+
}
|