likhonsheikh committed · verified
Commit d4cfbf8 · 1 parent: f0e44a7

Add training_config.yaml

Files changed (1)
  1. training_config.yaml  +132 -0
training_config.yaml ADDED
@@ -0,0 +1,132 @@
+ # Training Configuration for Sheikh-2.5-Coder
+ # This file contains the training hyperparameters and settings
+
+ # Model Architecture
+ model:
+   name: "Sheikh-2.5-Coder"
+   num_parameters: 3.09e9
+   context_length: 32768
+   vocab_size: 50257
+   hidden_size: 3072
+   num_attention_heads: 16
+   num_key_value_heads: 2
+   num_hidden_layers: 36
+   intermediate_size: 8192
+   activation: "swiglu"
+   layer_norm_epsilon: 1e-6
+   max_position_embeddings: 32768
+
+ # Training Data
+ data:
+   total_tokens: 5.5e12
+   sources:
+     - name: "the-stack-v2"
+       description: "Diverse programming language dataset"
+       percentage: 40
+     - name: "github-code"
+       description: "High-quality GitHub repositories"
+       percentage: 25
+     - name: "synthetic-code-data"
+       description: "AI-generated code examples"
+       percentage: 20
+     - name: "natural-language"
+       description: "Code documentation and comments"
+       percentage: 15
+
+ # Training Hyperparameters
+ training:
+   # Optimization
+   learning_rate: 1.0e-4
+   weight_decay: 0.01
+   beta1: 0.9
+   beta2: 0.95
+   eps: 1.0e-8
+
+   # Training Schedule
+   warmup_steps: 2000
+   max_steps: 100000
+   train_batch_size: 64
+   gradient_accumulation_steps: 4
+   max_grad_norm: 1.0
+
+   # Mixed Precision (bf16 is native on A100; fp16 and bf16 are mutually exclusive)
+   fp16: false
+   bf16: true
+   tf32: true
+
+   # Regularization
+   dropout: 0.1
+   attention_dropout: 0.1
+
+   # Evaluation
+   eval_steps: 1000
+   save_steps: 2000
+   logging_steps: 100
+
+ # Instruction Tuning
+ instruction_tuning:
+   enabled: true
+   data_sources:
+     - "code-instruct"
+     - "multi-turn-conversations"
+     - "programming-help"
+   learning_rate: 5.0e-6
+   train_batch_size: 16
+   max_sequence_length: 32768
+
+ # Efficiency Optimizations
+ efficiency:
+   flash_attention: true
+   gradient_checkpointing: true
+   deepspeed: false
+   fsdp: false
+   use_cache: true
+   rope_scaling:
+     type: "linear"
+     factor: 8.0
+
+ # Hardware Configuration
+ hardware:
+   gpus: 8
+   gpu_type: "A100"
+   gpu_memory: "80GB"
+   host_memory: "1TB"
+   network: "infiniband"
+
+ # Checkpointing
+ checkpointing:
+   save_total_limit: 3
+   load_best_model_at_end: true
+   metric_for_best_model: "loss"
+   greater_is_better: false
+
+ # Distributed Training
+ distributed:
+   world_size: 8
+   rank: 0
+   master_addr: "localhost"
+   master_port: 12355
+
+ # Logging and Monitoring
+ logging:
+   wandb:
+     enabled: true
+     project: "sheikh-2.5-coder"
+   tensorboard:
+     enabled: true
+     log_dir: "./logs"
+   mlflow:
+     enabled: false
+
+ # Evaluation Metrics
+ evaluation:
+   benchmarks:
+     - name: "HumanEval"
+       evaluation_steps: 1000
+       batch_size: 10
+     - name: "MBPP"
+       evaluation_steps: 1000
+       batch_size: 10
+     - name: "MultiPL-E"
+       evaluation_steps: 2000
+       batch_size: 5
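
A quick way to sanity-check these settings is to load the file and derive the quantities it implies. The sketch below is illustrative only, not the repository's training code: the script name, the use of PyYAML, and the placeholder torch.nn.Linear module standing in for the real model are all assumptions. Note that PyYAML's YAML 1.1 resolver loads plain scalars such as 1e-6 or 3.09e9 (no dot, or an unsigned exponent) as strings rather than floats, so a real consumer may need to coerce those; every key used below parses as a number.

# derive_config_stats.py -- illustrative sketch; name and usage are assumptions
import yaml
import torch

with open("training_config.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["training"]

# Effective global batch: per-device batch * gradient accumulation * data-parallel ranks
global_batch = (train["train_batch_size"]
                * train["gradient_accumulation_steps"]
                * cfg["distributed"]["world_size"])
print(f"global batch: {global_batch} sequences per optimizer step")

# Upper bound on tokens per optimizer step, assuming sequences packed to the full context
tokens_per_step = global_batch * cfg["model"]["context_length"]
print(f"tokens per optimizer step (upper bound): {tokens_per_step:,}")

# Linear RoPE scaling divides position indices by `factor`, so a factor of 8.0 over a
# 32768-token window corresponds to a 4096-token pre-scaling context.
base_context = cfg["model"]["max_position_embeddings"] // int(cfg["efficiency"]["rope_scaling"]["factor"])
print(f"implied pre-scaling context: {base_context} tokens")

# Map the optimizer hyperparameters onto AdamW; the Linear module is only a stand-in,
# since this config does not define the model itself.
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=train["learning_rate"],
    betas=(train["beta1"], train["beta2"]),
    eps=train["eps"],
    weight_decay=train["weight_decay"],
)

Run against the file above, this reports a global batch of 2048 sequences per optimizer step (64 x 4 x 8) and an implied pre-scaling context of 4096 tokens (32768 / 8).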