mjschock committed
Commit e68039e · unverified · 1 Parent(s): 7461cd0

Refactor the unsloth_SmolLM2 notebook's training setup and output handling. Raise the LoRA rank from 16 to 64 with lora_alpha = 128 (2*r) and add lora_dropout = 0.1; increase gradient accumulation from 4 to 8 steps, extend warmup from 5 to 50 steps, lower the learning rate from 2e-4 to 5e-5, switch to a cosine schedule, and enable gradient checkpointing. Renumber execution counts for consistency, streamline output messages, and remove the redundant widget display_data outputs to clean up the notebook structure.
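For quick reference, the two affected cells boil down to the configuration below. This is a condensed sketch, not the full cells: it assumes the notebook's surrounding setup (the 4-bit SmolLM2 load via `FastLanguageModel.from_pretrained` and the TRL `SFTTrainer` wiring) stays as-is, and shows only the parameters touched by the diff that follows.

```python
# Condensed sketch of the post-commit configuration; `model` is assumed to be
# the 4-bit SmolLM2-135M-Instruct model already loaded earlier in the notebook.
from transformers import TrainingArguments
from unsloth import FastLanguageModel, is_bfloat16_supported

model = FastLanguageModel.get_peft_model(
    model,
    r=64,                                  # LoRA rank, raised from 16
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=128,                        # kept at 2*r
    lora_dropout=0.1,                      # was 0, the Unsloth-optimized value
    bias="none",
    use_gradient_checkpointing="unsloth",
)

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,         # was 4; effective batch size is now 16
    warmup_steps=50,                       # was 5
    max_steps=60,
    learning_rate=5e-5,                    # was 2e-4
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",            # was "linear"
    seed=3407,
    output_dir="outputs",
    gradient_checkpointing=True,           # newly added
)
```

Note that a nonzero lora_dropout takes the layers off Unsloth's fully optimized patching path; the new stderr output captured in the diff below warns about exactly this.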

notebooks/unsloth_SmolLM2-135M-Instruct-bnb-4bit_xingyaoww_code-act.ipynb CHANGED
@@ -152,118 +152,6 @@
 " \"-____-\" Free license: http://github.com/unslothai/unsloth\n",
 "Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!\n"
 ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "995593967f9841cc93c09e6ce0f60909",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "model.safetensors: 0%| | 0.00/112M [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "0cd3b1d5d23540429ae5bc3264ac3f54",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "generation_config.json: 0%| | 0.00/158 [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "848fbd96ade24e15bfc90fdf071efc11",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "tokenizer_config.json: 0%| | 0.00/3.96k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "137e28acb2b648e4a8e02e7380e3911c",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "vocab.json: 0%| | 0.00/801k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "787f11d5fa5c4896ba343661feb25007",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "merges.txt: 0%| | 0.00/466k [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "53395d55c2194eaa8c52c6caa9096114",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "added_tokens.json: 0%| | 0.00/29.0 [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "aa15988ef4d14900b1b3c0453ef9bff5",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "special_tokens_map.json: 0%| | 0.00/423 [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "8441305aa0a84352b80dc23bc5626789",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "tokenizer.json: 0%| | 0.00/3.52M [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
 }
 ],
 "source": [
@@ -319,18 +207,20 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
- "Unsloth 2025.4.3 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.\n"
+ "Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.\n",
+ "Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.\n",
+ "Unsloth 2025.4.3 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.\n"
 ]
 }
 ],
 "source": [
 "model = FastLanguageModel.get_peft_model(\n",
 " model,\n",
- " r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n",
+ " r=64, # Increased from 16\n",
 " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
 " \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
- " lora_alpha = 16,\n",
- " lora_dropout = 0, # Supports any, but = 0 is optimized\n",
+ " lora_alpha=128, # 2*r\n",
+ " lora_dropout=0.1, # Added dropout\n",
 " bias = \"none\", # Supports any, but = \"none\" is optimized\n",
 " # [NEW] \"unsloth\" uses 30% less VRAM, fits 2x larger batch sizes!\n",
 " use_gradient_checkpointing = \"unsloth\", # True or \"unsloth\" for very long context\n",
@@ -372,7 +262,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 5,
+ "execution_count": 4,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/",
@@ -429,18 +319,11 @@
 },
 "outputs": [
 {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "1fa85cf9abd44c75b1263cbeea89c683",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Map: 0%| | 0/7139 [00:00<?, ? examples/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Unsloth: Will map <|im_end|> to EOS = <|im_end|>.\n"
+ ]
 }
 ],
 "source": [
@@ -476,7 +359,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 5,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
@@ -508,7 +391,7 @@
 " 'role': 'assistant'}]"
 ]
 },
- "execution_count": 6,
+ "execution_count": 5,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -519,7 +402,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
@@ -680,7 +563,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
 "metadata": {
 "id": "p31Z-S6FUieB"
 },
@@ -724,7 +607,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/",
@@ -753,20 +636,6 @@
 "text": [
 "Unsloth: We found double BOS tokens - we shall remove one automatically.\n"
 ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "848b52808244439c8276a96cca7ec482",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Unsloth: Tokenizing [\"text\"] (num_proc=2): 0%| | 0/7139 [00:00<?, ? examples/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
 }
 ],
 "source": [
@@ -784,25 +653,26 @@
 " packing = False, # Can make training 5x faster for short sequences.\n",
 " args = TrainingArguments(\n",
 " per_device_train_batch_size = 2,\n",
- " gradient_accumulation_steps = 4,\n",
- " warmup_steps = 5,\n",
+ " gradient_accumulation_steps=8, # Increased from 4\n",
+ " warmup_steps=50,\n",
 " max_steps = 60,\n",
- " learning_rate = 2e-4,\n",
+ " learning_rate=5e-5, # Reduced from 2e-4\n",
 " fp16 = not is_bfloat16_supported(),\n",
 " bf16 = is_bfloat16_supported(),\n",
 " logging_steps = 1,\n",
 " optim = \"adamw_8bit\",\n",
 " weight_decay = 0.01,\n",
- " lr_scheduler_type = \"linear\",\n",
+ " lr_scheduler_type=\"cosine\", # Changed to cosine schedule\n",
 " seed = 3407,\n",
 " output_dir = \"outputs\",\n",
+ " gradient_checkpointing=True, # Added gradient checkpointing\n",
 " ),\n",
 ")"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
 "metadata": {
 "cellView": "form",
 "colab": {
@@ -817,7 +687,7 @@
 "output_type": "stream",
 "text": [
 "GPU = NVIDIA GeForce GTX 1050 Ti. Max memory = 3.94 GB.\n",
- "0.211 GB of memory reserved.\n"
+ "0.225 GB of memory reserved.\n"
 ]
 }
 ],
@@ -832,7 +702,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/",
@@ -848,9 +718,9 @@
 "text": [
 "==((====))== Unsloth - 2x faster free finetuning | Num GPUs used = 1\n",
 " \\\\ /| Num examples = 7,139 | Num Epochs = 1 | Total steps = 60\n",
- "O^O/ \\_/ \\ Batch size per device = 2 | Gradient accumulation steps = 4\n",
- "\\ / Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8\n",
- " \"-____-\" Trainable parameters = 4,884,480/4,000,000,000 (0.12% trained)\n",
+ "O^O/ \\_/ \\ Batch size per device = 2 | Gradient accumulation steps = 8\n",
+ "\\ / Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16\n",
+ " \"-____-\" Trainable parameters = 19,537,920/4,000,000,000 (0.49% trained)\n",
 "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.\n",
 "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mmjschock\u001b[0m to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
 ]
@@ -870,7 +740,7 @@
 {
 "data": {
 "text/html": [
- "Run data is saved locally in <code>/home/mjschock/Projects/hf-agents-course/agents/Final_Assignment_Template/wandb/run-20250430_100456-dnxza16p</code>"
+ "Run data is saved locally in <code>/home/mjschock/Projects/hf-agents-course/agents/Final_Assignment_Template/notebooks/wandb/run-20250430_141858-0ph0gbln</code>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -882,7 +752,7 @@
 {
 "data": {
 "text/html": [
- "Syncing run <strong><a href='https://wandb.ai/mjschock/huggingface/runs/dnxza16p' target=\"_blank\">outputs</a></strong> to <a href='https://wandb.ai/mjschock/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
+ "Syncing run <strong><a href='https://wandb.ai/mjschock/huggingface/runs/0ph0gbln' target=\"_blank\">outputs</a></strong> to <a href='https://wandb.ai/mjschock/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -906,7 +776,7 @@
 {
 "data": {
 "text/html": [
- " View run at <a href='https://wandb.ai/mjschock/huggingface/runs/dnxza16p' target=\"_blank\">https://wandb.ai/mjschock/huggingface/runs/dnxza16p</a>"
+ " View run at <a href='https://wandb.ai/mjschock/huggingface/runs/0ph0gbln' target=\"_blank\">https://wandb.ai/mjschock/huggingface/runs/0ph0gbln</a>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -929,7 +799,7 @@
 " <div>\n",
 " \n",
 " <progress value='60' max='60' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
- " [60/60 1:29:55, Epoch 0/1]\n",
+ " [60/60 3:06:36, Epoch 0/1]\n",
 " </div>\n",
 " <table border=\"1\" class=\"dataframe\">\n",
 " <thead>\n",
@@ -941,243 +811,243 @@
 " <tbody>\n",
 " <tr>\n",
 " <td>1</td>\n",
- " <td>1.747000</td>\n",
+ " <td>1.661700</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>2</td>\n",
- " <td>1.573700</td>\n",
+ " <td>1.693100</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>3</td>\n",
- " <td>1.813500</td>\n",
+ " <td>1.747500</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>4</td>\n",
- " <td>1.572600</td>\n",
+ " <td>1.862600</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>5</td>\n",
- " <td>1.674200</td>\n",
+ " <td>1.704500</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>6</td>\n",
- " <td>1.782500</td>\n",
+ " <td>1.727900</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>7</td>\n",
- " <td>1.794000</td>\n",
+ " <td>1.869500</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>8</td>\n",
- " <td>1.857400</td>\n",
+ " <td>1.570300</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>9</td>\n",
- " <td>1.649000</td>\n",
+ " <td>1.910300</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>10</td>\n",
- " <td>1.611200</td>\n",
+ " <td>1.944300</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>11</td>\n",
- " <td>1.778100</td>\n",
+ " <td>1.830000</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>12</td>\n",
- " <td>1.486100</td>\n",
+ " <td>1.715700</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>13</td>\n",
- " <td>1.518400</td>\n",
+ " <td>1.647800</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>14</td>\n",
- " <td>1.964800</td>\n",
+ " <td>1.842400</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>15</td>\n",
- " <td>1.300400</td>\n",
+ " <td>1.741600</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>16</td>\n",
- " <td>1.650300</td>\n",
+ " <td>1.753800</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>17</td>\n",
- " <td>1.887100</td>\n",
+ " <td>1.703500</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>18</td>\n",
- " <td>1.636800</td>\n",
+ " <td>1.700800</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>19</td>\n",
- " <td>1.870800</td>\n",
+ " <td>1.955300</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>20</td>\n",
- " <td>1.711000</td>\n",
+ " <td>1.645000</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>21</td>\n",
- " <td>1.543400</td>\n",
+ " <td>1.663200</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>22</td>\n",
- " <td>1.788600</td>\n",
+ " <td>1.637100</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>23</td>\n",
- " <td>1.570100</td>\n",
+ " <td>1.586000</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>24</td>\n",
- " <td>1.422700</td>\n",
+ " <td>1.813300</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>25</td>\n",
- " <td>1.344900</td>\n",
+ " <td>1.541500</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>26</td>\n",
- " <td>1.586200</td>\n",
+ " <td>1.658000</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>27</td>\n",
- " <td>1.633000</td>\n",
+ " <td>1.666100</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>28</td>\n",
- " <td>1.574100</td>\n",
+ " <td>1.659300</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>29</td>\n",
- " <td>1.420200</td>\n",
+ " <td>1.639000</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>30</td>\n",
- " <td>1.551000</td>\n",
+ " <td>1.797200</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>31</td>\n",
- " <td>1.353400</td>\n",
+ " <td>1.498100</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>32</td>\n",
- " <td>1.596900</td>\n",
+ " <td>1.596800</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>33</td>\n",
- " <td>1.268700</td>\n",
+ " <td>1.608000</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>34</td>\n",
- " <td>1.553900</td>\n",
+ " <td>1.608700</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>35</td>\n",
- " <td>1.484200</td>\n",
+ " <td>1.724600</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>36</td>\n",
- " <td>1.354600</td>\n",
+ " <td>1.498700</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>37</td>\n",
- " <td>1.933200</td>\n",
+ " <td>1.453600</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>38</td>\n",
- " <td>1.448200</td>\n",
+ " <td>1.493500</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>39</td>\n",
- " <td>1.213000</td>\n",
+ " <td>1.711300</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>40</td>\n",
- " <td>1.571000</td>\n",
+ " <td>1.723000</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>41</td>\n",
- " <td>1.460700</td>\n",
+ " <td>1.440200</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>42</td>\n",
- " <td>1.341500</td>\n",
+ " <td>1.628000</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>43</td>\n",
- " <td>1.299500</td>\n",
+ " <td>1.435800</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>44</td>\n",
- " <td>1.368200</td>\n",
+ " <td>1.348700</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>45</td>\n",
- " <td>1.402200</td>\n",
+ " <td>1.340700</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>46</td>\n",
- " <td>1.249300</td>\n",
+ " <td>1.428200</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>47</td>\n",
- " <td>1.710000</td>\n",
+ " <td>1.257900</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>48</td>\n",
- " <td>1.330300</td>\n",
+ " <td>1.462600</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>49</td>\n",
- " <td>1.339600</td>\n",
+ " <td>1.520300</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>50</td>\n",
- " <td>1.116900</td>\n",
+ " <td>1.403800</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>51</td>\n",
- " <td>1.670000</td>\n",
+ " <td>1.472400</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>52</td>\n",
- " <td>1.154900</td>\n",
+ " <td>1.245900</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>53</td>\n",
- " <td>1.417100</td>\n",
+ " <td>1.113200</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>54</td>\n",
- " <td>1.308600</td>\n",
+ " <td>1.338300</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>55</td>\n",
- " <td>1.266300</td>\n",
+ " <td>1.375600</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>56</td>\n",
- " <td>1.396200</td>\n",
+ " <td>1.158800</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>57</td>\n",
- " <td>1.500100</td>\n",
+ " <td>1.208400</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>58</td>\n",
- " <td>1.264800</td>\n",
+ " <td>1.160900</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>59</td>\n",
- " <td>1.424700</td>\n",
+ " <td>1.203100</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>60</td>\n",
- " <td>1.673300</td>\n",
+ " <td>1.167000</td>\n",
 " </tr>\n",
 " </tbody>\n",
 "</table><p>"
@@ -1196,7 +1066,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 12,
+ "execution_count": 11,
 "metadata": {
 "cellView": "form",
 "colab": {
@@ -1210,12 +1080,12 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
- "5510.3346 seconds used for training.\n",
- "91.84 minutes used for training.\n",
- "Peak reserved memory = 2.182 GB.\n",
- "Peak reserved memory for training = 1.971 GB.\n",
- "Peak reserved memory % of max memory = 55.381 %.\n",
- "Peak reserved memory for training % of max memory = 50.025 %.\n"
+ "11430.7159 seconds used for training.\n",
+ "190.51 minutes used for training.\n",
+ "Peak reserved memory = 2.342 GB.\n",
+ "Peak reserved memory for training = 2.117 GB.\n",
+ "Peak reserved memory % of max memory = 59.442 %.\n",
+ "Peak reserved memory for training % of max memory = 53.731 %.\n"
 ]
 }
 ],
@@ -1246,7 +1116,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 13,
+ "execution_count": 12,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
@@ -1265,10 +1135,10 @@
 {
 "data": {
 "text/plain": [
- "['<|im_start|>user\\nContinue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>\\n<|im_start|>assistant\\nThe Fibonacci sequence is a series of numbers where each number is the sum of the two preceding ones, starting from 1 and 1. To find the first few Fibonacci numbers, we can use a simple iterative approach.\\n\\nWe begin with the first two Fibonacci numbers: 1 and 1.']"
+ "['<|im_start|>user\\nContinue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>\\n<|im_start|>assistant\\nThe Fibonacci sequence is a series of numbers where each number is the sum of the two preceding ones, starting from 1 and 1. To find the first few Fibonacci numbers, we can use a simple iterative approach.\\n\\nWe begin by assuming the first Fibonacci number is 0. Then, we']"
 ]
 },
- "execution_count": 13,
+ "execution_count": 12,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -1310,7 +1180,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 14,
+ "execution_count": 13,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
@@ -1328,9 +1198,7 @@
 "<|im_start|>assistant\n",
 "The Fibonacci sequence is a series of numbers where each number is the sum of the two preceding ones, starting from 1 and 1. To find the first few Fibonacci numbers, we can use a simple iterative approach.\n",
 "\n",
- "We begin with the first two Fibonacci numbers: 1 and 1. Then, we add the two preceding numbers to get the next Fibonacci number: 2 + 1 = 3, 3 + 2 = 5, 5 + 3 = 8, and so on.\n",
- "\n",
- "Next, we add the two preceding numbers to get the next Fibonacci number\n"
+ "We begin by assuming the first Fibonacci number is 0. Then, we multiply the current Fibonacci number by 1 and add it to the previous one. This results in a new Fibonacci number of 1 + 0 = 1. Next, we multiply the current Fibonacci number by 2 and add it to the previous one. This results in a new Fibonacci number of\n"
 ]
 }
 ],
@@ -1367,7 +1235,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 15,
+ "execution_count": 14,
 "metadata": {
 "id": "upcOlWe7A1vc"
 },
@@ -1383,7 +1251,7 @@
 " 'lora_model/tokenizer.json')"
 ]
 },
- "execution_count": 15,
+ "execution_count": 14,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -1406,7 +1274,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 16,
+ "execution_count": 15,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
@@ -1422,7 +1290,9 @@
 "<|im_start|>user\n",
 "What is a famous tall tower in Paris?<|im_end|>\n",
 "<|im_start|>assistant\n",
- "The famous Tall Tower in Paris is the Arc de Triomphe, a 13th-century iron arch built in 1865 to commemorate the 100th anniversary of the French Revolution. It is a symbol of the city's history and culture, and its presence is a testament to the enduring legacy of the French people. The Arc de Triomphe is a 130 meters tall iron arch built in 1865, constructed by the French arch-masons Gustave Eiffel and Gustave Eiffel. It is located in the Latin Quarter of Paris, and its height is \n"
+ "The famous Tall Tower in Paris is the Arc de Triomphe, a monumental arch built in 1789 by Napoleon Bonaparte to commemorate the 18th birthday of his wife, Marie Antoinette. It is a symbol of the French monarchy and a symbol of the French Empire. The Arc de Triomphe is 140 meters tall and 150 meters wide, with a height of 150 meters. It is located in the Latin Quarter of Paris, a historic neighborhood that has been home to the French monarchy since the 18th century.\n",
+ "\n",
+ "The Arc de Triomphe\n"
 ]
 }
 ],
@@ -1463,7 +1333,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 17,
+ "execution_count": 16,
 "metadata": {
 "id": "yFfaXG0WsQuE"
 },
@@ -1493,7 +1363,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 18,
+ "execution_count": 17,
 "metadata": {
 "id": "iHjt_SMYsd3P"
 },
@@ -1529,7 +1399,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 19,
+ "execution_count": 18,
 "metadata": {
 "id": "FqfebeAdT073"
 },