elements-7b-teacher / nohup.out
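The command that produced this log is not recorded in the output itself. Assuming the stock llama.cpp finetune example (which matches the print_params / train_opt_callback lines below), an invocation along these lines would reproduce it; the training-data filename and thread count here are placeholders, not taken from the log:

$ nohup ./finetune \
    --model-base llama-2-7b-chat-q5_k_m.gguf \
    --train-data train.txt \
    --checkpoint-in  checkpoint-LATEST.gguf \
    --checkpoint-out checkpoint-ITERATION.gguf \
    --lora-out lora.bin \
    --ctx 128 --save-every 10 \
    --threads 8 &

finetune substitutes the iteration number (and LATEST) for the literal string ITERATION in the checkpoint filename, which is consistent with the checkpoint-10.gguf / checkpoint-LATEST.gguf saves below. The LoRA ranks printed further down (4 for the weight matrices, 1 for the norms) are the finetune defaults.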
ggml_init_cublas: found 1 CUDA devices:
Device 0: NVIDIA A10G, compute capability 8.6
llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from llama-2-7b-chat-q5_k_m.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor 0: token_embd.weight q5_K [ 4096, 32000, 1, 1 ]
llama_model_loader: - tensor 1: output_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 2: output.weight q6_K [ 4096, 32000, 1, 1 ]
llama_model_loader: - tensor 3: blk.0.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 4: blk.0.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 5: blk.0.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 6: blk.0.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 7: blk.0.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 8: blk.0.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 9: blk.0.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 10: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 11: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 12: blk.1.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 13: blk.1.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 14: blk.1.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 15: blk.1.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 16: blk.1.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 17: blk.1.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 18: blk.1.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 19: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 20: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 21: blk.2.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 22: blk.2.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 23: blk.2.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 24: blk.2.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 25: blk.2.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 26: blk.2.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 27: blk.2.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 28: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 29: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 30: blk.3.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 31: blk.3.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 32: blk.3.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 33: blk.3.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 34: blk.3.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 35: blk.3.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 36: blk.3.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 37: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 38: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 39: blk.4.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 40: blk.4.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 41: blk.4.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 42: blk.4.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 43: blk.4.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 44: blk.4.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 45: blk.4.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 46: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 47: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 48: blk.5.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 49: blk.5.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 50: blk.5.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 51: blk.5.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 52: blk.5.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 53: blk.5.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 54: blk.5.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 55: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 56: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 57: blk.6.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 58: blk.6.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 59: blk.6.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 60: blk.6.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 61: blk.6.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 62: blk.6.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 63: blk.6.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 64: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 65: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 66: blk.7.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 67: blk.7.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 68: blk.7.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 69: blk.7.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 70: blk.7.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 71: blk.7.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 72: blk.7.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 73: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 74: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 75: blk.8.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 76: blk.8.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 77: blk.8.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 78: blk.8.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 79: blk.8.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 80: blk.8.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 81: blk.8.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 82: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 83: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 84: blk.9.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 85: blk.9.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 86: blk.9.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 87: blk.9.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 88: blk.9.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 89: blk.9.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 90: blk.9.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 91: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 92: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 93: blk.10.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 94: blk.10.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 95: blk.10.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 96: blk.10.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 97: blk.10.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 98: blk.10.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 99: blk.10.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 100: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 101: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 102: blk.11.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 103: blk.11.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 104: blk.11.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 105: blk.11.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 106: blk.11.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 107: blk.11.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 108: blk.11.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 109: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 110: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 111: blk.12.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 112: blk.12.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 113: blk.12.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 114: blk.12.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 115: blk.12.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 116: blk.12.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 117: blk.12.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 118: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 119: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 120: blk.13.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 121: blk.13.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 122: blk.13.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 123: blk.13.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 124: blk.13.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 125: blk.13.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 126: blk.13.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 127: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 128: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 129: blk.14.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 130: blk.14.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 131: blk.14.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 132: blk.14.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 133: blk.14.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 134: blk.14.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 135: blk.14.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 136: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 137: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 138: blk.15.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 139: blk.15.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 140: blk.15.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 141: blk.15.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 142: blk.15.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 143: blk.15.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 144: blk.15.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 145: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 146: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 147: blk.16.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 148: blk.16.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 149: blk.16.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 150: blk.16.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 151: blk.16.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 152: blk.16.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 153: blk.16.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 154: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 155: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 156: blk.17.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 157: blk.17.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 158: blk.17.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 159: blk.17.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 160: blk.17.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 161: blk.17.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 162: blk.17.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 163: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 164: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 165: blk.18.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 166: blk.18.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 167: blk.18.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 168: blk.18.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 169: blk.18.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 170: blk.18.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 171: blk.18.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 172: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 173: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 174: blk.19.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 175: blk.19.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 176: blk.19.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 177: blk.19.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 178: blk.19.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 179: blk.19.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 180: blk.19.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 181: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 182: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 183: blk.20.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 184: blk.20.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 185: blk.20.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 186: blk.20.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 187: blk.20.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 188: blk.20.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 189: blk.20.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 190: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 191: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 192: blk.21.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 193: blk.21.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 194: blk.21.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 195: blk.21.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 196: blk.21.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 197: blk.21.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 198: blk.21.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 199: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 200: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 201: blk.22.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 202: blk.22.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 203: blk.22.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 204: blk.22.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 205: blk.22.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 206: blk.22.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 207: blk.22.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 208: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 209: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 210: blk.23.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 211: blk.23.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 212: blk.23.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 213: blk.23.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 214: blk.23.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 215: blk.23.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 216: blk.23.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 217: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 218: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 219: blk.24.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 220: blk.24.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 221: blk.24.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 222: blk.24.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 223: blk.24.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 224: blk.24.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 225: blk.24.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 226: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 227: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 228: blk.25.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 229: blk.25.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 230: blk.25.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 231: blk.25.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 232: blk.25.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 233: blk.25.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 234: blk.25.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 235: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 236: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 237: blk.26.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 238: blk.26.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 239: blk.26.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 240: blk.26.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 241: blk.26.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 242: blk.26.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 243: blk.26.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 244: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 245: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 246: blk.27.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 247: blk.27.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 248: blk.27.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 249: blk.27.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 250: blk.27.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 251: blk.27.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 252: blk.27.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 253: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 254: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 255: blk.28.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 256: blk.28.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 257: blk.28.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 258: blk.28.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 259: blk.28.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 260: blk.28.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 261: blk.28.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 262: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 263: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 264: blk.29.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 265: blk.29.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 266: blk.29.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 267: blk.29.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 268: blk.29.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 269: blk.29.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 270: blk.29.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 271: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 272: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 273: blk.30.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 274: blk.30.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 275: blk.30.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 276: blk.30.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 277: blk.30.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 278: blk.30.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 279: blk.30.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 280: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 281: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 282: blk.31.attn_q.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 283: blk.31.attn_k.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 284: blk.31.attn_v.weight q6_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 285: blk.31.attn_output.weight q5_K [ 4096, 4096, 1, 1 ]
llama_model_loader: - tensor 286: blk.31.ffn_gate.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 287: blk.31.ffn_down.weight q6_K [ 11008, 4096, 1, 1 ]
llama_model_loader: - tensor 288: blk.31.ffn_up.weight q5_K [ 4096, 11008, 1, 1 ]
llama_model_loader: - tensor 289: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - tensor 290: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]
llama_model_loader: - kv 0: general.architecture str
llama_model_loader: - kv 1: general.name str
llama_model_loader: - kv 2: llama.context_length u32
llama_model_loader: - kv 3: llama.embedding_length u32
llama_model_loader: - kv 4: llama.block_count u32
llama_model_loader: - kv 5: llama.feed_forward_length u32
llama_model_loader: - kv 6: llama.rope.dimension_count u32
llama_model_loader: - kv 7: llama.attention.head_count u32
llama_model_loader: - kv 8: llama.attention.head_count_kv u32
llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32
llama_model_loader: - kv 10: general.file_type u32
llama_model_loader: - kv 11: tokenizer.ggml.model str
llama_model_loader: - kv 12: tokenizer.ggml.tokens arr
llama_model_loader: - kv 13: tokenizer.ggml.scores arr
llama_model_loader: - kv 14: tokenizer.ggml.token_type arr
llama_model_loader: - kv 15: general.quantization_version u32
llama_model_loader: - type f32: 65 tensors
llama_model_loader: - type q5_K: 193 tensors
llama_model_loader: - type q6_K: 33 tensors
llm_load_print_meta: format = GGUF V2 (latest)
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_embd = 4096
llm_load_print_meta: n_head = 32
llm_load_print_meta: n_head_kv = 32
llm_load_print_meta: n_layer = 32
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: n_ff = 11008
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: model type = 7B
llm_load_print_meta: model ftype = mostly Q5_K - Medium
llm_load_print_meta: model params = 6.74 B
llm_load_print_meta: model size = 4.45 GiB (5.68 BPW)
llm_load_print_meta: general.name = LLaMA v2
llm_load_print_meta: BOS token = 1 '<s>'
llm_load_print_meta: EOS token = 2 '</s>'
llm_load_print_meta: UNK token = 0 '<unk>'
llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.10 MB
llm_load_tensors: using CUDA for GPU acceleration
llm_load_tensors: mem required = 4560.96 MB
llm_load_tensors: offloading 0 repeating layers to GPU
llm_load_tensors: offloaded 0/35 layers to GPU
llm_load_tensors: VRAM used: 0.00 MB
..................................................................................................
llama_new_context_with_model: n_ctx = 512
llama_new_context_with_model: freq_base = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_new_context_with_model: kv self size = 256.00 MB
llama_new_context_with_model: compute buffer total size = 76.63 MB
llama_new_context_with_model: VRAM scratch buffer: 70.50 MB
llama_new_context_with_model: total VRAM used: 70.50 MB (model: 0.00 MB, context: 70.50 MB)
main: seed: 1700014636
main: model base = 'llama-2-7b-chat-q5_k_m.gguf'
main: init model
print_params: n_vocab: 32000
print_params: n_ctx: 128
print_params: n_embd: 4096
print_params: n_ff: 11008
print_params: n_head: 32
print_params: n_head_kv: 32
print_params: n_layer: 32
print_params: norm_rms_eps : 0.000010
print_params: rope_freq_base : 10000.000000
print_params: rope_freq_scale : 1.000000
print_lora_params: n_rank_attention_norm : 1
print_lora_params: n_rank_wq : 4
print_lora_params: n_rank_wk : 4
print_lora_params: n_rank_wv : 4
print_lora_params: n_rank_wo : 4
print_lora_params: n_rank_ffn_norm : 1
print_lora_params: n_rank_w1 : 4
print_lora_params: n_rank_w2 : 4
print_lora_params: n_rank_w3 : 4
print_lora_params: n_rank_tok_embeddings : 4
print_lora_params: n_rank_norm : 1
print_lora_params: n_rank_output : 4
main: total train_iterations 0
main: seen train_samples 0
main: seen train_tokens 0
main: completed train_epochs 0
main: lora_size = 84807904 bytes (80.9 MB)
main: opt_size = 126592864 bytes (120.7 MB)
main: opt iter 0
main: input_size = 131076128 bytes (125.0 MB)
main: compute_size = 14064566880 bytes (13413.0 MB)
main: evaluation order = RIGHT_TO_LEFT
main: tokenize training data
tokenize_file: warning: found 7 samples (max length 222) that exceed context length of 128. samples will be cut off.
tokenize_file: warning: found 185 samples (min length 47) that are shorter than context length of 128.
tokenize_file: total number of samples: 192
main: number of training tokens: 12899
main: number of unique tokens: 963
main: train data seems to have changed. restarting shuffled epoch.
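If the 7 truncated samples matter, the training context can be raised with the --ctx flag (assuming the stock finetune binary, as above), at the cost of a larger compute buffer; this run already reserves ~13.4 GB at --ctx 128 (compute_size above). For example:

$ ./finetune ... --ctx 256 ...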
main: begin training
main: work_size = 1024512 bytes (1.0 MB)
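Each train_opt_callback line below reports the iteration, which of the 192 samples the optimizer is on, the learning-rate schedule multiplier (linear warm-up of 0.01 per iteration up to iteration 100, decaying afterwards), the loss, the wall time per iteration, and an ETA. While the run is going (or afterwards), the (iteration, loss) pairs can be pulled out of this file for plotting with standard tools; a minimal extraction, assuming the log stays in nohup.out:

$ grep '^train_opt_callback: iter=' nohup.out \
    | sed 's/.*iter= *\([0-9]*\).*loss=\([0-9.]*\).*/\1 \2/'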
train_opt_callback: iter= 0 sample=1/192 sched=0.000000 loss=0.000000 |->
train_opt_callback: iter= 1 sample=9/192 sched=0.010000 loss=6.896706 dt=00:33:27 eta=5d 22:10:22 |->
train_opt_callback: iter= 2 sample=17/192 sched=0.020000 loss=6.528623 dt=00:33:36 eta=5d 22:14:41 |----->
train_opt_callback: iter= 3 sample=25/192 sched=0.030000 loss=7.002708 dt=00:33:30 eta=5d 21:15:45 |>
train_opt_callback: iter= 4 sample=33/192 sched=0.040000 loss=6.527895 dt=00:33:36 eta=5d 21:08:35 |----->
train_opt_callback: iter= 5 sample=41/192 sched=0.050000 loss=5.247390 dt=00:33:36 eta=5d 20:36:45 |----------------->
train_opt_callback: iter= 6 sample=49/192 sched=0.060000 loss=5.971481 dt=00:33:37 eta=5d 20:05:20 |---------->
train_opt_callback: iter= 7 sample=57/192 sched=0.070000 loss=5.127095 dt=00:33:43 eta=5d 19:57:54 |------------------->
train_opt_callback: iter= 8 sample=65/192 sched=0.080000 loss=4.606475 dt=00:33:31 eta=5d 18:32:54 |------------------------>
train_opt_callback: iter= 9 sample=73/192 sched=0.090000 loss=4.218550 dt=00:33:36 eta=5d 18:21:27 |---------------------------->
save_checkpoint_lora_file: saving to checkpoint-10.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 10 sample=81/192 sched=0.100000 loss=4.159760 dt=00:33:36 eta=5d 17:45:45 |---------------------------->
train_opt_callback: iter= 11 sample=89/192 sched=0.110000 loss=3.662558 dt=00:33:33 eta=5d 17:03:45 |--------------------------------->
train_opt_callback: iter= 12 sample=97/192 sched=0.120000 loss=3.240861 dt=00:33:37 eta=5d 16:42:38 |-------------------------------------->
train_opt_callback: iter= 13 sample=105/192 sched=0.130000 loss=2.798615 dt=00:33:38 eta=5d 16:16:12 |------------------------------------------>
train_opt_callback: iter= 14 sample=113/192 sched=0.140000 loss=2.537306 dt=00:33:31 eta=5d 15:13:55 |--------------------------------------------->
train_opt_callback: iter= 15 sample=121/192 sched=0.150000 loss=2.133917 dt=00:33:37 eta=5d 15:03:38 |------------------------------------------------->
train_opt_callback: iter= 16 sample=129/192 sched=0.160000 loss=2.062928 dt=00:33:38 eta=5d 14:32:36 |------------------------------------------------->
train_opt_callback: iter= 17 sample=137/192 sched=0.170000 loss=1.867334 dt=00:33:40 eta=5d 14:09:46 |--------------------------------------------------->
train_opt_callback: iter= 18 sample=145/192 sched=0.180000 loss=1.751243 dt=00:33:44 eta=5d 13:50:56 |---------------------------------------------------->
train_opt_callback: iter= 19 sample=153/192 sched=0.190000 loss=1.845504 dt=00:33:39 eta=5d 12:58:13 |---------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-20.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 20 sample=161/192 sched=0.200000 loss=1.504754 dt=00:33:45 eta=5d 12:45:40 |------------------------------------------------------->
train_opt_callback: iter= 21 sample=169/192 sched=0.210000 loss=1.408401 dt=00:33:41 eta=5d 11:58:40 |-------------------------------------------------------->
train_opt_callback: iter= 22 sample=177/192 sched=0.220000 loss=1.236330 dt=00:33:34 eta=5d 10:54:53 |---------------------------------------------------------->
train_opt_callback: iter= 23 sample=185/192 sched=0.230000 loss=1.384089 dt=00:33:27 eta=5d 09:57:41 |-------------------------------------------------------->
train_opt_callback: reshuffle samples. completed epochs: 1
train_opt_callback: iter= 24 sample=1/192 sched=0.240000 loss=0.921234 dt=00:33:30 eta=5d 09:34:36 |------------------------------------------------------------->
train_opt_callback: iter= 25 sample=9/192 sched=0.250000 loss=0.896220 dt=00:33:32 eta=5d 09:08:42 |------------------------------------------------------------->
train_opt_callback: iter= 26 sample=17/192 sched=0.260000 loss=1.035781 dt=00:33:30 eta=5d 08:27:53 |------------------------------------------------------------>
train_opt_callback: iter= 27 sample=25/192 sched=0.270000 loss=0.825575 dt=00:33:35 eta=5d 08:12:54 |-------------------------------------------------------------->
train_opt_callback: iter= 28 sample=33/192 sched=0.280000 loss=0.563456 dt=00:33:45 eta=5d 08:18:26 |---------------------------------------------------------------->
train_opt_callback: iter= 29 sample=41/192 sched=0.290000 loss=0.375454 dt=00:33:42 eta=5d 07:32:46 |------------------------------------------------------------------>
save_checkpoint_lora_file: saving to checkpoint-30.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 30 sample=49/192 sched=0.300000 loss=0.350678 dt=00:33:40 eta=5d 06:51:11 |------------------------------------------------------------------>
train_opt_callback: iter= 31 sample=57/192 sched=0.310000 loss=0.310571 dt=00:33:36 eta=5d 06:02:22 |------------------------------------------------------------------->
train_opt_callback: iter= 32 sample=65/192 sched=0.320000 loss=0.404648 dt=00:33:30 eta=5d 05:07:29 |------------------------------------------------------------------>
train_opt_callback: iter= 33 sample=73/192 sched=0.330000 loss=0.441364 dt=00:33:30 eta=5d 04:33:49 |------------------------------------------------------------------>
train_opt_callback: iter= 34 sample=81/192 sched=0.340000 loss=0.592602 dt=00:33:22 eta=5d 03:30:54 |---------------------------------------------------------------->
train_opt_callback: iter= 35 sample=89/192 sched=0.350000 loss=0.393056 dt=00:33:31 eta=5d 03:30:48 |------------------------------------------------------------------>
train_opt_callback: iter= 36 sample=97/192 sched=0.360000 loss=0.386178 dt=00:33:32 eta=5d 02:58:00 |------------------------------------------------------------------>
train_opt_callback: iter= 37 sample=105/192 sched=0.370000 loss=0.435404 dt=00:33:30 eta=5d 02:17:20 |------------------------------------------------------------------>
train_opt_callback: iter= 38 sample=113/192 sched=0.380000 loss=0.390880 dt=00:33:38 eta=5d 02:12:36 |------------------------------------------------------------------>
train_opt_callback: iter= 39 sample=121/192 sched=0.390000 loss=0.262534 dt=00:33:33 eta=5d 01:22:05 |------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-40.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 40 sample=129/192 sched=0.400000 loss=0.316369 dt=00:33:33 eta=5d 00:47:05 |------------------------------------------------------------------->
train_opt_callback: iter= 41 sample=137/192 sched=0.410000 loss=0.325958 dt=00:33:31 eta=5d 00:09:31 |------------------------------------------------------------------->
train_opt_callback: iter= 42 sample=145/192 sched=0.420000 loss=0.301978 dt=00:33:29 eta=4d 23:25:36 |------------------------------------------------------------------->
train_opt_callback: iter= 43 sample=153/192 sched=0.430000 loss=0.200966 dt=00:33:18 eta=4d 22:13:21 |-------------------------------------------------------------------->
train_opt_callback: iter= 44 sample=161/192 sched=0.440000 loss=0.257932 dt=00:33:05 eta=4d 20:55:58 |------------------------------------------------------------------->
train_opt_callback: iter= 45 sample=169/192 sched=0.450000 loss=0.349884 dt=00:33:10 eta=4d 20:38:49 |------------------------------------------------------------------>
train_opt_callback: iter= 46 sample=177/192 sched=0.460000 loss=0.232009 dt=00:33:11 eta=4d 20:10:15 |-------------------------------------------------------------------->
train_opt_callback: iter= 47 sample=185/192 sched=0.470000 loss=0.351911 dt=00:33:03 eta=4d 19:08:45 |------------------------------------------------------------------>
train_opt_callback: reshuffle samples. completed epochs: 2
train_opt_callback: iter= 48 sample=1/192 sched=0.480000 loss=0.430637 dt=00:33:01 eta=4d 18:28:25 |------------------------------------------------------------------>
train_opt_callback: iter= 49 sample=9/192 sched=0.490000 loss=0.146809 dt=00:32:56 eta=4d 17:40:17 |-------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-50.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 50 sample=17/192 sched=0.500000 loss=0.184028 dt=00:32:55 eta=4d 17:03:13 |-------------------------------------------------------------------->
train_opt_callback: iter= 51 sample=25/192 sched=0.510000 loss=0.257756 dt=00:33:02 eta=4d 16:53:22 |------------------------------------------------------------------->
train_opt_callback: iter= 52 sample=33/192 sched=0.520000 loss=0.255507 dt=00:32:54 eta=4d 15:52:28 |------------------------------------------------------------------->
train_opt_callback: iter= 53 sample=41/192 sched=0.530000 loss=0.106297 dt=00:33:00 eta=4d 15:39:44 |--------------------------------------------------------------------->
train_opt_callback: iter= 54 sample=49/192 sched=0.540000 loss=0.109958 dt=00:33:09 eta=4d 15:39:23 |--------------------------------------------------------------------->
train_opt_callback: iter= 55 sample=57/192 sched=0.550000 loss=0.201428 dt=00:33:09 eta=4d 15:06:11 |-------------------------------------------------------------------->
train_opt_callback: iter= 56 sample=65/192 sched=0.560000 loss=0.243644 dt=00:32:58 eta=4d 13:54:14 |-------------------------------------------------------------------->
train_opt_callback: iter= 57 sample=73/192 sched=0.570000 loss=0.081309 dt=00:32:58 eta=4d 13:21:33 |--------------------------------------------------------------------->
train_opt_callback: iter= 58 sample=81/192 sched=0.580000 loss=0.096137 dt=00:32:55 eta=4d 12:38:31 |--------------------------------------------------------------------->
train_opt_callback: iter= 59 sample=89/192 sched=0.590000 loss=0.248359 dt=00:33:01 eta=4d 12:26:31 |------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-60.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 60 sample=97/192 sched=0.600000 loss=0.359471 dt=00:33:06 eta=4d 12:08:21 |------------------------------------------------------------------>
train_opt_callback: iter= 61 sample=105/192 sched=0.610000 loss=0.205663 dt=00:33:05 eta=4d 11:31:27 |-------------------------------------------------------------------->
train_opt_callback: iter= 62 sample=113/192 sched=0.620000 loss=0.208255 dt=00:33:12 eta=4d 11:21:58 |-------------------------------------------------------------------->
train_opt_callback: iter= 63 sample=121/192 sched=0.630000 loss=0.205131 dt=00:33:05 eta=4d 10:26:23 |-------------------------------------------------------------------->
train_opt_callback: iter= 64 sample=129/192 sched=0.640000 loss=0.190218 dt=00:33:16 eta=4d 10:28:43 |-------------------------------------------------------------------->
train_opt_callback: iter= 65 sample=137/192 sched=0.650000 loss=0.299253 dt=00:33:04 eta=4d 09:18:31 |------------------------------------------------------------------->
train_opt_callback: iter= 66 sample=145/192 sched=0.660000 loss=0.373301 dt=00:32:58 eta=4d 08:23:59 |------------------------------------------------------------------>
train_opt_callback: iter= 67 sample=153/192 sched=0.670000 loss=0.207523 dt=00:33:05 eta=4d 08:15:10 |-------------------------------------------------------------------->
train_opt_callback: iter= 68 sample=161/192 sched=0.680000 loss=0.296497 dt=00:32:54 eta=4d 07:06:44 |------------------------------------------------------------------->
train_opt_callback: iter= 69 sample=169/192 sched=0.690000 loss=0.145919 dt=00:32:59 eta=4d 06:50:53 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-70.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 70 sample=177/192 sched=0.700000 loss=0.275909 dt=00:33:13 eta=4d 07:01:18 |------------------------------------------------------------------->
train_opt_callback: iter= 71 sample=185/192 sched=0.710000 loss=0.232179 dt=00:33:18 eta=4d 06:42:22 |-------------------------------------------------------------------->
train_opt_callback: reshuffle samples. completed epochs: 3
train_opt_callback: iter= 72 sample=1/192 sched=0.720000 loss=0.249719 dt=00:33:03 eta=4d 05:23:48 |------------------------------------------------------------------->
train_opt_callback: iter= 73 sample=9/192 sched=0.730000 loss=0.117920 dt=00:33:16 eta=4d 05:27:57 |--------------------------------------------------------------------->
train_opt_callback: iter= 74 sample=17/192 sched=0.740000 loss=0.114252 dt=00:33:10 eta=4d 04:39:02 |--------------------------------------------------------------------->
train_opt_callback: iter= 75 sample=25/192 sched=0.750000 loss=0.249378 dt=00:33:05 eta=4d 03:49:32 |------------------------------------------------------------------->
train_opt_callback: iter= 76 sample=33/192 sched=0.760000 loss=0.093397 dt=00:32:57 eta=4d 02:53:29 |--------------------------------------------------------------------->
train_opt_callback: iter= 77 sample=41/192 sched=0.770000 loss=0.193217 dt=00:32:57 eta=4d 02:19:39 |-------------------------------------------------------------------->
train_opt_callback: iter= 78 sample=49/192 sched=0.780000 loss=0.112519 dt=00:33:01 eta=4d 01:57:22 |--------------------------------------------------------------------->
train_opt_callback: iter= 79 sample=57/192 sched=0.790000 loss=0.132837 dt=00:33:01 eta=4d 01:25:15 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-80.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 80 sample=65/192 sched=0.800000 loss=0.133788 dt=00:32:55 eta=4d 00:34:13 |--------------------------------------------------------------------->
train_opt_callback: iter= 81 sample=73/192 sched=0.810000 loss=0.228926 dt=00:32:52 eta=3d 23:52:02 |-------------------------------------------------------------------->
train_opt_callback: iter= 82 sample=81/192 sched=0.820000 loss=0.186090 dt=00:32:52 eta=3d 23:18:50 |-------------------------------------------------------------------->
train_opt_callback: iter= 83 sample=89/192 sched=0.830000 loss=0.127048 dt=00:32:55 eta=3d 22:55:51 |--------------------------------------------------------------------->
train_opt_callback: iter= 84 sample=97/192 sched=0.840000 loss=0.072938 dt=00:33:13 eta=3d 23:14:39 |--------------------------------------------------------------------->
train_opt_callback: iter= 85 sample=105/192 sched=0.850000 loss=0.103475 dt=00:33:19 eta=3d 22:57:52 |--------------------------------------------------------------------->
train_opt_callback: iter= 86 sample=113/192 sched=0.860000 loss=0.170867 dt=00:33:14 eta=3d 22:10:31 |-------------------------------------------------------------------->
train_opt_callback: iter= 87 sample=121/192 sched=0.870000 loss=0.195967 dt=00:33:10 eta=3d 21:26:40 |-------------------------------------------------------------------->
train_opt_callback: iter= 88 sample=129/192 sched=0.880000 loss=0.123484 dt=00:33:09 eta=3d 20:51:23 |--------------------------------------------------------------------->
train_opt_callback: iter= 89 sample=137/192 sched=0.890000 loss=0.119564 dt=00:33:12 eta=3d 20:25:45 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-90.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 90 sample=145/192 sched=0.900000 loss=0.097791 dt=00:33:16 eta=3d 20:04:57 |--------------------------------------------------------------------->
train_opt_callback: iter= 91 sample=153/192 sched=0.910000 loss=0.217274 dt=00:33:17 eta=3d 19:32:48 |-------------------------------------------------------------------->
train_opt_callback: iter= 92 sample=161/192 sched=0.920000 loss=0.112811 dt=00:33:26 eta=3d 19:23:40 |--------------------------------------------------------------------->
train_opt_callback: iter= 93 sample=169/192 sched=0.930000 loss=0.110431 dt=00:33:20 eta=3d 18:33:30 |--------------------------------------------------------------------->
train_opt_callback: iter= 94 sample=177/192 sched=0.940000 loss=0.139967 dt=00:33:15 eta=3d 17:48:43 |--------------------------------------------------------------------->
train_opt_callback: iter= 95 sample=185/192 sched=0.950000 loss=0.129218 dt=00:33:34 eta=3d 18:04:50 |--------------------------------------------------------------------->
train_opt_callback: reshuffle samples. completed epochs: 4
train_opt_callback: iter= 96 sample=1/192 sched=0.960000 loss=0.177266 dt=00:33:21 eta=3d 16:57:24 |-------------------------------------------------------------------->
train_opt_callback: iter= 97 sample=9/192 sched=0.970000 loss=0.138082 dt=00:33:14 eta=3d 16:06:13 |--------------------------------------------------------------------->
train_opt_callback: iter= 98 sample=17/192 sched=0.980000 loss=0.103335 dt=00:33:14 eta=3d 15:31:05 |--------------------------------------------------------------------->
train_opt_callback: iter= 99 sample=25/192 sched=0.990000 loss=0.096420 dt=00:33:12 eta=3d 14:54:30 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-100.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 100 sample=33/192 sched=0.977975 loss=0.134850 dt=00:33:11 eta=3d 14:18:05 |--------------------------------------------------------------------->
train_opt_callback: iter= 101 sample=41/192 sched=0.977536 loss=0.077353 dt=00:33:01 eta=3d 13:20:09 |--------------------------------------------------------------------->
train_opt_callback: iter= 102 sample=49/192 sched=0.977093 loss=0.103287 dt=00:33:10 eta=3d 13:09:59 |--------------------------------------------------------------------->
train_opt_callback: iter= 103 sample=57/192 sched=0.976646 loss=0.127698 dt=00:33:13 eta=3d 12:42:54 |--------------------------------------------------------------------->
train_opt_callback: iter= 104 sample=65/192 sched=0.976194 loss=0.082086 dt=00:33:19 eta=3d 12:26:08 |--------------------------------------------------------------------->
train_opt_callback: iter= 105 sample=73/192 sched=0.975738 loss=0.074085 dt=00:33:13 eta=3d 11:36:08 |--------------------------------------------------------------------->
train_opt_callback: iter= 106 sample=81/192 sched=0.975278 loss=0.072698 dt=00:33:20 eta=3d 11:20:26 |--------------------------------------------------------------------->
train_opt_callback: iter= 107 sample=89/192 sched=0.974814 loss=0.115449 dt=00:33:16 eta=3d 10:38:02 |--------------------------------------------------------------------->
train_opt_callback: iter= 108 sample=97/192 sched=0.974346 loss=0.053100 dt=00:33:09 eta=3d 09:46:39 |--------------------------------------------------------------------->
train_opt_callback: iter= 109 sample=105/192 sched=0.973873 loss=0.115875 dt=00:33:09 eta=3d 09:13:28 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-110.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 110 sample=113/192 sched=0.973396 loss=0.094730 dt=00:33:23 eta=3d 09:14:33 |--------------------------------------------------------------------->
train_opt_callback: iter= 111 sample=121/192 sched=0.972915 loss=0.091597 dt=00:33:18 eta=3d 08:30:50 |--------------------------------------------------------------------->
train_opt_callback: iter= 112 sample=129/192 sched=0.972430 loss=0.094059 dt=00:33:20 eta=3d 08:02:01 |--------------------------------------------------------------------->
train_opt_callback: iter= 113 sample=137/192 sched=0.971941 loss=0.103500 dt=00:33:17 eta=3d 07:20:04 |--------------------------------------------------------------------->
train_opt_callback: iter= 114 sample=145/192 sched=0.971447 loss=0.094864 dt=00:33:11 eta=3d 06:33:59 |--------------------------------------------------------------------->
train_opt_callback: iter= 115 sample=153/192 sched=0.970950 loss=0.108245 dt=00:32:53 eta=3d 05:17:53 |--------------------------------------------------------------------->
train_opt_callback: iter= 116 sample=161/192 sched=0.970448 loss=0.059437 dt=00:33:01 eta=3d 05:04:22 |--------------------------------------------------------------------->
train_opt_callback: iter= 117 sample=169/192 sched=0.969942 loss=0.059715 dt=00:33:06 eta=3d 04:40:54 |--------------------------------------------------------------------->
train_opt_callback: iter= 118 sample=177/192 sched=0.969432 loss=0.097534 dt=00:33:07 eta=3d 04:11:08 |--------------------------------------------------------------------->
train_opt_callback: iter= 119 sample=185/192 sched=0.968918 loss=0.131304 dt=00:33:05 eta=3d 03:34:13 |--------------------------------------------------------------------->
train_opt_callback: reshuffle samples. completed epochs: 5
save_checkpoint_lora_file: saving to checkpoint-120.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 120 sample=1/192 sched=0.968399 loss=0.059029 dt=00:32:57 eta=3d 02:41:25 |--------------------------------------------------------------------->
train_opt_callback: iter= 121 sample=9/192 sched=0.967877 loss=0.071099 dt=00:32:49 eta=3d 01:50:24 |--------------------------------------------------------------------->
train_opt_callback: iter= 122 sample=17/192 sched=0.967350 loss=0.102484 dt=00:32:47 eta=3d 01:14:10 |--------------------------------------------------------------------->
train_opt_callback: iter= 123 sample=25/192 sched=0.966820 loss=0.063735 dt=00:32:49 eta=3d 00:45:51 |--------------------------------------------------------------------->
train_opt_callback: iter= 124 sample=33/192 sched=0.966285 loss=0.076503 dt=00:32:36 eta=2d 23:43:27 |--------------------------------------------------------------------->
train_opt_callback: iter= 125 sample=41/192 sched=0.965746 loss=0.064077 dt=00:32:44 eta=2d 23:28:24 |--------------------------------------------------------------------->
train_opt_callback: iter= 126 sample=49/192 sched=0.965203 loss=0.102186 dt=00:32:56 eta=2d 23:22:06 |--------------------------------------------------------------------->
train_opt_callback: iter= 127 sample=57/192 sched=0.964656 loss=0.071543 dt=00:32:52 eta=2d 22:40:48 |--------------------------------------------------------------------->
train_opt_callback: iter= 128 sample=65/192 sched=0.964104 loss=0.076590 dt=00:32:52 eta=2d 22:08:49 |--------------------------------------------------------------------->
train_opt_callback: iter= 129 sample=73/192 sched=0.963549 loss=0.054899 dt=00:33:00 eta=2d 21:52:56 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-130.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 130 sample=81/192 sched=0.962990 loss=0.063754 dt=00:32:55 eta=2d 21:09:15 |--------------------------------------------------------------------->
train_opt_callback: iter= 131 sample=89/192 sched=0.962426 loss=0.061553 dt=00:32:56 eta=2d 20:37:51 |--------------------------------------------------------------------->
train_opt_callback: iter= 132 sample=97/192 sched=0.961859 loss=0.079578 dt=00:32:57 eta=2d 20:07:24 |--------------------------------------------------------------------->
train_opt_callback: iter= 133 sample=105/192 sched=0.961287 loss=0.084903 dt=00:33:08 eta=2d 19:55:53 |--------------------------------------------------------------------->
train_opt_callback: iter= 134 sample=113/192 sched=0.960711 loss=0.071449 dt=00:32:56 eta=2d 18:58:32 |--------------------------------------------------------------------->
train_opt_callback: iter= 135 sample=121/192 sched=0.960131 loss=0.075682 dt=00:33:03 eta=2d 18:39:27 |--------------------------------------------------------------------->
train_opt_callback: iter= 136 sample=129/192 sched=0.959548 loss=0.124118 dt=00:32:58 eta=2d 17:57:54 |--------------------------------------------------------------------->
train_opt_callback: iter= 137 sample=137/192 sched=0.958960 loss=0.092211 dt=00:33:07 eta=2d 17:41:49 |--------------------------------------------------------------------->
train_opt_callback: iter= 138 sample=145/192 sched=0.958368 loss=0.072774 dt=00:32:54 eta=2d 16:42:23 |--------------------------------------------------------------------->
train_opt_callback: iter= 139 sample=153/192 sched=0.957772 loss=0.063718 dt=00:33:01 eta=2d 16:23:57 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-140.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 140 sample=161/192 sched=0.957172 loss=0.092079 dt=00:32:58 eta=2d 15:45:41 |--------------------------------------------------------------------->
train_opt_callback: iter= 141 sample=169/192 sched=0.956568 loss=0.064457 dt=00:32:54 eta=2d 15:04:59 |--------------------------------------------------------------------->
train_opt_callback: iter= 142 sample=177/192 sched=0.955960 loss=0.066462 dt=00:32:53 eta=2d 14:29:26 |--------------------------------------------------------------------->
train_opt_callback: iter= 143 sample=185/192 sched=0.955348 loss=0.064275 dt=00:32:56 eta=2d 14:03:15 |--------------------------------------------------------------------->
train_opt_callback: reshuffle samples. completed epochs: 6
train_opt_callback: iter= 144 sample=1/192 sched=0.954732 loss=0.054712 dt=00:32:48 eta=2d 13:13:38 |--------------------------------------------------------------------->
train_opt_callback: iter= 145 sample=9/192 sched=0.954112 loss=0.063319 dt=00:33:04 eta=2d 13:11:19 |--------------------------------------------------------------------->
train_opt_callback: iter= 146 sample=17/192 sched=0.953488 loss=0.071428 dt=00:32:57 eta=2d 12:26:14 |--------------------------------------------------------------------->
train_opt_callback: iter= 147 sample=25/192 sched=0.952861 loss=0.060580 dt=00:33:02 eta=2d 12:02:11 |--------------------------------------------------------------------->
train_opt_callback: iter= 148 sample=33/192 sched=0.952229 loss=0.061905 dt=00:32:55 eta=2d 11:15:18 |--------------------------------------------------------------------->
train_opt_callback: iter= 149 sample=41/192 sched=0.951593 loss=0.060745 dt=00:32:51 eta=2d 10:36:39 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-150.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 150 sample=49/192 sched=0.950953 loss=0.068246 dt=00:33:00 eta=2d 10:18:56 |--------------------------------------------------------------------->
train_opt_callback: iter= 151 sample=57/192 sched=0.950309 loss=0.074121 dt=00:32:55 eta=2d 09:37:49 |--------------------------------------------------------------------->
train_opt_callback: iter= 152 sample=65/192 sched=0.949661 loss=0.067036 dt=00:32:59 eta=2d 09:11:26 |--------------------------------------------------------------------->
train_opt_callback: iter= 153 sample=73/192 sched=0.949010 loss=0.043553 dt=00:33:02 eta=2d 08:42:53 |---------------------------------------------------------------------->
train_opt_callback: iter= 154 sample=81/192 sched=0.948354 loss=0.074711 dt=00:33:10 eta=2d 08:24:25 |--------------------------------------------------------------------->
train_opt_callback: iter= 155 sample=89/192 sched=0.947695 loss=0.067758 dt=00:32:54 eta=2d 07:23:29 |--------------------------------------------------------------------->
train_opt_callback: iter= 156 sample=97/192 sched=0.947031 loss=0.046840 dt=00:33:02 eta=2d 07:03:31 |--------------------------------------------------------------------->
train_opt_callback: iter= 157 sample=105/192 sched=0.946364 loss=0.051977 dt=00:32:49 eta=2d 06:09:44 |--------------------------------------------------------------------->
train_opt_callback: iter= 158 sample=113/192 sched=0.945692 loss=0.055774 dt=00:32:52 eta=2d 05:41:40 |--------------------------------------------------------------------->
train_opt_callback: iter= 159 sample=121/192 sched=0.945017 loss=0.055962 dt=00:32:49 eta=2d 05:03:34 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-160.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
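The sched column (the learning-rate multiplier) decays smoothly and its per-step decrement keeps growing, which matches the first half of a cosine schedule. Fitting sched(it) = 0.5 * (1 + cos(pi * it / N)) to the logged values gives N around 1056 and reproduces the column to within about 1e-4. The fit is purely empirical — the run's actual schedule settings are not recorded in this log:

    import math

    def sched(it: int, n: float = 1056.4) -> float:
        # Empirical cosine fit; n inferred from two logged points, not a known config.
        return 0.5 * (1 + math.cos(math.pi * it / n))

    for it, logged in [(134, 0.960711), (200, 0.914058), (256, 0.862144)]:
        print(it, f"fit={sched(it):.6f}", f"logged={logged:.6f}")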
train_opt_callback: iter= 160 sample=129/192 sched=0.944338 loss=0.054285 dt=00:32:46 eta=2d 04:25:38 |--------------------------------------------------------------------->
train_opt_callback: iter= 161 sample=137/192 sched=0.943655 loss=0.056234 dt=00:32:46 eta=2d 03:53:38 |--------------------------------------------------------------------->
train_opt_callback: iter= 162 sample=145/192 sched=0.942968 loss=0.059660 dt=00:32:46 eta=2d 03:20:44 |--------------------------------------------------------------------->
train_opt_callback: iter= 163 sample=153/192 sched=0.942277 loss=0.055612 dt=00:32:56 eta=2d 03:04:16 |--------------------------------------------------------------------->
train_opt_callback: iter= 164 sample=161/192 sched=0.941583 loss=0.063125 dt=00:33:04 eta=2d 02:43:38 |--------------------------------------------------------------------->
train_opt_callback: iter= 165 sample=169/192 sched=0.940884 loss=0.055584 dt=00:32:55 eta=2d 01:55:37 |--------------------------------------------------------------------->
train_opt_callback: iter= 166 sample=177/192 sched=0.940182 loss=0.073498 dt=00:32:46 eta=2d 01:09:23 |--------------------------------------------------------------------->
train_opt_callback: iter= 167 sample=185/192 sched=0.939476 loss=0.063428 dt=00:32:43 eta=2d 00:32:45 |--------------------------------------------------------------------->
train_opt_callback: reshuffle samples. completed epochs: 7
train_opt_callback: iter= 168 sample=1/192 sched=0.938765 loss=0.067372 dt=00:32:43 eta=2d 00:00:00 |--------------------------------------------------------------------->
train_opt_callback: iter= 169 sample=9/192 sched=0.938052 loss=0.047034 dt=00:32:35 eta=1d 23:15:34 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-170.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 170 sample=17/192 sched=0.937334 loss=0.048806 dt=00:32:47 eta=1d 23:00:10 |--------------------------------------------------------------------->
train_opt_callback: iter= 171 sample=25/192 sched=0.936612 loss=0.050773 dt=00:32:56 eta=1d 22:39:56 |--------------------------------------------------------------------->
train_opt_callback: iter= 172 sample=33/192 sched=0.935887 loss=0.051670 dt=00:32:49 eta=1d 21:56:58 |--------------------------------------------------------------------->
train_opt_callback: iter= 173 sample=41/192 sched=0.935158 loss=0.051985 dt=00:32:58 eta=1d 21:36:48 |--------------------------------------------------------------------->
train_opt_callback: iter= 174 sample=49/192 sched=0.934425 loss=0.050922 dt=00:32:52 eta=1d 20:56:17 |--------------------------------------------------------------------->
train_opt_callback: iter= 175 sample=57/192 sched=0.933688 loss=0.056858 dt=00:32:50 eta=1d 20:19:41 |--------------------------------------------------------------------->
train_opt_callback: iter= 176 sample=65/192 sched=0.932948 loss=0.049171 dt=00:32:53 eta=1d 19:51:33 |--------------------------------------------------------------------->
train_opt_callback: iter= 177 sample=73/192 sched=0.932203 loss=0.049735 dt=00:32:46 eta=1d 19:08:48 |--------------------------------------------------------------------->
train_opt_callback: iter= 178 sample=81/192 sched=0.931455 loss=0.054104 dt=00:32:46 eta=1d 18:36:49 |--------------------------------------------------------------------->
train_opt_callback: iter= 179 sample=89/192 sched=0.930703 loss=0.056268 dt=00:32:49 eta=1d 18:08:04 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-180.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 180 sample=97/192 sched=0.929948 loss=0.054014 dt=00:32:41 eta=1d 17:24:59 |--------------------------------------------------------------------->
train_opt_callback: iter= 181 sample=105/192 sched=0.929188 loss=0.047580 dt=00:32:45 eta=1d 16:56:41 |--------------------------------------------------------------------->
train_opt_callback: iter= 182 sample=113/192 sched=0.928425 loss=0.047196 dt=00:33:08 eta=1d 16:52:28 |--------------------------------------------------------------------->
train_opt_callback: iter= 183 sample=121/192 sched=0.927658 loss=0.060058 dt=00:33:02 eta=1d 16:12:26 |--------------------------------------------------------------------->
train_opt_callback: iter= 184 sample=129/192 sched=0.926888 loss=0.063827 dt=00:32:57 eta=1d 15:33:01 |--------------------------------------------------------------------->
train_opt_callback: iter= 185 sample=137/192 sched=0.926113 loss=0.071899 dt=00:32:56 eta=1d 14:58:56 |--------------------------------------------------------------------->
train_opt_callback: iter= 186 sample=145/192 sched=0.925335 loss=0.042908 dt=00:32:55 eta=1d 14:24:41 |---------------------------------------------------------------------->
train_opt_callback: iter= 187 sample=153/192 sched=0.924554 loss=0.063576 dt=00:32:50 eta=1d 13:45:54 |--------------------------------------------------------------------->
train_opt_callback: iter= 188 sample=161/192 sched=0.923768 loss=0.051778 dt=00:29:00 eta=1d 08:52:57 |--------------------------------------------------------------------->
train_opt_callback: iter= 189 sample=169/192 sched=0.922979 loss=0.063585 dt=00:20:15 eta=22:37:13 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-190.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 190 sample=177/192 sched=0.922186 loss=0.049423 dt=00:17:41 eta=19:27:44 |--------------------------------------------------------------------->
train_opt_callback: iter= 191 sample=185/192 sched=0.921390 loss=0.057893 dt=00:16:47 eta=18:11:57 |--------------------------------------------------------------------->
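Between iters 187 and 191 the reported dt falls from ~33 min to ~17 min, but over several lines rather than in one jump — the signature of a smoothed running estimate reacting to a real halving of per-iteration cost. The log does not say why the machine sped up, nor how finetune smooths dt (dt also climbs back toward ~32 min around iter 220 further down). An exponential moving average with an assumed smoothing factor reproduces the same gradual shape:

    # Illustrative only: a sudden drop in true cost reaches the smoothed
    # estimate over several iterations. alpha is an assumption.
    ema, alpha, true_cost = 33 * 60, 0.5, 16 * 60
    for it in range(188, 194):
        ema = alpha * true_cost + (1 - alpha) * ema
        print(it, f"dt≈{int(ema // 60):02}:{int(ema % 60):02}")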
train_opt_callback: reshuffle samples. completed epochs: 8
train_opt_callback: iter= 192 sample=1/192 sched=0.920590 loss=0.069274 dt=00:16:35 eta=17:41:57 |--------------------------------------------------------------------->
train_opt_callback: iter= 193 sample=9/192 sched=0.919786 loss=0.049640 dt=00:16:25 eta=17:15:05 |--------------------------------------------------------------------->
train_opt_callback: iter= 194 sample=17/192 sched=0.918978 loss=0.048456 dt=00:16:23 eta=16:56:01 |--------------------------------------------------------------------->
train_opt_callback: iter= 195 sample=25/192 sched=0.918167 loss=0.058435 dt=00:16:23 eta=16:40:02 |--------------------------------------------------------------------->
train_opt_callback: iter= 196 sample=33/192 sched=0.917353 loss=0.046197 dt=00:16:27 eta=16:27:37 |---------------------------------------------------------------------->
train_opt_callback: iter= 197 sample=41/192 sched=0.916534 loss=0.059254 dt=00:16:28 eta=16:11:54 |--------------------------------------------------------------------->
train_opt_callback: iter= 198 sample=49/192 sched=0.915712 loss=0.049534 dt=00:16:25 eta=15:52:56 |--------------------------------------------------------------------->
train_opt_callback: iter= 199 sample=57/192 sched=0.914887 loss=0.046052 dt=00:16:26 eta=15:37:11 |---------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-200.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 200 sample=65/192 sched=0.914058 loss=0.052144 dt=00:16:29 eta=15:23:15 |--------------------------------------------------------------------->
train_opt_callback: iter= 201 sample=73/192 sched=0.913225 loss=0.046060 dt=00:16:30 eta=15:08:21 |---------------------------------------------------------------------->
train_opt_callback: iter= 202 sample=81/192 sched=0.912389 loss=0.050882 dt=00:16:28 eta=14:49:32 |--------------------------------------------------------------------->
train_opt_callback: iter= 203 sample=89/192 sched=0.911549 loss=0.051957 dt=00:16:32 eta=14:36:45 |--------------------------------------------------------------------->
train_opt_callback: iter= 204 sample=97/192 sched=0.910705 loss=0.051343 dt=00:16:30 eta=14:18:08 |--------------------------------------------------------------------->
train_opt_callback: iter= 205 sample=105/192 sched=0.909858 loss=0.049534 dt=00:16:26 eta=13:58:25 |--------------------------------------------------------------------->
train_opt_callback: iter= 206 sample=113/192 sched=0.909007 loss=0.053146 dt=00:16:38 eta=13:51:49 |--------------------------------------------------------------------->
train_opt_callback: iter= 207 sample=121/192 sched=0.908153 loss=0.056046 dt=00:16:32 eta=13:30:17 |--------------------------------------------------------------------->
train_opt_callback: iter= 208 sample=129/192 sched=0.907296 loss=0.050121 dt=00:16:24 eta=13:07:49 |--------------------------------------------------------------------->
train_opt_callback: iter= 209 sample=137/192 sched=0.906434 loss=0.057322 dt=00:16:24 eta=12:51:03 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-210.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 210 sample=145/192 sched=0.905570 loss=0.051229 dt=00:16:29 eta=12:38:29 |--------------------------------------------------------------------->
train_opt_callback: iter= 211 sample=153/192 sched=0.904702 loss=0.063042 dt=00:16:37 eta=12:27:57 |--------------------------------------------------------------------->
train_opt_callback: iter= 212 sample=161/192 sched=0.903830 loss=0.047218 dt=00:16:27 eta=12:04:04 |--------------------------------------------------------------------->
train_opt_callback: iter= 213 sample=169/192 sched=0.902955 loss=0.068922 dt=00:16:25 eta=11:46:20 |--------------------------------------------------------------------->
train_opt_callback: iter= 214 sample=177/192 sched=0.902076 loss=0.047228 dt=00:16:30 eta=11:33:09 |--------------------------------------------------------------------->
train_opt_callback: iter= 215 sample=185/192 sched=0.901194 loss=0.047039 dt=00:16:19 eta=11:09:22 |--------------------------------------------------------------------->
train_opt_callback: reshuffle samples. completed epochs: 9
train_opt_callback: iter= 216 sample=1/192 sched=0.900308 loss=0.049039 dt=00:16:19 eta=10:53:13 |--------------------------------------------------------------------->
train_opt_callback: iter= 217 sample=9/192 sched=0.899419 loss=0.041163 dt=00:16:22 eta=10:38:19 |---------------------------------------------------------------------->
train_opt_callback: iter= 218 sample=17/192 sched=0.898526 loss=0.041811 dt=00:16:24 eta=10:23:14 |---------------------------------------------------------------------->
train_opt_callback: iter= 219 sample=25/192 sched=0.897630 loss=0.046526 dt=00:18:08 eta=11:11:03 |---------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-220.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 220 sample=33/192 sched=0.896731 loss=0.051750 dt=00:27:04 eta=16:14:27 |--------------------------------------------------------------------->
train_opt_callback: iter= 221 sample=41/192 sched=0.895828 loss=0.047243 dt=00:30:23 eta=17:43:34 |--------------------------------------------------------------------->
train_opt_callback: iter= 222 sample=49/192 sched=0.894922 loss=0.046640 dt=00:31:25 eta=17:48:40 |---------------------------------------------------------------------->
train_opt_callback: iter= 223 sample=57/192 sched=0.894012 loss=0.044438 dt=00:31:36 eta=17:22:51 |---------------------------------------------------------------------->
train_opt_callback: iter= 224 sample=65/192 sched=0.893099 loss=0.057114 dt=00:31:34 eta=16:50:24 |--------------------------------------------------------------------->
train_opt_callback: iter= 225 sample=73/192 sched=0.892183 loss=0.048878 dt=00:31:40 eta=16:21:54 |--------------------------------------------------------------------->
train_opt_callback: iter= 226 sample=81/192 sched=0.891263 loss=0.055839 dt=00:31:44 eta=15:52:12 |--------------------------------------------------------------------->
train_opt_callback: iter= 227 sample=89/192 sched=0.890340 loss=0.042806 dt=00:31:56 eta=15:26:14 |---------------------------------------------------------------------->
train_opt_callback: iter= 228 sample=97/192 sched=0.889413 loss=0.051473 dt=00:31:40 eta=14:46:56 |--------------------------------------------------------------------->
train_opt_callback: iter= 229 sample=105/192 sched=0.888483 loss=0.047850 dt=00:31:42 eta=14:16:04 |--------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-230.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 230 sample=113/192 sched=0.887550 loss=0.045053 dt=00:31:45 eta=13:45:54 |---------------------------------------------------------------------->
train_opt_callback: iter= 231 sample=121/192 sched=0.886613 loss=0.057748 dt=00:32:00 eta=13:20:06 |--------------------------------------------------------------------->
train_opt_callback: iter= 232 sample=129/192 sched=0.885674 loss=0.045751 dt=00:32:01 eta=12:48:28 |---------------------------------------------------------------------->
train_opt_callback: iter= 233 sample=137/192 sched=0.884730 loss=0.047692 dt=00:32:05 eta=12:18:05 |--------------------------------------------------------------------->
train_opt_callback: iter= 234 sample=145/192 sched=0.883784 loss=0.054099 dt=00:32:03 eta=11:45:18 |--------------------------------------------------------------------->
train_opt_callback: iter= 235 sample=153/192 sched=0.882834 loss=0.049681 dt=00:32:13 eta=11:16:44 |--------------------------------------------------------------------->
train_opt_callback: iter= 236 sample=161/192 sched=0.881881 loss=0.049282 dt=00:32:01 eta=10:40:22 |--------------------------------------------------------------------->
train_opt_callback: iter= 237 sample=169/192 sched=0.880924 loss=0.060196 dt=00:32:14 eta=10:12:35 |--------------------------------------------------------------------->
train_opt_callback: iter= 238 sample=177/192 sched=0.879965 loss=0.053252 dt=00:32:19 eta=09:41:47 |--------------------------------------------------------------------->
train_opt_callback: iter= 239 sample=185/192 sched=0.879002 loss=0.046051 dt=00:32:23 eta=09:10:46 |---------------------------------------------------------------------->
train_opt_callback: reshuffle samples. completed epochs: 10
save_checkpoint_lora_file: saving to checkpoint-240.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 240 sample=1/192 sched=0.878036 loss=0.050061 dt=00:32:24 eta=08:38:27 |--------------------------------------------------------------------->
train_opt_callback: iter= 241 sample=9/192 sched=0.877066 loss=0.046039 dt=00:32:15 eta=08:03:49 |---------------------------------------------------------------------->
train_opt_callback: iter= 242 sample=17/192 sched=0.876094 loss=0.045632 dt=00:32:12 eta=07:30:59 |---------------------------------------------------------------------->
train_opt_callback: iter= 243 sample=25/192 sched=0.875118 loss=0.044605 dt=00:32:24 eta=07:01:18 |---------------------------------------------------------------------->
train_opt_callback: iter= 244 sample=33/192 sched=0.874139 loss=0.044698 dt=00:32:33 eta=06:30:41 |---------------------------------------------------------------------->
train_opt_callback: iter= 245 sample=41/192 sched=0.873157 loss=0.048738 dt=00:32:27 eta=05:57:00 |--------------------------------------------------------------------->
train_opt_callback: iter= 246 sample=49/192 sched=0.872171 loss=0.047081 dt=00:32:16 eta=05:22:44 |--------------------------------------------------------------------->
train_opt_callback: iter= 247 sample=57/192 sched=0.871183 loss=0.042629 dt=00:32:21 eta=04:51:11 |---------------------------------------------------------------------->
train_opt_callback: iter= 248 sample=65/192 sched=0.870191 loss=0.044505 dt=00:32:12 eta=04:17:43 |---------------------------------------------------------------------->
train_opt_callback: iter= 249 sample=73/192 sched=0.869196 loss=0.043167 dt=00:32:18 eta=03:46:06 |---------------------------------------------------------------------->
save_checkpoint_lora_file: saving to checkpoint-250.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
train_opt_callback: iter= 250 sample=81/192 sched=0.868198 loss=0.047185 dt=00:32:12 eta=03:13:17 |--------------------------------------------------------------------->
train_opt_callback: iter= 251 sample=89/192 sched=0.867197 loss=0.047900 dt=00:32:27 eta=02:42:15 |--------------------------------------------------------------------->
train_opt_callback: iter= 252 sample=97/192 sched=0.866192 loss=0.049573 dt=00:32:13 eta=02:08:53 |--------------------------------------------------------------------->
train_opt_callback: iter= 253 sample=105/192 sched=0.865185 loss=0.046385 dt=00:32:14 eta=01:36:43 |---------------------------------------------------------------------->
train_opt_callback: iter= 254 sample=113/192 sched=0.864174 loss=0.046368 dt=00:32:15 eta=01:04:30 |---------------------------------------------------------------------->
train_opt_callback: iter= 255 sample=121/192 sched=0.863161 loss=0.050979 dt=00:32:15 eta=00:32:15 |--------------------------------------------------------------------->
train_opt_callback: iter= 256 sample=129/192 sched=0.862144 loss=0.046455 dt=00:32:13 eta=0.0ms |---------------------------------------------------------------------->
main: total training time: 5d 12:39:39
save_checkpoint_lora_file: saving to checkpoint-256.gguf
save_checkpoint_lora_file: saving to checkpoint-LATEST.gguf
save_as_llama_lora: saving to lora.bin
save_as_llama_lora: saving to lora.bin
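The closing total of 5d 12:39:39 over 256 iterations averages out to about 31 minutes per iteration, consistent with the ~33 min pace of the early epochs diluted by the faster ~16 min stretch late in the run:

    total = 5 * 86400 + 12 * 3600 + 39 * 60 + 39   # "5d 12:39:39" in seconds
    print(f"{total / 256 / 60:.1f} min/iter")       # ≈ 31.1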