Text Generation
Transformers
Safetensors
qwen2
llama-factory
full
Generated from Trainer
conversational
text-generation-inference
Instructions to use mlfoundations-dev/reasoning_hp_ablations_bsz512 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use mlfoundations-dev/reasoning_hp_ablations_bsz512 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="mlfoundations-dev/reasoning_hp_ablations_bsz512")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("mlfoundations-dev/reasoning_hp_ablations_bsz512")
model = AutoModelForCausalLM.from_pretrained("mlfoundations-dev/reasoning_hp_ablations_bsz512")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use mlfoundations-dev/reasoning_hp_ablations_bsz512 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "mlfoundations-dev/reasoning_hp_ablations_bsz512"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "mlfoundations-dev/reasoning_hp_ablations_bsz512",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/mlfoundations-dev/reasoning_hp_ablations_bsz512
- SGLang
How to use mlfoundations-dev/reasoning_hp_ablations_bsz512 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "mlfoundations-dev/reasoning_hp_ablations_bsz512" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "mlfoundations-dev/reasoning_hp_ablations_bsz512",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "mlfoundations-dev/reasoning_hp_ablations_bsz512" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "mlfoundations-dev/reasoning_hp_ablations_bsz512",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use mlfoundations-dev/reasoning_hp_ablations_bsz512 with Docker Model Runner:
docker model run hf.co/mlfoundations-dev/reasoning_hp_ablations_bsz512
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9943851768669285, | |
| "eval_steps": 500, | |
| "global_step": 666, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004491858506457047, | |
| "grad_norm": 5.747499942779541, | |
| "learning_rate": 1.4925373134328358e-07, | |
| "loss": 0.8287, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008983717012914094, | |
| "grad_norm": 5.93737268447876, | |
| "learning_rate": 2.9850746268656716e-07, | |
| "loss": 0.8732, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.01347557551937114, | |
| "grad_norm": 5.791993618011475, | |
| "learning_rate": 4.4776119402985074e-07, | |
| "loss": 0.8552, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.017967434025828188, | |
| "grad_norm": 5.838237762451172, | |
| "learning_rate": 5.970149253731343e-07, | |
| "loss": 0.8678, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.022459292532285232, | |
| "grad_norm": 5.8886027336120605, | |
| "learning_rate": 7.462686567164179e-07, | |
| "loss": 0.8734, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02695115103874228, | |
| "grad_norm": 5.727405071258545, | |
| "learning_rate": 8.955223880597015e-07, | |
| "loss": 0.8517, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.031443009545199324, | |
| "grad_norm": 5.336740493774414, | |
| "learning_rate": 1.044776119402985e-06, | |
| "loss": 0.8407, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.035934868051656375, | |
| "grad_norm": 5.36053466796875, | |
| "learning_rate": 1.1940298507462686e-06, | |
| "loss": 0.8449, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.04042672655811342, | |
| "grad_norm": 4.371918678283691, | |
| "learning_rate": 1.3432835820895524e-06, | |
| "loss": 0.8134, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.044918585064570464, | |
| "grad_norm": 4.224687099456787, | |
| "learning_rate": 1.4925373134328358e-06, | |
| "loss": 0.809, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.049410443571027515, | |
| "grad_norm": 3.9456374645233154, | |
| "learning_rate": 1.6417910447761196e-06, | |
| "loss": 0.8054, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.05390230207748456, | |
| "grad_norm": 2.2551746368408203, | |
| "learning_rate": 1.791044776119403e-06, | |
| "loss": 0.7676, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.058394160583941604, | |
| "grad_norm": 2.1659226417541504, | |
| "learning_rate": 1.9402985074626867e-06, | |
| "loss": 0.7585, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.06288601909039865, | |
| "grad_norm": 1.9352374076843262, | |
| "learning_rate": 2.08955223880597e-06, | |
| "loss": 0.759, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.06737787759685569, | |
| "grad_norm": 1.8645031452178955, | |
| "learning_rate": 2.238805970149254e-06, | |
| "loss": 0.7515, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.07186973610331275, | |
| "grad_norm": 2.0890092849731445, | |
| "learning_rate": 2.3880597014925373e-06, | |
| "loss": 0.7335, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0763615946097698, | |
| "grad_norm": 3.110126256942749, | |
| "learning_rate": 2.537313432835821e-06, | |
| "loss": 0.7421, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.08085345311622684, | |
| "grad_norm": 3.361837148666382, | |
| "learning_rate": 2.686567164179105e-06, | |
| "loss": 0.7311, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.08534531162268388, | |
| "grad_norm": 3.211651086807251, | |
| "learning_rate": 2.835820895522388e-06, | |
| "loss": 0.7229, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.08983717012914093, | |
| "grad_norm": 3.045382022857666, | |
| "learning_rate": 2.9850746268656716e-06, | |
| "loss": 0.7301, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.09432902863559797, | |
| "grad_norm": 2.593707799911499, | |
| "learning_rate": 3.1343283582089558e-06, | |
| "loss": 0.7085, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.09882088714205503, | |
| "grad_norm": 1.6424776315689087, | |
| "learning_rate": 3.283582089552239e-06, | |
| "loss": 0.6902, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.10331274564851207, | |
| "grad_norm": 1.2647095918655396, | |
| "learning_rate": 3.4328358208955225e-06, | |
| "loss": 0.6807, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.10780460415496912, | |
| "grad_norm": 1.1175804138183594, | |
| "learning_rate": 3.582089552238806e-06, | |
| "loss": 0.6662, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.11229646266142616, | |
| "grad_norm": 1.0176746845245361, | |
| "learning_rate": 3.73134328358209e-06, | |
| "loss": 0.6643, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.11678832116788321, | |
| "grad_norm": 0.9287970066070557, | |
| "learning_rate": 3.8805970149253735e-06, | |
| "loss": 0.6396, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.12128017967434025, | |
| "grad_norm": 1.0267021656036377, | |
| "learning_rate": 4.029850746268657e-06, | |
| "loss": 0.6461, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.1257720381807973, | |
| "grad_norm": 0.9184905886650085, | |
| "learning_rate": 4.17910447761194e-06, | |
| "loss": 0.6439, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.13026389668725435, | |
| "grad_norm": 0.7492115497589111, | |
| "learning_rate": 4.3283582089552236e-06, | |
| "loss": 0.6403, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.13475575519371139, | |
| "grad_norm": 0.6593955159187317, | |
| "learning_rate": 4.477611940298508e-06, | |
| "loss": 0.6358, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.13924761370016844, | |
| "grad_norm": 0.6903685927391052, | |
| "learning_rate": 4.626865671641791e-06, | |
| "loss": 0.6318, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.1437394722066255, | |
| "grad_norm": 0.8214516639709473, | |
| "learning_rate": 4.7761194029850745e-06, | |
| "loss": 0.6202, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.14823133071308253, | |
| "grad_norm": 0.7563803195953369, | |
| "learning_rate": 4.925373134328359e-06, | |
| "loss": 0.6047, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.1527231892195396, | |
| "grad_norm": 0.5525519251823425, | |
| "learning_rate": 5.074626865671642e-06, | |
| "loss": 0.6116, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.15721504772599662, | |
| "grad_norm": 0.5453880429267883, | |
| "learning_rate": 5.2238805970149255e-06, | |
| "loss": 0.6055, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.16170690623245368, | |
| "grad_norm": 0.6540727615356445, | |
| "learning_rate": 5.37313432835821e-06, | |
| "loss": 0.5947, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.16619876473891074, | |
| "grad_norm": 0.650087833404541, | |
| "learning_rate": 5.522388059701493e-06, | |
| "loss": 0.5908, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.17069062324536777, | |
| "grad_norm": 0.5679633617401123, | |
| "learning_rate": 5.671641791044776e-06, | |
| "loss": 0.598, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.17518248175182483, | |
| "grad_norm": 0.44914695620536804, | |
| "learning_rate": 5.820895522388061e-06, | |
| "loss": 0.5927, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.17967434025828186, | |
| "grad_norm": 0.47071295976638794, | |
| "learning_rate": 5.970149253731343e-06, | |
| "loss": 0.596, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.18416619876473891, | |
| "grad_norm": 0.5729812979698181, | |
| "learning_rate": 6.119402985074627e-06, | |
| "loss": 0.5919, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.18865805727119594, | |
| "grad_norm": 0.5159889459609985, | |
| "learning_rate": 6.2686567164179116e-06, | |
| "loss": 0.5767, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.193149915777653, | |
| "grad_norm": 0.39863625168800354, | |
| "learning_rate": 6.417910447761194e-06, | |
| "loss": 0.5741, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.19764177428411006, | |
| "grad_norm": 0.46757975220680237, | |
| "learning_rate": 6.567164179104478e-06, | |
| "loss": 0.5813, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.2021336327905671, | |
| "grad_norm": 0.43369781970977783, | |
| "learning_rate": 6.7164179104477625e-06, | |
| "loss": 0.5664, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.20662549129702415, | |
| "grad_norm": 0.35700419545173645, | |
| "learning_rate": 6.865671641791045e-06, | |
| "loss": 0.5718, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.21111734980348118, | |
| "grad_norm": 0.4011654853820801, | |
| "learning_rate": 7.014925373134329e-06, | |
| "loss": 0.5637, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.21560920830993824, | |
| "grad_norm": 0.43147680163383484, | |
| "learning_rate": 7.164179104477612e-06, | |
| "loss": 0.5688, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.2201010668163953, | |
| "grad_norm": 0.41290047764778137, | |
| "learning_rate": 7.313432835820896e-06, | |
| "loss": 0.5554, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.22459292532285233, | |
| "grad_norm": 0.40266507863998413, | |
| "learning_rate": 7.46268656716418e-06, | |
| "loss": 0.5759, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.22908478382930939, | |
| "grad_norm": 0.30240580439567566, | |
| "learning_rate": 7.611940298507463e-06, | |
| "loss": 0.5586, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.23357664233576642, | |
| "grad_norm": 0.31717273592948914, | |
| "learning_rate": 7.761194029850747e-06, | |
| "loss": 0.5639, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.23806850084222347, | |
| "grad_norm": 0.35987257957458496, | |
| "learning_rate": 7.91044776119403e-06, | |
| "loss": 0.5561, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.2425603593486805, | |
| "grad_norm": 0.3291323781013489, | |
| "learning_rate": 8.059701492537314e-06, | |
| "loss": 0.5595, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.24705221785513756, | |
| "grad_norm": 0.28238141536712646, | |
| "learning_rate": 8.208955223880599e-06, | |
| "loss": 0.5374, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.2515440763615946, | |
| "grad_norm": 0.30300024151802063, | |
| "learning_rate": 8.35820895522388e-06, | |
| "loss": 0.5514, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.25603593486805165, | |
| "grad_norm": 0.27751174569129944, | |
| "learning_rate": 8.507462686567165e-06, | |
| "loss": 0.5465, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.2605277933745087, | |
| "grad_norm": 0.24813084304332733, | |
| "learning_rate": 8.656716417910447e-06, | |
| "loss": 0.5388, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.26501965188096577, | |
| "grad_norm": 0.2552523612976074, | |
| "learning_rate": 8.805970149253732e-06, | |
| "loss": 0.553, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.26951151038742277, | |
| "grad_norm": 0.2615125775337219, | |
| "learning_rate": 8.955223880597016e-06, | |
| "loss": 0.5414, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.27400336889387983, | |
| "grad_norm": 0.2671511173248291, | |
| "learning_rate": 9.104477611940299e-06, | |
| "loss": 0.5356, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.2784952274003369, | |
| "grad_norm": 0.2728058397769928, | |
| "learning_rate": 9.253731343283582e-06, | |
| "loss": 0.5413, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.28298708590679394, | |
| "grad_norm": 0.2823947072029114, | |
| "learning_rate": 9.402985074626867e-06, | |
| "loss": 0.5505, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.287478944413251, | |
| "grad_norm": 0.28685927391052246, | |
| "learning_rate": 9.552238805970149e-06, | |
| "loss": 0.5346, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.291970802919708, | |
| "grad_norm": 0.24975822865962982, | |
| "learning_rate": 9.701492537313434e-06, | |
| "loss": 0.5366, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.29646266142616506, | |
| "grad_norm": 0.2949586808681488, | |
| "learning_rate": 9.850746268656717e-06, | |
| "loss": 0.5247, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.3009545199326221, | |
| "grad_norm": 0.2750069797039032, | |
| "learning_rate": 1e-05, | |
| "loss": 0.538, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.3054463784390792, | |
| "grad_norm": 0.34335073828697205, | |
| "learning_rate": 9.999931232202689e-06, | |
| "loss": 0.5252, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.30993823694553624, | |
| "grad_norm": 0.24552962183952332, | |
| "learning_rate": 9.999724930702358e-06, | |
| "loss": 0.5312, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.31443009545199324, | |
| "grad_norm": 0.3122241795063019, | |
| "learning_rate": 9.999381101173765e-06, | |
| "loss": 0.5281, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.3189219539584503, | |
| "grad_norm": 0.23177407681941986, | |
| "learning_rate": 9.99889975307467e-06, | |
| "loss": 0.5266, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.32341381246490736, | |
| "grad_norm": 0.2342972457408905, | |
| "learning_rate": 9.998280899645575e-06, | |
| "loss": 0.5419, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.3279056709713644, | |
| "grad_norm": 0.26972994208335876, | |
| "learning_rate": 9.997524557909353e-06, | |
| "loss": 0.528, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.3323975294778215, | |
| "grad_norm": 0.2486950308084488, | |
| "learning_rate": 9.996630748670788e-06, | |
| "loss": 0.5285, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.3368893879842785, | |
| "grad_norm": 0.2598794400691986, | |
| "learning_rate": 9.995599496515996e-06, | |
| "loss": 0.5339, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.34138124649073553, | |
| "grad_norm": 0.2780413329601288, | |
| "learning_rate": 9.99443082981175e-06, | |
| "loss": 0.5209, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.3458731049971926, | |
| "grad_norm": 0.23097486793994904, | |
| "learning_rate": 9.993124780704707e-06, | |
| "loss": 0.534, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.35036496350364965, | |
| "grad_norm": 0.23875422775745392, | |
| "learning_rate": 9.991681385120515e-06, | |
| "loss": 0.5152, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.35485682201010665, | |
| "grad_norm": 0.28855064511299133, | |
| "learning_rate": 9.99010068276283e-06, | |
| "loss": 0.5279, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.3593486805165637, | |
| "grad_norm": 0.24532583355903625, | |
| "learning_rate": 9.988382717112213e-06, | |
| "loss": 0.53, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.36384053902302077, | |
| "grad_norm": 0.2871251404285431, | |
| "learning_rate": 9.986527535424956e-06, | |
| "loss": 0.529, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.36833239752947783, | |
| "grad_norm": 0.23627132177352905, | |
| "learning_rate": 9.98453518873176e-06, | |
| "loss": 0.5331, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.3728242560359349, | |
| "grad_norm": 0.2671157419681549, | |
| "learning_rate": 9.982405731836343e-06, | |
| "loss": 0.5241, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.3773161145423919, | |
| "grad_norm": 0.2739470899105072, | |
| "learning_rate": 9.980139223313926e-06, | |
| "loss": 0.5232, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.38180797304884895, | |
| "grad_norm": 0.2618931531906128, | |
| "learning_rate": 9.977735725509632e-06, | |
| "loss": 0.5261, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.386299831555306, | |
| "grad_norm": 0.29810863733291626, | |
| "learning_rate": 9.97519530453676e-06, | |
| "loss": 0.5089, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.39079169006176306, | |
| "grad_norm": 0.26110532879829407, | |
| "learning_rate": 9.97251803027497e-06, | |
| "loss": 0.5082, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.3952835485682201, | |
| "grad_norm": 0.23007212579250336, | |
| "learning_rate": 9.969703976368368e-06, | |
| "loss": 0.5128, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.3997754070746771, | |
| "grad_norm": 0.2656469941139221, | |
| "learning_rate": 9.966753220223466e-06, | |
| "loss": 0.5213, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.4042672655811342, | |
| "grad_norm": 0.2582208812236786, | |
| "learning_rate": 9.963665843007066e-06, | |
| "loss": 0.5192, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.40875912408759124, | |
| "grad_norm": 0.24596616625785828, | |
| "learning_rate": 9.960441929644017e-06, | |
| "loss": 0.52, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.4132509825940483, | |
| "grad_norm": 0.22645823657512665, | |
| "learning_rate": 9.95708156881489e-06, | |
| "loss": 0.5114, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.41774284110050536, | |
| "grad_norm": 0.2343415766954422, | |
| "learning_rate": 9.95358485295353e-06, | |
| "loss": 0.5124, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.42223469960696236, | |
| "grad_norm": 0.22430844604969025, | |
| "learning_rate": 9.949951878244514e-06, | |
| "loss": 0.5202, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.4267265581134194, | |
| "grad_norm": 0.2198512703180313, | |
| "learning_rate": 9.946182744620512e-06, | |
| "loss": 0.5198, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.4312184166198765, | |
| "grad_norm": 0.2237306386232376, | |
| "learning_rate": 9.94227755575953e-06, | |
| "loss": 0.4995, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.43571027512633353, | |
| "grad_norm": 0.2366565316915512, | |
| "learning_rate": 9.93823641908206e-06, | |
| "loss": 0.5182, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.4402021336327906, | |
| "grad_norm": 0.24728058278560638, | |
| "learning_rate": 9.934059445748134e-06, | |
| "loss": 0.5124, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.4446939921392476, | |
| "grad_norm": 0.22210274636745453, | |
| "learning_rate": 9.92974675065425e-06, | |
| "loss": 0.4915, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.44918585064570465, | |
| "grad_norm": 0.22185443341732025, | |
| "learning_rate": 9.925298452430225e-06, | |
| "loss": 0.5033, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4536777091521617, | |
| "grad_norm": 0.278481662273407, | |
| "learning_rate": 9.920714673435931e-06, | |
| "loss": 0.5156, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.45816956765861877, | |
| "grad_norm": 0.2632293999195099, | |
| "learning_rate": 9.915995539757918e-06, | |
| "loss": 0.5114, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.4626614261650758, | |
| "grad_norm": 0.29412922263145447, | |
| "learning_rate": 9.91114118120596e-06, | |
| "loss": 0.5117, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.46715328467153283, | |
| "grad_norm": 0.2760590612888336, | |
| "learning_rate": 9.906151731309472e-06, | |
| "loss": 0.5013, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.4716451431779899, | |
| "grad_norm": 0.2319260984659195, | |
| "learning_rate": 9.901027327313847e-06, | |
| "loss": 0.4969, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.47613700168444695, | |
| "grad_norm": 0.22962868213653564, | |
| "learning_rate": 9.895768110176677e-06, | |
| "loss": 0.5045, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.480628860190904, | |
| "grad_norm": 0.2626447081565857, | |
| "learning_rate": 9.890374224563872e-06, | |
| "loss": 0.5033, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.485120718697361, | |
| "grad_norm": 0.2323921024799347, | |
| "learning_rate": 9.884845818845685e-06, | |
| "loss": 0.5136, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.48961257720381807, | |
| "grad_norm": 0.32458624243736267, | |
| "learning_rate": 9.879183045092628e-06, | |
| "loss": 0.5142, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.4941044357102751, | |
| "grad_norm": 0.23757074773311615, | |
| "learning_rate": 9.873386059071294e-06, | |
| "loss": 0.5086, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.4985962942167322, | |
| "grad_norm": 0.27966201305389404, | |
| "learning_rate": 9.86745502024007e-06, | |
| "loss": 0.5095, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.5030881527231892, | |
| "grad_norm": 0.2681194543838501, | |
| "learning_rate": 9.861390091744738e-06, | |
| "loss": 0.4988, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.5075800112296462, | |
| "grad_norm": 0.24083542823791504, | |
| "learning_rate": 9.855191440414014e-06, | |
| "loss": 0.5152, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.5120718697361033, | |
| "grad_norm": 0.2831341326236725, | |
| "learning_rate": 9.848859236754936e-06, | |
| "loss": 0.5098, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.5165637282425604, | |
| "grad_norm": 0.27346932888031006, | |
| "learning_rate": 9.84239365494818e-06, | |
| "loss": 0.497, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.5210555867490174, | |
| "grad_norm": 0.27766865491867065, | |
| "learning_rate": 9.835794872843281e-06, | |
| "loss": 0.5183, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.5255474452554745, | |
| "grad_norm": 0.26060062646865845, | |
| "learning_rate": 9.829063071953715e-06, | |
| "loss": 0.5031, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.5300393037619315, | |
| "grad_norm": 0.2610337734222412, | |
| "learning_rate": 9.822198437451933e-06, | |
| "loss": 0.5094, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.5345311622683886, | |
| "grad_norm": 0.2221408635377884, | |
| "learning_rate": 9.815201158164253e-06, | |
| "loss": 0.5015, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.5390230207748455, | |
| "grad_norm": 0.2650430202484131, | |
| "learning_rate": 9.808071426565671e-06, | |
| "loss": 0.5021, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5435148792813026, | |
| "grad_norm": 0.2397095412015915, | |
| "learning_rate": 9.800809438774557e-06, | |
| "loss": 0.5096, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.5480067377877597, | |
| "grad_norm": 0.3001818358898163, | |
| "learning_rate": 9.793415394547274e-06, | |
| "loss": 0.5019, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.5524985962942167, | |
| "grad_norm": 0.22701479494571686, | |
| "learning_rate": 9.785889497272678e-06, | |
| "loss": 0.4856, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.5569904548006738, | |
| "grad_norm": 0.32053694128990173, | |
| "learning_rate": 9.778231953966519e-06, | |
| "loss": 0.5132, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.5614823133071308, | |
| "grad_norm": 0.2665109932422638, | |
| "learning_rate": 9.770442975265753e-06, | |
| "loss": 0.5144, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5659741718135879, | |
| "grad_norm": 0.2617216408252716, | |
| "learning_rate": 9.762522775422741e-06, | |
| "loss": 0.4918, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.570466030320045, | |
| "grad_norm": 0.27840656042099, | |
| "learning_rate": 9.754471572299363e-06, | |
| "loss": 0.494, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.574957888826502, | |
| "grad_norm": 0.2666041851043701, | |
| "learning_rate": 9.746289587361021e-06, | |
| "loss": 0.4986, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.5794497473329591, | |
| "grad_norm": 0.27760958671569824, | |
| "learning_rate": 9.737977045670549e-06, | |
| "loss": 0.5042, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.583941605839416, | |
| "grad_norm": 0.2891870141029358, | |
| "learning_rate": 9.729534175882016e-06, | |
| "loss": 0.4888, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5884334643458731, | |
| "grad_norm": 0.24741241335868835, | |
| "learning_rate": 9.72096121023445e-06, | |
| "loss": 0.5037, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.5929253228523301, | |
| "grad_norm": 0.3258378803730011, | |
| "learning_rate": 9.712258384545432e-06, | |
| "loss": 0.4874, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.5974171813587872, | |
| "grad_norm": 0.26103854179382324, | |
| "learning_rate": 9.703425938204627e-06, | |
| "loss": 0.4928, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.6019090398652442, | |
| "grad_norm": 0.2776138186454773, | |
| "learning_rate": 9.694464114167185e-06, | |
| "loss": 0.4993, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.6064008983717013, | |
| "grad_norm": 0.2927091717720032, | |
| "learning_rate": 9.685373158947067e-06, | |
| "loss": 0.4993, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.6108927568781584, | |
| "grad_norm": 0.31002023816108704, | |
| "learning_rate": 9.676153322610259e-06, | |
| "loss": 0.5032, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 0.31717565655708313, | |
| "learning_rate": 9.666804858767894e-06, | |
| "loss": 0.4964, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.6198764738910725, | |
| "grad_norm": 0.2911582887172699, | |
| "learning_rate": 9.65732802456928e-06, | |
| "loss": 0.4929, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.6243683323975294, | |
| "grad_norm": 0.315828800201416, | |
| "learning_rate": 9.647723080694822e-06, | |
| "loss": 0.4988, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.6288601909039865, | |
| "grad_norm": 0.2951522171497345, | |
| "learning_rate": 9.637990291348853e-06, | |
| "loss": 0.4908, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6333520494104435, | |
| "grad_norm": 0.26586809754371643, | |
| "learning_rate": 9.628129924252368e-06, | |
| "loss": 0.5017, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.6378439079169006, | |
| "grad_norm": 0.33742159605026245, | |
| "learning_rate": 9.618142250635658e-06, | |
| "loss": 0.4917, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.6423357664233577, | |
| "grad_norm": 0.24248960614204407, | |
| "learning_rate": 9.608027545230847e-06, | |
| "loss": 0.4844, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.6468276249298147, | |
| "grad_norm": 0.26963257789611816, | |
| "learning_rate": 9.597786086264339e-06, | |
| "loss": 0.4869, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.6513194834362718, | |
| "grad_norm": 0.2914552390575409, | |
| "learning_rate": 9.587418155449167e-06, | |
| "loss": 0.503, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.6558113419427288, | |
| "grad_norm": 0.2754093408584595, | |
| "learning_rate": 9.576924037977233e-06, | |
| "loss": 0.496, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.6603032004491859, | |
| "grad_norm": 0.278185099363327, | |
| "learning_rate": 9.566304022511477e-06, | |
| "loss": 0.4944, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.664795058955643, | |
| "grad_norm": 0.24680428206920624, | |
| "learning_rate": 9.555558401177927e-06, | |
| "loss": 0.4931, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.6692869174620999, | |
| "grad_norm": 0.29219385981559753, | |
| "learning_rate": 9.544687469557667e-06, | |
| "loss": 0.4965, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.673778775968557, | |
| "grad_norm": 0.27283161878585815, | |
| "learning_rate": 9.533691526678705e-06, | |
| "loss": 0.4893, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.678270634475014, | |
| "grad_norm": 0.2558472156524658, | |
| "learning_rate": 9.52257087500775e-06, | |
| "loss": 0.5045, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.6827624929814711, | |
| "grad_norm": 0.2973162829875946, | |
| "learning_rate": 9.51132582044189e-06, | |
| "loss": 0.512, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.6872543514879281, | |
| "grad_norm": 0.3723624050617218, | |
| "learning_rate": 9.49995667230018e-06, | |
| "loss": 0.4986, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.6917462099943852, | |
| "grad_norm": 0.24720445275306702, | |
| "learning_rate": 9.488463743315126e-06, | |
| "loss": 0.4925, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.6962380685008422, | |
| "grad_norm": 0.37346410751342773, | |
| "learning_rate": 9.476847349624097e-06, | |
| "loss": 0.4974, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.7007299270072993, | |
| "grad_norm": 0.31400275230407715, | |
| "learning_rate": 9.46510781076061e-06, | |
| "loss": 0.4942, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.7052217855137564, | |
| "grad_norm": 0.29658353328704834, | |
| "learning_rate": 9.453245449645562e-06, | |
| "loss": 0.4865, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.7097136440202133, | |
| "grad_norm": 0.2616643011569977, | |
| "learning_rate": 9.44126059257833e-06, | |
| "loss": 0.4932, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.7142055025266704, | |
| "grad_norm": 0.2881576716899872, | |
| "learning_rate": 9.4291535692278e-06, | |
| "loss": 0.5062, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.7186973610331274, | |
| "grad_norm": 0.24668999016284943, | |
| "learning_rate": 9.416924712623305e-06, | |
| "loss": 0.4994, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.7231892195395845, | |
| "grad_norm": 0.2936095893383026, | |
| "learning_rate": 9.40457435914546e-06, | |
| "loss": 0.4885, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.7276810780460415, | |
| "grad_norm": 0.25287431478500366, | |
| "learning_rate": 9.392102848516901e-06, | |
| "loss": 0.4906, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.7321729365524986, | |
| "grad_norm": 0.2542520761489868, | |
| "learning_rate": 9.37951052379296e-06, | |
| "loss": 0.4747, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.7366647950589557, | |
| "grad_norm": 0.25072625279426575, | |
| "learning_rate": 9.36679773135221e-06, | |
| "loss": 0.4799, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.7411566535654127, | |
| "grad_norm": 0.24300134181976318, | |
| "learning_rate": 9.353964820886938e-06, | |
| "loss": 0.4963, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.7456485120718698, | |
| "grad_norm": 0.24435429275035858, | |
| "learning_rate": 9.341012145393546e-06, | |
| "loss": 0.4961, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.7501403705783268, | |
| "grad_norm": 0.27164775133132935, | |
| "learning_rate": 9.327940061162816e-06, | |
| "loss": 0.5013, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.7546322290847838, | |
| "grad_norm": 0.24927161633968353, | |
| "learning_rate": 9.314748927770126e-06, | |
| "loss": 0.4843, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.7591240875912408, | |
| "grad_norm": 0.2827945649623871, | |
| "learning_rate": 9.301439108065546e-06, | |
| "loss": 0.4902, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.7636159460976979, | |
| "grad_norm": 0.22944357991218567, | |
| "learning_rate": 9.288010968163874e-06, | |
| "loss": 0.4886, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.768107804604155, | |
| "grad_norm": 0.2653137445449829, | |
| "learning_rate": 9.274464877434548e-06, | |
| "loss": 0.4974, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.772599663110612, | |
| "grad_norm": 0.26817506551742554, | |
| "learning_rate": 9.260801208491499e-06, | |
| "loss": 0.4878, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.7770915216170691, | |
| "grad_norm": 0.21729539334774017, | |
| "learning_rate": 9.247020337182892e-06, | |
| "loss": 0.4778, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.7815833801235261, | |
| "grad_norm": 0.2882274091243744, | |
| "learning_rate": 9.233122642580796e-06, | |
| "loss": 0.4924, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.7860752386299832, | |
| "grad_norm": 0.2678658962249756, | |
| "learning_rate": 9.219108506970747e-06, | |
| "loss": 0.496, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.7905670971364402, | |
| "grad_norm": 0.3008089065551758, | |
| "learning_rate": 9.204978315841238e-06, | |
| "loss": 0.4947, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.7950589556428973, | |
| "grad_norm": 0.25311562418937683, | |
| "learning_rate": 9.19073245787312e-06, | |
| "loss": 0.4751, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.7995508141493542, | |
| "grad_norm": 0.31886932253837585, | |
| "learning_rate": 9.1763713249289e-06, | |
| "loss": 0.4831, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.8040426726558113, | |
| "grad_norm": 0.2668146789073944, | |
| "learning_rate": 9.16189531204197e-06, | |
| "loss": 0.4866, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.8085345311622684, | |
| "grad_norm": 0.25939807295799255, | |
| "learning_rate": 9.147304817405741e-06, | |
| "loss": 0.4928, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.8130263896687254, | |
| "grad_norm": 0.27832671999931335, | |
| "learning_rate": 9.132600242362682e-06, | |
| "loss": 0.5013, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.8175182481751825, | |
| "grad_norm": 0.24878981709480286, | |
| "learning_rate": 9.117781991393285e-06, | |
| "loss": 0.4839, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.8220101066816395, | |
| "grad_norm": 0.2747175097465515, | |
| "learning_rate": 9.102850472104945e-06, | |
| "loss": 0.4888, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.8265019651880966, | |
| "grad_norm": 0.2612956762313843, | |
| "learning_rate": 9.08780609522074e-06, | |
| "loss": 0.481, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.8309938236945537, | |
| "grad_norm": 0.2580430209636688, | |
| "learning_rate": 9.07264927456813e-06, | |
| "loss": 0.4806, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.8354856822010107, | |
| "grad_norm": 0.31817421317100525, | |
| "learning_rate": 9.057380427067584e-06, | |
| "loss": 0.4864, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.8399775407074677, | |
| "grad_norm": 0.28587251901626587, | |
| "learning_rate": 9.04199997272111e-06, | |
| "loss": 0.4882, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.8444693992139247, | |
| "grad_norm": 0.2851974368095398, | |
| "learning_rate": 9.02650833460069e-06, | |
| "loss": 0.4786, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.8489612577203818, | |
| "grad_norm": 0.2742379605770111, | |
| "learning_rate": 9.01090593883666e-06, | |
| "loss": 0.4793, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.8534531162268388, | |
| "grad_norm": 0.29192987084388733, | |
| "learning_rate": 8.995193214605972e-06, | |
| "loss": 0.495, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.8579449747332959, | |
| "grad_norm": 0.3218548893928528, | |
| "learning_rate": 8.979370594120404e-06, | |
| "loss": 0.4962, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.862436833239753, | |
| "grad_norm": 0.21388249099254608, | |
| "learning_rate": 8.963438512614657e-06, | |
| "loss": 0.4923, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.86692869174621, | |
| "grad_norm": 0.29332301020622253, | |
| "learning_rate": 8.94739740833439e-06, | |
| "loss": 0.4883, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.8714205502526671, | |
| "grad_norm": 0.2745992839336395, | |
| "learning_rate": 8.93124772252417e-06, | |
| "loss": 0.4992, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.8759124087591241, | |
| "grad_norm": 0.22587467730045319, | |
| "learning_rate": 8.914989899415323e-06, | |
| "loss": 0.4741, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.8804042672655812, | |
| "grad_norm": 0.2980908453464508, | |
| "learning_rate": 8.898624386213724e-06, | |
| "loss": 0.4777, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.8848961257720381, | |
| "grad_norm": 0.3256934881210327, | |
| "learning_rate": 8.88215163308749e-06, | |
| "loss": 0.4859, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.8893879842784952, | |
| "grad_norm": 0.27209722995758057, | |
| "learning_rate": 8.8655720931546e-06, | |
| "loss": 0.4948, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.8938798427849522, | |
| "grad_norm": 0.26153564453125, | |
| "learning_rate": 8.84888622247043e-06, | |
| "loss": 0.4844, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.8983717012914093, | |
| "grad_norm": 0.3023647367954254, | |
| "learning_rate": 8.832094480015211e-06, | |
| "loss": 0.4711, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.9028635597978664, | |
| "grad_norm": 0.2709749937057495, | |
| "learning_rate": 8.815197327681399e-06, | |
| "loss": 0.4839, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.9073554183043234, | |
| "grad_norm": 0.2602469325065613, | |
| "learning_rate": 8.798195230260973e-06, | |
| "loss": 0.5001, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.9118472768107805, | |
| "grad_norm": 0.2420167326927185, | |
| "learning_rate": 8.781088655432648e-06, | |
| "loss": 0.4774, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.9163391353172375, | |
| "grad_norm": 0.2579791843891144, | |
| "learning_rate": 8.763878073749013e-06, | |
| "loss": 0.4784, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.9208309938236946, | |
| "grad_norm": 0.24879693984985352, | |
| "learning_rate": 8.746563958623584e-06, | |
| "loss": 0.4879, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.9253228523301515, | |
| "grad_norm": 0.24991406500339508, | |
| "learning_rate": 8.729146786317787e-06, | |
| "loss": 0.4931, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.9298147108366086, | |
| "grad_norm": 0.285593181848526, | |
| "learning_rate": 8.711627035927848e-06, | |
| "loss": 0.4815, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.9343065693430657, | |
| "grad_norm": 0.2333066463470459, | |
| "learning_rate": 8.694005189371627e-06, | |
| "loss": 0.4794, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.9387984278495227, | |
| "grad_norm": 0.271056592464447, | |
| "learning_rate": 8.676281731375355e-06, | |
| "loss": 0.4893, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.9432902863559798, | |
| "grad_norm": 0.24806849658489227, | |
| "learning_rate": 8.658457149460296e-06, | |
| "loss": 0.4929, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.9477821448624368, | |
| "grad_norm": 0.25548022985458374, | |
| "learning_rate": 8.640531933929344e-06, | |
| "loss": 0.4828, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.9522740033688939, | |
| "grad_norm": 0.2774530053138733, | |
| "learning_rate": 8.622506577853538e-06, | |
| "loss": 0.4764, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.956765861875351, | |
| "grad_norm": 0.2682695984840393, | |
| "learning_rate": 8.604381577058486e-06, | |
| "loss": 0.476, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.961257720381808, | |
| "grad_norm": 0.24734169244766235, | |
| "learning_rate": 8.586157430110747e-06, | |
| "loss": 0.4858, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.9657495788882651, | |
| "grad_norm": 0.2741633951663971, | |
| "learning_rate": 8.56783463830409e-06, | |
| "loss": 0.4647, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.970241437394722, | |
| "grad_norm": 0.2486443966627121, | |
| "learning_rate": 8.549413705645737e-06, | |
| "loss": 0.481, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.9747332959011791, | |
| "grad_norm": 0.25736650824546814, | |
| "learning_rate": 8.530895138842467e-06, | |
| "loss": 0.4764, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.9792251544076361, | |
| "grad_norm": 0.22502939403057098, | |
| "learning_rate": 8.512279447286704e-06, | |
| "loss": 0.484, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.9837170129140932, | |
| "grad_norm": 0.24173305928707123, | |
| "learning_rate": 8.493567143042485e-06, | |
| "loss": 0.4827, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.9882088714205502, | |
| "grad_norm": 0.20804060995578766, | |
| "learning_rate": 8.47475874083139e-06, | |
| "loss": 0.4963, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.9927007299270073, | |
| "grad_norm": 0.24134287238121033, | |
| "learning_rate": 8.455854758018377e-06, | |
| "loss": 0.4844, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.9971925884334644, | |
| "grad_norm": 0.23423220217227936, | |
| "learning_rate": 8.436855714597548e-06, | |
| "loss": 0.4819, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.0030881527231892, | |
| "grad_norm": 0.4805089831352234, | |
| "learning_rate": 8.417762133177849e-06, | |
| "loss": 0.7888, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.0075800112296462, | |
| "grad_norm": 0.43639126420021057, | |
| "learning_rate": 8.398574538968697e-06, | |
| "loss": 0.4816, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.0120718697361033, | |
| "grad_norm": 0.36428216099739075, | |
| "learning_rate": 8.379293459765527e-06, | |
| "loss": 0.4509, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.0165637282425604, | |
| "grad_norm": 0.33154451847076416, | |
| "learning_rate": 8.359919425935276e-06, | |
| "loss": 0.4652, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.0210555867490174, | |
| "grad_norm": 0.28618019819259644, | |
| "learning_rate": 8.340452970401798e-06, | |
| "loss": 0.462, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.0255474452554745, | |
| "grad_norm": 0.35930895805358887, | |
| "learning_rate": 8.3208946286312e-06, | |
| "loss": 0.485, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.0300393037619315, | |
| "grad_norm": 0.29257896542549133, | |
| "learning_rate": 8.301244938617117e-06, | |
| "loss": 0.4649, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.0345311622683886, | |
| "grad_norm": 0.28928303718566895, | |
| "learning_rate": 8.281504440865905e-06, | |
| "loss": 0.4828, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.0390230207748457, | |
| "grad_norm": 0.3513385057449341, | |
| "learning_rate": 8.261673678381786e-06, | |
| "loss": 0.4787, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.0435148792813027, | |
| "grad_norm": 0.334462970495224, | |
| "learning_rate": 8.241753196651903e-06, | |
| "loss": 0.4859, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.0480067377877598, | |
| "grad_norm": 0.2647038996219635, | |
| "learning_rate": 8.221743543631314e-06, | |
| "loss": 0.4701, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.0524985962942168, | |
| "grad_norm": 0.36219117045402527, | |
| "learning_rate": 8.201645269727924e-06, | |
| "loss": 0.4675, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.0569904548006739, | |
| "grad_norm": 0.3206924796104431, | |
| "learning_rate": 8.181458927787347e-06, | |
| "loss": 0.4782, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.0614823133071307, | |
| "grad_norm": 0.29768282175064087, | |
| "learning_rate": 8.161185073077686e-06, | |
| "loss": 0.4695, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.0659741718135878, | |
| "grad_norm": 0.32597997784614563, | |
| "learning_rate": 8.140824263274278e-06, | |
| "loss": 0.4546, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.0704660303200448, | |
| "grad_norm": 0.26331695914268494, | |
| "learning_rate": 8.120377058444336e-06, | |
| "loss": 0.4749, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.074957888826502, | |
| "grad_norm": 0.2784285843372345, | |
| "learning_rate": 8.099844021031559e-06, | |
| "loss": 0.4664, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.079449747332959, | |
| "grad_norm": 0.30415502190589905, | |
| "learning_rate": 8.079225715840646e-06, | |
| "loss": 0.4611, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.083941605839416, | |
| "grad_norm": 0.2318006455898285, | |
| "learning_rate": 8.058522710021773e-06, | |
| "loss": 0.4647, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.088433464345873, | |
| "grad_norm": 0.3010203242301941, | |
| "learning_rate": 8.037735573054979e-06, | |
| "loss": 0.4778, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.0929253228523301, | |
| "grad_norm": 0.29548323154449463, | |
| "learning_rate": 8.016864876734514e-06, | |
| "loss": 0.4679, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.0974171813587872, | |
| "grad_norm": 0.2665966749191284, | |
| "learning_rate": 7.995911195153105e-06, | |
| "loss": 0.4559, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.1019090398652442, | |
| "grad_norm": 0.2814834713935852, | |
| "learning_rate": 7.974875104686164e-06, | |
| "loss": 0.4668, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.1064008983717013, | |
| "grad_norm": 0.3019580841064453, | |
| "learning_rate": 7.95375718397593e-06, | |
| "loss": 0.4915, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.1108927568781584, | |
| "grad_norm": 0.2376527190208435, | |
| "learning_rate": 7.932558013915561e-06, | |
| "loss": 0.4494, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.1153846153846154, | |
| "grad_norm": 0.24670954048633575, | |
| "learning_rate": 7.911278177633151e-06, | |
| "loss": 0.468, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.1198764738910725, | |
| "grad_norm": 0.25502070784568787, | |
| "learning_rate": 7.889918260475685e-06, | |
| "loss": 0.4723, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.1243683323975295, | |
| "grad_norm": 0.23503544926643372, | |
| "learning_rate": 7.868478849992944e-06, | |
| "loss": 0.4913, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.1288601909039866, | |
| "grad_norm": 0.2687453031539917, | |
| "learning_rate": 7.846960535921344e-06, | |
| "loss": 0.468, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.1333520494104437, | |
| "grad_norm": 0.2521950304508209, | |
| "learning_rate": 7.825363910167709e-06, | |
| "loss": 0.4421, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.1378439079169007, | |
| "grad_norm": 0.240973562002182, | |
| "learning_rate": 7.803689566792989e-06, | |
| "loss": 0.4902, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.1423357664233578, | |
| "grad_norm": 0.254777729511261, | |
| "learning_rate": 7.781938101995928e-06, | |
| "loss": 0.4518, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.1468276249298146, | |
| "grad_norm": 0.2425854653120041, | |
| "learning_rate": 7.76011011409665e-06, | |
| "loss": 0.4836, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.1513194834362717, | |
| "grad_norm": 0.256160169839859, | |
| "learning_rate": 7.738206203520223e-06, | |
| "loss": 0.4478, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.1558113419427287, | |
| "grad_norm": 0.2657393515110016, | |
| "learning_rate": 7.716226972780111e-06, | |
| "loss": 0.4838, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.1603032004491858, | |
| "grad_norm": 0.2680897116661072, | |
| "learning_rate": 7.694173026461634e-06, | |
| "loss": 0.465, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.1647950589556428, | |
| "grad_norm": 0.37795037031173706, | |
| "learning_rate": 7.672044971205315e-06, | |
| "loss": 0.481, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.1692869174621, | |
| "grad_norm": 0.24703949689865112, | |
| "learning_rate": 7.649843415690198e-06, | |
| "loss": 0.4517, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.173778775968557, | |
| "grad_norm": 0.3117562532424927, | |
| "learning_rate": 7.627568970617114e-06, | |
| "loss": 0.4445, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.178270634475014, | |
| "grad_norm": 0.2995341420173645, | |
| "learning_rate": 7.6052222486918725e-06, | |
| "loss": 0.4928, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.182762492981471, | |
| "grad_norm": 0.2563771903514862, | |
| "learning_rate": 7.582803864608411e-06, | |
| "loss": 0.462, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.1872543514879281, | |
| "grad_norm": 0.30361172556877136, | |
| "learning_rate": 7.560314435031886e-06, | |
| "loss": 0.4723, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.1917462099943852, | |
| "grad_norm": 0.24592924118041992, | |
| "learning_rate": 7.537754578581711e-06, | |
| "loss": 0.4569, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.1962380685008422, | |
| "grad_norm": 0.28482651710510254, | |
| "learning_rate": 7.51512491581454e-06, | |
| "loss": 0.474, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.2007299270072993, | |
| "grad_norm": 0.28539368510246277, | |
| "learning_rate": 7.4924260692072e-06, | |
| "loss": 0.4665, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.2052217855137564, | |
| "grad_norm": 0.2118852585554123, | |
| "learning_rate": 7.4696586631395626e-06, | |
| "loss": 0.4665, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.2097136440202134, | |
| "grad_norm": 0.27248629927635193, | |
| "learning_rate": 7.446823323877375e-06, | |
| "loss": 0.4749, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.2142055025266705, | |
| "grad_norm": 0.24900004267692566, | |
| "learning_rate": 7.423920679555029e-06, | |
| "loss": 0.4492, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.2186973610331275, | |
| "grad_norm": 0.23359505832195282, | |
| "learning_rate": 7.400951360158285e-06, | |
| "loss": 0.4866, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.2231892195395846, | |
| "grad_norm": 0.24799376726150513, | |
| "learning_rate": 7.377915997506945e-06, | |
| "loss": 0.4448, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.2276810780460417, | |
| "grad_norm": 0.29386043548583984, | |
| "learning_rate": 7.354815225237468e-06, | |
| "loss": 0.4617, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.2321729365524985, | |
| "grad_norm": 0.2598021924495697, | |
| "learning_rate": 7.331649678785545e-06, | |
| "loss": 0.4761, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.2366647950589555, | |
| "grad_norm": 0.2887316942214966, | |
| "learning_rate": 7.308419995368616e-06, | |
| "loss": 0.4835, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.2411566535654126, | |
| "grad_norm": 0.2421315759420395, | |
| "learning_rate": 7.285126813968346e-06, | |
| "loss": 0.4417, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.2456485120718697, | |
| "grad_norm": 0.296011358499527, | |
| "learning_rate": 7.2617707753130465e-06, | |
| "loss": 0.4778, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.2501403705783267, | |
| "grad_norm": 0.24542121589183807, | |
| "learning_rate": 7.238352521860049e-06, | |
| "loss": 0.4504, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.2546322290847838, | |
| "grad_norm": 0.2937444746494293, | |
| "learning_rate": 7.214872697778037e-06, | |
| "loss": 0.468, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.2591240875912408, | |
| "grad_norm": 0.2538372278213501, | |
| "learning_rate": 7.191331948929323e-06, | |
| "loss": 0.4629, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.263615946097698, | |
| "grad_norm": 0.2601139545440674, | |
| "learning_rate": 7.1677309228520865e-06, | |
| "loss": 0.4675, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.268107804604155, | |
| "grad_norm": 0.254896342754364, | |
| "learning_rate": 7.14407026874256e-06, | |
| "loss": 0.4657, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.272599663110612, | |
| "grad_norm": 0.18791675567626953, | |
| "learning_rate": 7.120350637437166e-06, | |
| "loss": 0.4638, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.277091521617069, | |
| "grad_norm": 0.3108649253845215, | |
| "learning_rate": 7.096572681394625e-06, | |
| "loss": 0.4667, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.2815833801235261, | |
| "grad_norm": 0.21536403894424438, | |
| "learning_rate": 7.072737054678004e-06, | |
| "loss": 0.468, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.2860752386299832, | |
| "grad_norm": 0.24769152700901031, | |
| "learning_rate": 7.048844412936718e-06, | |
| "loss": 0.4655, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.2905670971364402, | |
| "grad_norm": 0.2215883433818817, | |
| "learning_rate": 7.024895413388508e-06, | |
| "loss": 0.4816, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.2950589556428973, | |
| "grad_norm": 0.21110257506370544, | |
| "learning_rate": 7.000890714801352e-06, | |
| "loss": 0.4655, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.2995508141493544, | |
| "grad_norm": 0.21130134165287018, | |
| "learning_rate": 6.976830977475346e-06, | |
| "loss": 0.444, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.3040426726558114, | |
| "grad_norm": 0.2305009365081787, | |
| "learning_rate": 6.952716863224551e-06, | |
| "loss": 0.4665, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.3085345311622683, | |
| "grad_norm": 0.24490021169185638, | |
| "learning_rate": 6.928549035358772e-06, | |
| "loss": 0.4675, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.3130263896687255, | |
| "grad_norm": 0.20088279247283936, | |
| "learning_rate": 6.904328158665323e-06, | |
| "loss": 0.4589, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.3175182481751824, | |
| "grad_norm": 0.25551822781562805, | |
| "learning_rate": 6.880054899390744e-06, | |
| "loss": 0.4707, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.3220101066816397, | |
| "grad_norm": 0.21504874527454376, | |
| "learning_rate": 6.855729925222462e-06, | |
| "loss": 0.4633, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.3265019651880965, | |
| "grad_norm": 0.2354370504617691, | |
| "learning_rate": 6.831353905270433e-06, | |
| "loss": 0.4739, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.3309938236945535, | |
| "grad_norm": 0.22273460030555725, | |
| "learning_rate": 6.806927510048738e-06, | |
| "loss": 0.4676, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.3354856822010106, | |
| "grad_norm": 0.20321142673492432, | |
| "learning_rate": 6.782451411457137e-06, | |
| "loss": 0.4581, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.3399775407074677, | |
| "grad_norm": 0.21091462671756744, | |
| "learning_rate": 6.757926282762583e-06, | |
| "loss": 0.4603, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.3444693992139247, | |
| "grad_norm": 0.22511275112628937, | |
| "learning_rate": 6.733352798580708e-06, | |
| "loss": 0.4584, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.3489612577203818, | |
| "grad_norm": 0.20873034000396729, | |
| "learning_rate": 6.7087316348572626e-06, | |
| "loss": 0.4768, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.3534531162268388, | |
| "grad_norm": 0.19452911615371704, | |
| "learning_rate": 6.684063468849528e-06, | |
| "loss": 0.4687, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.357944974733296, | |
| "grad_norm": 0.2007768303155899, | |
| "learning_rate": 6.659348979107679e-06, | |
| "loss": 0.4602, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.362436833239753, | |
| "grad_norm": 0.18347479403018951, | |
| "learning_rate": 6.634588845456122e-06, | |
| "loss": 0.4565, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.36692869174621, | |
| "grad_norm": 0.2227877527475357, | |
| "learning_rate": 6.609783748974802e-06, | |
| "loss": 0.4554, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.371420550252667, | |
| "grad_norm": 0.2087058275938034, | |
| "learning_rate": 6.584934371980452e-06, | |
| "loss": 0.4552, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.3759124087591241, | |
| "grad_norm": 0.24951022863388062, | |
| "learning_rate": 6.560041398007847e-06, | |
| "loss": 0.46, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.3804042672655812, | |
| "grad_norm": 0.1927427351474762, | |
| "learning_rate": 6.535105511790979e-06, | |
| "loss": 0.4441, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.3848961257720382, | |
| "grad_norm": 0.2886252999305725, | |
| "learning_rate": 6.510127399244235e-06, | |
| "loss": 0.4769, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.3893879842784953, | |
| "grad_norm": 0.2261786013841629, | |
| "learning_rate": 6.485107747443529e-06, | |
| "loss": 0.4552, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.3938798427849521, | |
| "grad_norm": 0.22381217777729034, | |
| "learning_rate": 6.460047244607397e-06, | |
| "loss": 0.4576, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.3983717012914094, | |
| "grad_norm": 0.21512459218502045, | |
| "learning_rate": 6.434946580078072e-06, | |
| "loss": 0.4417, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.4028635597978663, | |
| "grad_norm": 0.2732483446598053, | |
| "learning_rate": 6.409806444302519e-06, | |
| "loss": 0.4635, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.4073554183043235, | |
| "grad_norm": 0.2072446048259735, | |
| "learning_rate": 6.384627528813439e-06, | |
| "loss": 0.4544, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.4118472768107804, | |
| "grad_norm": 0.3143913447856903, | |
| "learning_rate": 6.359410526210259e-06, | |
| "loss": 0.49, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.4163391353172374, | |
| "grad_norm": 0.26985523104667664, | |
| "learning_rate": 6.334156130140068e-06, | |
| "loss": 0.4533, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.4208309938236945, | |
| "grad_norm": 0.25130948424339294, | |
| "learning_rate": 6.308865035278539e-06, | |
| "loss": 0.4536, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.4253228523301515, | |
| "grad_norm": 0.2829897403717041, | |
| "learning_rate": 6.283537937310829e-06, | |
| "loss": 0.484, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.4298147108366086, | |
| "grad_norm": 0.22410716116428375, | |
| "learning_rate": 6.258175532912432e-06, | |
| "loss": 0.4594, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.4343065693430657, | |
| "grad_norm": 0.2592792510986328, | |
| "learning_rate": 6.232778519730024e-06, | |
| "loss": 0.4709, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.4387984278495227, | |
| "grad_norm": 0.20447753369808197, | |
| "learning_rate": 6.207347596362265e-06, | |
| "loss": 0.4622, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.4432902863559798, | |
| "grad_norm": 0.25977230072021484, | |
| "learning_rate": 6.181883462340589e-06, | |
| "loss": 0.4753, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.4477821448624368, | |
| "grad_norm": 0.20790189504623413, | |
| "learning_rate": 6.1563868181099596e-06, | |
| "loss": 0.4535, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.452274003368894, | |
| "grad_norm": 0.29562297463417053, | |
| "learning_rate": 6.130858365009601e-06, | |
| "loss": 0.4744, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.456765861875351, | |
| "grad_norm": 0.21025337278842926, | |
| "learning_rate": 6.105298805253709e-06, | |
| "loss": 0.4663, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.461257720381808, | |
| "grad_norm": 0.26838284730911255, | |
| "learning_rate": 6.079708841912133e-06, | |
| "loss": 0.4682, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.465749578888265, | |
| "grad_norm": 0.21380648016929626, | |
| "learning_rate": 6.054089178891039e-06, | |
| "loss": 0.4669, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.4702414373947221, | |
| "grad_norm": 0.2207087278366089, | |
| "learning_rate": 6.028440520913545e-06, | |
| "loss": 0.4414, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.4747332959011792, | |
| "grad_norm": 0.21855312585830688, | |
| "learning_rate": 6.002763573500332e-06, | |
| "loss": 0.4833, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.479225154407636, | |
| "grad_norm": 0.23827263712882996, | |
| "learning_rate": 5.977059042950252e-06, | |
| "loss": 0.4486, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.4837170129140933, | |
| "grad_norm": 0.22362874448299408, | |
| "learning_rate": 5.951327636320878e-06, | |
| "loss": 0.4761, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.4882088714205501, | |
| "grad_norm": 0.23705798387527466, | |
| "learning_rate": 5.925570061409077e-06, | |
| "loss": 0.4667, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.4927007299270074, | |
| "grad_norm": 0.20592792332172394, | |
| "learning_rate": 5.899787026731524e-06, | |
| "loss": 0.4528, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.4971925884334643, | |
| "grad_norm": 0.1896170973777771, | |
| "learning_rate": 5.873979241505219e-06, | |
| "loss": 0.4618, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.5016844469399215, | |
| "grad_norm": 0.1971491128206253, | |
| "learning_rate": 5.848147415627981e-06, | |
| "loss": 0.4684, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.5061763054463784, | |
| "grad_norm": 0.21199488639831543, | |
| "learning_rate": 5.822292259658914e-06, | |
| "loss": 0.4653, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.5106681639528357, | |
| "grad_norm": 0.19454307854175568, | |
| "learning_rate": 5.79641448479887e-06, | |
| "loss": 0.4739, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.5151600224592925, | |
| "grad_norm": 0.2176458239555359, | |
| "learning_rate": 5.770514802870879e-06, | |
| "loss": 0.4583, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.5196518809657495, | |
| "grad_norm": 0.21410858631134033, | |
| "learning_rate": 5.744593926300573e-06, | |
| "loss": 0.4494, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.5241437394722066, | |
| "grad_norm": 0.21628141403198242, | |
| "learning_rate": 5.718652568096585e-06, | |
| "loss": 0.4639, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.5286355979786637, | |
| "grad_norm": 0.2230045348405838, | |
| "learning_rate": 5.6926914418309405e-06, | |
| "loss": 0.4603, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.5331274564851207, | |
| "grad_norm": 0.24906450510025024, | |
| "learning_rate": 5.666711261619429e-06, | |
| "loss": 0.4662, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.5376193149915778, | |
| "grad_norm": 0.19834677875041962, | |
| "learning_rate": 5.640712742101954e-06, | |
| "loss": 0.4582, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.5421111734980348, | |
| "grad_norm": 0.2662658989429474, | |
| "learning_rate": 5.614696598422885e-06, | |
| "loss": 0.4748, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.546603032004492, | |
| "grad_norm": 0.22788119316101074, | |
| "learning_rate": 5.5886635462113805e-06, | |
| "loss": 0.4654, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.551094890510949, | |
| "grad_norm": 0.22741669416427612, | |
| "learning_rate": 5.562614301561704e-06, | |
| "loss": 0.4527, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.5555867490174058, | |
| "grad_norm": 0.24191468954086304, | |
| "learning_rate": 5.536549581013525e-06, | |
| "loss": 0.4691, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.560078607523863, | |
| "grad_norm": 0.22361521422863007, | |
| "learning_rate": 5.510470101532213e-06, | |
| "loss": 0.4571, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.56457046603032, | |
| "grad_norm": 0.21503981947898865, | |
| "learning_rate": 5.48437658048911e-06, | |
| "loss": 0.447, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.5690623245367772, | |
| "grad_norm": 0.22127224504947662, | |
| "learning_rate": 5.4582697356418036e-06, | |
| "loss": 0.4604, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.573554183043234, | |
| "grad_norm": 0.24467918276786804, | |
| "learning_rate": 5.4321502851143785e-06, | |
| "loss": 0.479, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.5780460415496913, | |
| "grad_norm": 0.1886945217847824, | |
| "learning_rate": 5.406018947377668e-06, | |
| "loss": 0.4641, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.5825379000561481, | |
| "grad_norm": 0.20987150073051453, | |
| "learning_rate": 5.379876441229486e-06, | |
| "loss": 0.4629, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.5870297585626054, | |
| "grad_norm": 0.19232037663459778, | |
| "learning_rate": 5.353723485774859e-06, | |
| "loss": 0.4367, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.5915216170690623, | |
| "grad_norm": 0.18378283083438873, | |
| "learning_rate": 5.327560800406241e-06, | |
| "loss": 0.48, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.5960134755755195, | |
| "grad_norm": 0.22845953702926636, | |
| "learning_rate": 5.301389104783731e-06, | |
| "loss": 0.4695, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.6005053340819764, | |
| "grad_norm": 0.1837320476770401, | |
| "learning_rate": 5.275209118815274e-06, | |
| "loss": 0.4402, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.6049971925884334, | |
| "grad_norm": 0.20027028024196625, | |
| "learning_rate": 5.249021562636857e-06, | |
| "loss": 0.4538, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.6094890510948905, | |
| "grad_norm": 0.2599356472492218, | |
| "learning_rate": 5.222827156592701e-06, | |
| "loss": 0.4719, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.6139809096013475, | |
| "grad_norm": 0.20169083774089813, | |
| "learning_rate": 5.196626621215449e-06, | |
| "loss": 0.4722, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.6184727681078046, | |
| "grad_norm": 0.2602737247943878, | |
| "learning_rate": 5.170420677206344e-06, | |
| "loss": 0.4567, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.6229646266142617, | |
| "grad_norm": 0.20139826834201813, | |
| "learning_rate": 5.144210045415402e-06, | |
| "loss": 0.4607, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.6274564851207187, | |
| "grad_norm": 0.19559621810913086, | |
| "learning_rate": 5.117995446821592e-06, | |
| "loss": 0.4652, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.6319483436271758, | |
| "grad_norm": 0.21619027853012085, | |
| "learning_rate": 5.091777602512993e-06, | |
| "loss": 0.4381, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.6364402021336328, | |
| "grad_norm": 0.20571349561214447, | |
| "learning_rate": 5.065557233666968e-06, | |
| "loss": 0.4665, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.6409320606400897, | |
| "grad_norm": 0.2172069400548935, | |
| "learning_rate": 5.039335061530319e-06, | |
| "loss": 0.4463, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.645423919146547, | |
| "grad_norm": 0.20271949470043182, | |
| "learning_rate": 5.013111807399455e-06, | |
| "loss": 0.458, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.6499157776530038, | |
| "grad_norm": 0.19836212694644928, | |
| "learning_rate": 4.986888192600546e-06, | |
| "loss": 0.461, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.654407636159461, | |
| "grad_norm": 0.1832582950592041, | |
| "learning_rate": 4.960664938469683e-06, | |
| "loss": 0.4393, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.658899494665918, | |
| "grad_norm": 0.23569650948047638, | |
| "learning_rate": 4.934442766333034e-06, | |
| "loss": 0.4717, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.6633913531723752, | |
| "grad_norm": 0.186725914478302, | |
| "learning_rate": 4.908222397487009e-06, | |
| "loss": 0.4522, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.667883211678832, | |
| "grad_norm": 0.21094153821468353, | |
| "learning_rate": 4.88200455317841e-06, | |
| "loss": 0.4543, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.6723750701852893, | |
| "grad_norm": 0.24533981084823608, | |
| "learning_rate": 4.8557899545846e-06, | |
| "loss": 0.4714, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.6768669286917461, | |
| "grad_norm": 0.19219504296779633, | |
| "learning_rate": 4.829579322793659e-06, | |
| "loss": 0.4756, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.6813587871982034, | |
| "grad_norm": 0.22109220921993256, | |
| "learning_rate": 4.8033733787845535e-06, | |
| "loss": 0.4423, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.6858506457046603, | |
| "grad_norm": 0.21374572813510895, | |
| "learning_rate": 4.7771728434073005e-06, | |
| "loss": 0.4526, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.6903425042111173, | |
| "grad_norm": 0.21615076065063477, | |
| "learning_rate": 4.7509784373631446e-06, | |
| "loss": 0.4657, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.6948343627175744, | |
| "grad_norm": 0.2250211238861084, | |
| "learning_rate": 4.724790881184727e-06, | |
| "loss": 0.4621, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.6993262212240314, | |
| "grad_norm": 0.21998076140880585, | |
| "learning_rate": 4.69861089521627e-06, | |
| "loss": 0.4613, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.7038180797304885, | |
| "grad_norm": 0.20503737032413483, | |
| "learning_rate": 4.672439199593761e-06, | |
| "loss": 0.4529, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.7083099382369455, | |
| "grad_norm": 0.2038574367761612, | |
| "learning_rate": 4.646276514225143e-06, | |
| "loss": 0.4556, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.7128017967434026, | |
| "grad_norm": 0.18167847394943237, | |
| "learning_rate": 4.6201235587705155e-06, | |
| "loss": 0.4734, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.7172936552498597, | |
| "grad_norm": 0.1952500194311142, | |
| "learning_rate": 4.593981052622334e-06, | |
| "loss": 0.4542, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.7217855137563167, | |
| "grad_norm": 0.2038043886423111, | |
| "learning_rate": 4.567849714885622e-06, | |
| "loss": 0.482, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.7262773722627736, | |
| "grad_norm": 0.1942799836397171, | |
| "learning_rate": 4.541730264358198e-06, | |
| "loss": 0.4502, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.7307692307692308, | |
| "grad_norm": 0.17968258261680603, | |
| "learning_rate": 4.515623419510891e-06, | |
| "loss": 0.4595, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.7352610892756877, | |
| "grad_norm": 0.21333391964435577, | |
| "learning_rate": 4.489529898467789e-06, | |
| "loss": 0.4742, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.739752947782145, | |
| "grad_norm": 0.19606302678585052, | |
| "learning_rate": 4.463450418986477e-06, | |
| "loss": 0.4581, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.7442448062886018, | |
| "grad_norm": 0.17203934490680695, | |
| "learning_rate": 4.4373856984382985e-06, | |
| "loss": 0.449, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.748736664795059, | |
| "grad_norm": 0.19502601027488708, | |
| "learning_rate": 4.411336453788622e-06, | |
| "loss": 0.472, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.753228523301516, | |
| "grad_norm": 0.19490373134613037, | |
| "learning_rate": 4.3853034015771176e-06, | |
| "loss": 0.4509, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.7577203818079732, | |
| "grad_norm": 0.18702197074890137, | |
| "learning_rate": 4.3592872578980495e-06, | |
| "loss": 0.4867, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.76221224031443, | |
| "grad_norm": 0.185566708445549, | |
| "learning_rate": 4.333288738380574e-06, | |
| "loss": 0.4488, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.7667040988208873, | |
| "grad_norm": 0.17845405638217926, | |
| "learning_rate": 4.30730855816906e-06, | |
| "loss": 0.4657, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.7711959573273441, | |
| "grad_norm": 0.17764534056186676, | |
| "learning_rate": 4.281347431903416e-06, | |
| "loss": 0.4499, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.7756878158338012, | |
| "grad_norm": 0.19315990805625916, | |
| "learning_rate": 4.255406073699428e-06, | |
| "loss": 0.4497, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.7801796743402583, | |
| "grad_norm": 0.1673203557729721, | |
| "learning_rate": 4.229485197129122e-06, | |
| "loss": 0.4462, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.7846715328467153, | |
| "grad_norm": 0.18644391000270844, | |
| "learning_rate": 4.203585515201131e-06, | |
| "loss": 0.4469, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.7891633913531724, | |
| "grad_norm": 0.17737650871276855, | |
| "learning_rate": 4.177707740341088e-06, | |
| "loss": 0.4605, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.7936552498596294, | |
| "grad_norm": 0.17971952259540558, | |
| "learning_rate": 4.151852584372021e-06, | |
| "loss": 0.4473, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.7981471083660865, | |
| "grad_norm": 0.1814514845609665, | |
| "learning_rate": 4.1260207584947825e-06, | |
| "loss": 0.466, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.8026389668725435, | |
| "grad_norm": 0.17787028849124908, | |
| "learning_rate": 4.100212973268478e-06, | |
| "loss": 0.4656, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.8071308253790006, | |
| "grad_norm": 0.2006688117980957, | |
| "learning_rate": 4.074429938590924e-06, | |
| "loss": 0.4484, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.8116226838854577, | |
| "grad_norm": 0.20157967507839203, | |
| "learning_rate": 4.048672363679124e-06, | |
| "loss": 0.4735, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.8161145423919147, | |
| "grad_norm": 0.21290093660354614, | |
| "learning_rate": 4.022940957049752e-06, | |
| "loss": 0.4705, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.8206064008983716, | |
| "grad_norm": 0.18965761363506317, | |
| "learning_rate": 3.99723642649967e-06, | |
| "loss": 0.4534, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.8250982594048288, | |
| "grad_norm": 0.19589713215827942, | |
| "learning_rate": 3.971559479086459e-06, | |
| "loss": 0.4587, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.8295901179112857, | |
| "grad_norm": 0.16529035568237305, | |
| "learning_rate": 3.945910821108963e-06, | |
| "loss": 0.4587, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.834081976417743, | |
| "grad_norm": 0.17363354563713074, | |
| "learning_rate": 3.9202911580878685e-06, | |
| "loss": 0.4409, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.8385738349241998, | |
| "grad_norm": 0.1951248049736023, | |
| "learning_rate": 3.894701194746291e-06, | |
| "loss": 0.4534, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.843065693430657, | |
| "grad_norm": 0.18388287723064423, | |
| "learning_rate": 3.869141634990399e-06, | |
| "loss": 0.474, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.847557551937114, | |
| "grad_norm": 0.17535457015037537, | |
| "learning_rate": 3.843613181890042e-06, | |
| "loss": 0.4469, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.8520494104435712, | |
| "grad_norm": 0.1836119443178177, | |
| "learning_rate": 3.818116537659412e-06, | |
| "loss": 0.4536, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.856541268950028, | |
| "grad_norm": 0.18821248412132263, | |
| "learning_rate": 3.7926524036377366e-06, | |
| "loss": 0.477, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.861033127456485, | |
| "grad_norm": 0.19100096821784973, | |
| "learning_rate": 3.767221480269978e-06, | |
| "loss": 0.4336, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.8655249859629421, | |
| "grad_norm": 0.1809721142053604, | |
| "learning_rate": 3.741824467087569e-06, | |
| "loss": 0.4539, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.8700168444693992, | |
| "grad_norm": 0.22617210447788239, | |
| "learning_rate": 3.7164620626891724e-06, | |
| "loss": 0.4883, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.8745087029758563, | |
| "grad_norm": 0.1782565861940384, | |
| "learning_rate": 3.6911349647214623e-06, | |
| "loss": 0.4498, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.8790005614823133, | |
| "grad_norm": 0.16668500006198883, | |
| "learning_rate": 3.665843869859934e-06, | |
| "loss": 0.4563, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.8834924199887704, | |
| "grad_norm": 0.2230617105960846, | |
| "learning_rate": 3.640589473789742e-06, | |
| "loss": 0.4695, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.8879842784952274, | |
| "grad_norm": 0.17857353389263153, | |
| "learning_rate": 3.6153724711865623e-06, | |
| "loss": 0.4445, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.8924761370016845, | |
| "grad_norm": 0.19203616678714752, | |
| "learning_rate": 3.5901935556974837e-06, | |
| "loss": 0.4559, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.8969679955081415, | |
| "grad_norm": 0.18788589537143707, | |
| "learning_rate": 3.56505341992193e-06, | |
| "loss": 0.4543, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.9014598540145986, | |
| "grad_norm": 0.17305254936218262, | |
| "learning_rate": 3.539952755392605e-06, | |
| "loss": 0.4567, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.9059517125210554, | |
| "grad_norm": 0.21953020989894867, | |
| "learning_rate": 3.514892252556474e-06, | |
| "loss": 0.4717, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.9104435710275127, | |
| "grad_norm": 0.15481020510196686, | |
| "learning_rate": 3.4898726007557655e-06, | |
| "loss": 0.4424, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.9149354295339696, | |
| "grad_norm": 0.18283706903457642, | |
| "learning_rate": 3.464894488209022e-06, | |
| "loss": 0.4467, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.9194272880404268, | |
| "grad_norm": 0.18225404620170593, | |
| "learning_rate": 3.439958601992153e-06, | |
| "loss": 0.4805, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.9239191465468837, | |
| "grad_norm": 0.15692797303199768, | |
| "learning_rate": 3.415065628019547e-06, | |
| "loss": 0.4536, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.928411005053341, | |
| "grad_norm": 0.17508453130722046, | |
| "learning_rate": 3.3902162510252e-06, | |
| "loss": 0.4567, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.9329028635597978, | |
| "grad_norm": 0.1812361180782318, | |
| "learning_rate": 3.365411154543878e-06, | |
| "loss": 0.4466, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.937394722066255, | |
| "grad_norm": 0.17098145186901093, | |
| "learning_rate": 3.3406510208923228e-06, | |
| "loss": 0.4563, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.941886580572712, | |
| "grad_norm": 0.19324849545955658, | |
| "learning_rate": 3.3159365311504732e-06, | |
| "loss": 0.4649, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.946378439079169, | |
| "grad_norm": 0.17696388065814972, | |
| "learning_rate": 3.291268365142738e-06, | |
| "loss": 0.4518, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.950870297585626, | |
| "grad_norm": 0.1926283836364746, | |
| "learning_rate": 3.2666472014192942e-06, | |
| "loss": 0.4457, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.955362156092083, | |
| "grad_norm": 0.18097148835659027, | |
| "learning_rate": 3.2420737172374184e-06, | |
| "loss": 0.4649, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.9598540145985401, | |
| "grad_norm": 0.2086239606142044, | |
| "learning_rate": 3.217548588542864e-06, | |
| "loss": 0.466, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.9643458731049972, | |
| "grad_norm": 0.17721500992774963, | |
| "learning_rate": 3.1930724899512633e-06, | |
| "loss": 0.4576, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.9688377316114543, | |
| "grad_norm": 0.18082091212272644, | |
| "learning_rate": 3.1686460947295695e-06, | |
| "loss": 0.4543, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.9733295901179113, | |
| "grad_norm": 0.19457915425300598, | |
| "learning_rate": 3.1442700747775413e-06, | |
| "loss": 0.4489, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.9778214486243684, | |
| "grad_norm": 0.18913620710372925, | |
| "learning_rate": 3.1199451006092586e-06, | |
| "loss": 0.4631, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.9823133071308254, | |
| "grad_norm": 0.18790724873542786, | |
| "learning_rate": 3.0956718413346785e-06, | |
| "loss": 0.4499, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.9868051656372825, | |
| "grad_norm": 0.16851408779621124, | |
| "learning_rate": 3.0714509646412295e-06, | |
| "loss": 0.4436, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.9912970241437393, | |
| "grad_norm": 0.19248563051223755, | |
| "learning_rate": 3.0472831367754496e-06, | |
| "loss": 0.4699, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.9957888826501966, | |
| "grad_norm": 0.17517177760601044, | |
| "learning_rate": 3.0231690225246537e-06, | |
| "loss": 0.454, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 2.0016844469399215, | |
| "grad_norm": 0.30906590819358826, | |
| "learning_rate": 2.999109285198649e-06, | |
| "loss": 0.7469, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 2.0061763054463784, | |
| "grad_norm": 0.18735533952713013, | |
| "learning_rate": 2.9751045866114926e-06, | |
| "loss": 0.45, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 2.0106681639528357, | |
| "grad_norm": 0.17937350273132324, | |
| "learning_rate": 2.9511555870632824e-06, | |
| "loss": 0.4468, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 2.0151600224592925, | |
| "grad_norm": 0.18137559294700623, | |
| "learning_rate": 2.927262945321998e-06, | |
| "loss": 0.4341, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 2.0196518809657498, | |
| "grad_norm": 0.18004953861236572, | |
| "learning_rate": 2.903427318605376e-06, | |
| "loss": 0.4426, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 2.0241437394722066, | |
| "grad_norm": 0.17482782900333405, | |
| "learning_rate": 2.8796493625628357e-06, | |
| "loss": 0.4584, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.028635597978664, | |
| "grad_norm": 0.20240633189678192, | |
| "learning_rate": 2.8559297312574417e-06, | |
| "loss": 0.4589, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 2.0331274564851207, | |
| "grad_norm": 0.15797480940818787, | |
| "learning_rate": 2.8322690771479135e-06, | |
| "loss": 0.4416, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 2.0376193149915776, | |
| "grad_norm": 0.16248145699501038, | |
| "learning_rate": 2.808668051070678e-06, | |
| "loss": 0.4356, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 2.042111173498035, | |
| "grad_norm": 0.17543736100196838, | |
| "learning_rate": 2.7851273022219645e-06, | |
| "loss": 0.4531, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 2.0466030320044917, | |
| "grad_norm": 0.18104536831378937, | |
| "learning_rate": 2.7616474781399527e-06, | |
| "loss": 0.4566, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 2.051094890510949, | |
| "grad_norm": 0.1631019413471222, | |
| "learning_rate": 2.7382292246869548e-06, | |
| "loss": 0.4345, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 2.055586749017406, | |
| "grad_norm": 0.16401013731956482, | |
| "learning_rate": 2.7148731860316544e-06, | |
| "loss": 0.4466, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 2.060078607523863, | |
| "grad_norm": 0.18238060176372528, | |
| "learning_rate": 2.6915800046313852e-06, | |
| "loss": 0.4587, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 2.06457046603032, | |
| "grad_norm": 0.1778591126203537, | |
| "learning_rate": 2.6683503212144563e-06, | |
| "loss": 0.4652, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 2.069062324536777, | |
| "grad_norm": 0.16754575073719025, | |
| "learning_rate": 2.645184774762533e-06, | |
| "loss": 0.4265, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.073554183043234, | |
| "grad_norm": 0.16498114168643951, | |
| "learning_rate": 2.6220840024930562e-06, | |
| "loss": 0.4335, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 2.0780460415496913, | |
| "grad_norm": 0.18743063509464264, | |
| "learning_rate": 2.599048639841717e-06, | |
| "loss": 0.4472, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 2.082537900056148, | |
| "grad_norm": 0.1664198338985443, | |
| "learning_rate": 2.5760793204449737e-06, | |
| "loss": 0.4468, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 2.0870297585626054, | |
| "grad_norm": 0.16629664599895477, | |
| "learning_rate": 2.553176676122627e-06, | |
| "loss": 0.4449, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 2.0915216170690623, | |
| "grad_norm": 0.1749490648508072, | |
| "learning_rate": 2.530341336860439e-06, | |
| "loss": 0.4378, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 2.0960134755755195, | |
| "grad_norm": 0.15219521522521973, | |
| "learning_rate": 2.5075739307928017e-06, | |
| "loss": 0.4317, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 2.1005053340819764, | |
| "grad_norm": 0.16345295310020447, | |
| "learning_rate": 2.484875084185462e-06, | |
| "loss": 0.4491, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 2.1049971925884337, | |
| "grad_norm": 0.162262961268425, | |
| "learning_rate": 2.462245421418292e-06, | |
| "loss": 0.4477, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 2.1094890510948905, | |
| "grad_norm": 0.16510337591171265, | |
| "learning_rate": 2.4396855649681166e-06, | |
| "loss": 0.4344, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 2.1139809096013478, | |
| "grad_norm": 0.17441292107105255, | |
| "learning_rate": 2.4171961353915914e-06, | |
| "loss": 0.4657, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.1184727681078046, | |
| "grad_norm": 0.15476343035697937, | |
| "learning_rate": 2.394777751308129e-06, | |
| "loss": 0.4377, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 2.1229646266142614, | |
| "grad_norm": 0.16448856890201569, | |
| "learning_rate": 2.372431029382888e-06, | |
| "loss": 0.4497, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 2.1274564851207187, | |
| "grad_norm": 0.18509458005428314, | |
| "learning_rate": 2.350156584309804e-06, | |
| "loss": 0.4511, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 2.1319483436271756, | |
| "grad_norm": 0.1604773998260498, | |
| "learning_rate": 2.3279550287946883e-06, | |
| "loss": 0.4582, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 2.136440202133633, | |
| "grad_norm": 0.17485181987285614, | |
| "learning_rate": 2.305826973538366e-06, | |
| "loss": 0.4403, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 2.1409320606400897, | |
| "grad_norm": 0.1811332404613495, | |
| "learning_rate": 2.2837730272198886e-06, | |
| "loss": 0.4436, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 2.145423919146547, | |
| "grad_norm": 0.15909186005592346, | |
| "learning_rate": 2.2617937964797786e-06, | |
| "loss": 0.4432, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 2.149915777653004, | |
| "grad_norm": 0.1537752002477646, | |
| "learning_rate": 2.2398898859033496e-06, | |
| "loss": 0.4398, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 2.154407636159461, | |
| "grad_norm": 0.14990779757499695, | |
| "learning_rate": 2.2180618980040747e-06, | |
| "loss": 0.4434, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 2.158899494665918, | |
| "grad_norm": 0.15770702064037323, | |
| "learning_rate": 2.196310433207013e-06, | |
| "loss": 0.445, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.163391353172375, | |
| "grad_norm": 0.1663922518491745, | |
| "learning_rate": 2.174636089832293e-06, | |
| "loss": 0.4624, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 2.167883211678832, | |
| "grad_norm": 0.1494603157043457, | |
| "learning_rate": 2.1530394640786567e-06, | |
| "loss": 0.4095, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 2.1723750701852893, | |
| "grad_norm": 0.16668330132961273, | |
| "learning_rate": 2.131521150007056e-06, | |
| "loss": 0.4749, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 2.176866928691746, | |
| "grad_norm": 0.156080424785614, | |
| "learning_rate": 2.110081739524316e-06, | |
| "loss": 0.4377, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 2.1813587871982034, | |
| "grad_norm": 0.1667332947254181, | |
| "learning_rate": 2.0887218223668493e-06, | |
| "loss": 0.4634, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 2.1858506457046603, | |
| "grad_norm": 0.16652284562587738, | |
| "learning_rate": 2.0674419860844385e-06, | |
| "loss": 0.4635, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 2.1903425042111175, | |
| "grad_norm": 0.17611587047576904, | |
| "learning_rate": 2.046242816024071e-06, | |
| "loss": 0.4333, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 2.1948343627175744, | |
| "grad_norm": 0.170632004737854, | |
| "learning_rate": 2.0251248953138377e-06, | |
| "loss": 0.4577, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 2.199326221224031, | |
| "grad_norm": 0.1769552379846573, | |
| "learning_rate": 2.0040888048468954e-06, | |
| "loss": 0.4404, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 2.2038180797304885, | |
| "grad_norm": 0.16041500866413116, | |
| "learning_rate": 1.9831351232654874e-06, | |
| "loss": 0.4354, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.2083099382369458, | |
| "grad_norm": 0.15111730992794037, | |
| "learning_rate": 1.962264426945023e-06, | |
| "loss": 0.4412, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 2.2128017967434026, | |
| "grad_norm": 0.15624019503593445, | |
| "learning_rate": 1.9414772899782274e-06, | |
| "loss": 0.438, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 2.2172936552498594, | |
| "grad_norm": 0.1754332333803177, | |
| "learning_rate": 1.920774284159353e-06, | |
| "loss": 0.4604, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 2.2217855137563167, | |
| "grad_norm": 0.16165252029895782, | |
| "learning_rate": 1.9001559789684403e-06, | |
| "loss": 0.4283, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 2.2262773722627736, | |
| "grad_norm": 0.15245720744132996, | |
| "learning_rate": 1.8796229415556628e-06, | |
| "loss": 0.4331, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 2.230769230769231, | |
| "grad_norm": 0.1491118222475052, | |
| "learning_rate": 1.859175736725724e-06, | |
| "loss": 0.4371, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 2.2352610892756877, | |
| "grad_norm": 0.1607341170310974, | |
| "learning_rate": 1.8388149269223153e-06, | |
| "loss": 0.4443, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 2.239752947782145, | |
| "grad_norm": 0.16271331906318665, | |
| "learning_rate": 1.8185410722126556e-06, | |
| "loss": 0.4367, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 2.244244806288602, | |
| "grad_norm": 0.15422794222831726, | |
| "learning_rate": 1.7983547302720773e-06, | |
| "loss": 0.4297, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 2.248736664795059, | |
| "grad_norm": 0.16086190938949585, | |
| "learning_rate": 1.7782564563686882e-06, | |
| "loss": 0.4595, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.253228523301516, | |
| "grad_norm": 0.1475391536951065, | |
| "learning_rate": 1.7582468033480992e-06, | |
| "loss": 0.4418, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 2.257720381807973, | |
| "grad_norm": 0.16517473757266998, | |
| "learning_rate": 1.7383263216182155e-06, | |
| "loss": 0.4611, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 2.26221224031443, | |
| "grad_norm": 0.14682336151599884, | |
| "learning_rate": 1.7184955591340974e-06, | |
| "loss": 0.4302, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 2.2667040988208873, | |
| "grad_norm": 0.16191935539245605, | |
| "learning_rate": 1.6987550613828863e-06, | |
| "loss": 0.4728, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 2.271195957327344, | |
| "grad_norm": 0.1535215973854065, | |
| "learning_rate": 1.6791053713688022e-06, | |
| "loss": 0.4396, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 2.2756878158338014, | |
| "grad_norm": 0.1514642834663391, | |
| "learning_rate": 1.6595470295982047e-06, | |
| "loss": 0.4341, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 2.2801796743402583, | |
| "grad_norm": 0.1438160538673401, | |
| "learning_rate": 1.6400805740647269e-06, | |
| "loss": 0.4426, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 2.2846715328467155, | |
| "grad_norm": 0.1464770883321762, | |
| "learning_rate": 1.6207065402344746e-06, | |
| "loss": 0.4757, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 2.2891633913531724, | |
| "grad_norm": 0.14882051944732666, | |
| "learning_rate": 1.6014254610313035e-06, | |
| "loss": 0.4402, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 2.293655249859629, | |
| "grad_norm": 0.14686354994773865, | |
| "learning_rate": 1.5822378668221511e-06, | |
| "loss": 0.4506, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.2981471083660865, | |
| "grad_norm": 0.16808481514453888, | |
| "learning_rate": 1.563144285402453e-06, | |
| "loss": 0.4535, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 2.3026389668725433, | |
| "grad_norm": 0.1671183556318283, | |
| "learning_rate": 1.5441452419816238e-06, | |
| "loss": 0.4576, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 2.3071308253790006, | |
| "grad_norm": 0.158988356590271, | |
| "learning_rate": 1.5252412591686105e-06, | |
| "loss": 0.4606, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 2.3116226838854574, | |
| "grad_norm": 0.1585473269224167, | |
| "learning_rate": 1.5064328569575166e-06, | |
| "loss": 0.4364, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 2.3161145423919147, | |
| "grad_norm": 0.1624506711959839, | |
| "learning_rate": 1.4877205527132983e-06, | |
| "loss": 0.4452, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 2.3206064008983716, | |
| "grad_norm": 0.14651590585708618, | |
| "learning_rate": 1.4691048611575337e-06, | |
| "loss": 0.432, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 2.325098259404829, | |
| "grad_norm": 0.14389865100383759, | |
| "learning_rate": 1.4505862943542643e-06, | |
| "loss": 0.4517, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 2.3295901179112857, | |
| "grad_norm": 0.15029338002204895, | |
| "learning_rate": 1.4321653616959096e-06, | |
| "loss": 0.4548, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 2.334081976417743, | |
| "grad_norm": 0.14938291907310486, | |
| "learning_rate": 1.4138425698892555e-06, | |
| "loss": 0.444, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 2.3385738349242, | |
| "grad_norm": 0.15582537651062012, | |
| "learning_rate": 1.3956184229415148e-06, | |
| "loss": 0.4343, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.343065693430657, | |
| "grad_norm": 0.16049213707447052, | |
| "learning_rate": 1.3774934221464643e-06, | |
| "loss": 0.4866, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 2.347557551937114, | |
| "grad_norm": 0.1587788462638855, | |
| "learning_rate": 1.3594680660706572e-06, | |
| "loss": 0.4259, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 2.352049410443571, | |
| "grad_norm": 0.15312546491622925, | |
| "learning_rate": 1.341542850539706e-06, | |
| "loss": 0.4567, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 2.356541268950028, | |
| "grad_norm": 0.16400395333766937, | |
| "learning_rate": 1.323718268624647e-06, | |
| "loss": 0.437, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 2.3610331274564853, | |
| "grad_norm": 0.15856412053108215, | |
| "learning_rate": 1.3059948106283725e-06, | |
| "loss": 0.4568, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 2.365524985962942, | |
| "grad_norm": 0.16507361829280853, | |
| "learning_rate": 1.2883729640721532e-06, | |
| "loss": 0.4369, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 2.370016844469399, | |
| "grad_norm": 0.1538458913564682, | |
| "learning_rate": 1.2708532136822156e-06, | |
| "loss": 0.4352, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 2.3745087029758563, | |
| "grad_norm": 0.15951845049858093, | |
| "learning_rate": 1.253436041376417e-06, | |
| "loss": 0.4631, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 2.3790005614823135, | |
| "grad_norm": 0.1614406406879425, | |
| "learning_rate": 1.2361219262509882e-06, | |
| "loss": 0.4507, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 2.3834924199887704, | |
| "grad_norm": 0.162693589925766, | |
| "learning_rate": 1.2189113445673528e-06, | |
| "loss": 0.4686, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.387984278495227, | |
| "grad_norm": 0.1591520756483078, | |
| "learning_rate": 1.201804769739028e-06, | |
| "loss": 0.4288, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 2.3924761370016845, | |
| "grad_norm": 0.1632116734981537, | |
| "learning_rate": 1.1848026723186013e-06, | |
| "loss": 0.4627, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 2.3969679955081413, | |
| "grad_norm": 0.15154661238193512, | |
| "learning_rate": 1.1679055199847894e-06, | |
| "loss": 0.4392, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 2.4014598540145986, | |
| "grad_norm": 0.15290887653827667, | |
| "learning_rate": 1.1511137775295705e-06, | |
| "loss": 0.4468, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 2.4059517125210554, | |
| "grad_norm": 0.15557996928691864, | |
| "learning_rate": 1.1344279068454012e-06, | |
| "loss": 0.4629, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 2.4104435710275127, | |
| "grad_norm": 0.15348927676677704, | |
| "learning_rate": 1.1178483669125112e-06, | |
| "loss": 0.4421, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 2.4149354295339696, | |
| "grad_norm": 0.14844442903995514, | |
| "learning_rate": 1.101375613786278e-06, | |
| "loss": 0.4501, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 2.419427288040427, | |
| "grad_norm": 0.15993905067443848, | |
| "learning_rate": 1.0850101005846787e-06, | |
| "loss": 0.4595, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 2.4239191465468837, | |
| "grad_norm": 0.14879821240901947, | |
| "learning_rate": 1.0687522774758318e-06, | |
| "loss": 0.4146, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 2.428411005053341, | |
| "grad_norm": 0.14690467715263367, | |
| "learning_rate": 1.052602591665612e-06, | |
| "loss": 0.4579, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.432902863559798, | |
| "grad_norm": 0.1457294076681137, | |
| "learning_rate": 1.0365614873853463e-06, | |
| "loss": 0.4484, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 2.437394722066255, | |
| "grad_norm": 0.15229414403438568, | |
| "learning_rate": 1.0206294058795974e-06, | |
| "loss": 0.4357, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 2.441886580572712, | |
| "grad_norm": 0.15288710594177246, | |
| "learning_rate": 1.0048067853940286e-06, | |
| "loss": 0.4415, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 2.446378439079169, | |
| "grad_norm": 0.15801645815372467, | |
| "learning_rate": 9.890940611633416e-07, | |
| "loss": 0.4452, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 2.450870297585626, | |
| "grad_norm": 0.14208218455314636, | |
| "learning_rate": 9.734916653993104e-07, | |
| "loss": 0.4478, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 2.4553621560920833, | |
| "grad_norm": 0.14192849397659302, | |
| "learning_rate": 9.580000272788915e-07, | |
| "loss": 0.4202, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 2.45985401459854, | |
| "grad_norm": 0.14638514816761017, | |
| "learning_rate": 9.426195729324161e-07, | |
| "loss": 0.4462, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 2.464345873104997, | |
| "grad_norm": 0.1476951539516449, | |
| "learning_rate": 9.27350725431872e-07, | |
| "loss": 0.4697, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 2.4688377316114543, | |
| "grad_norm": 0.13385483622550964, | |
| "learning_rate": 9.121939047792622e-07, | |
| "loss": 0.4335, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 2.473329590117911, | |
| "grad_norm": 0.15053848922252655, | |
| "learning_rate": 8.971495278950559e-07, | |
| "loss": 0.4633, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.4778214486243684, | |
| "grad_norm": 0.14375874400138855, | |
| "learning_rate": 8.822180086067161e-07, | |
| "loss": 0.4618, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 2.482313307130825, | |
| "grad_norm": 0.14123788475990295, | |
| "learning_rate": 8.673997576373205e-07, | |
| "loss": 0.441, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 2.4868051656372825, | |
| "grad_norm": 0.1446654200553894, | |
| "learning_rate": 8.526951825942608e-07, | |
| "loss": 0.4495, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 2.4912970241437393, | |
| "grad_norm": 0.14551837742328644, | |
| "learning_rate": 8.381046879580307e-07, | |
| "loss": 0.437, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 2.4957888826501966, | |
| "grad_norm": 0.13555008172988892, | |
| "learning_rate": 8.23628675071102e-07, | |
| "loss": 0.4535, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 2.5002807411566534, | |
| "grad_norm": 0.1343921571969986, | |
| "learning_rate": 8.092675421268825e-07, | |
| "loss": 0.4214, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 2.5047725996631107, | |
| "grad_norm": 0.14767403900623322, | |
| "learning_rate": 7.950216841587638e-07, | |
| "loss": 0.4737, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 2.5092644581695676, | |
| "grad_norm": 0.13317753374576569, | |
| "learning_rate": 7.808914930292544e-07, | |
| "loss": 0.4147, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 2.513756316676025, | |
| "grad_norm": 0.14309468865394592, | |
| "learning_rate": 7.66877357419204e-07, | |
| "loss": 0.4357, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 2.5182481751824817, | |
| "grad_norm": 0.1514693796634674, | |
| "learning_rate": 7.529796628171071e-07, | |
| "loss": 0.4626, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.522740033688939, | |
| "grad_norm": 0.14453499019145966, | |
| "learning_rate": 7.391987915085014e-07, | |
| "loss": 0.4349, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 2.527231892195396, | |
| "grad_norm": 0.14623714983463287, | |
| "learning_rate": 7.255351225654527e-07, | |
| "loss": 0.4436, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 2.531723750701853, | |
| "grad_norm": 0.14449186623096466, | |
| "learning_rate": 7.119890318361278e-07, | |
| "loss": 0.4318, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 2.53621560920831, | |
| "grad_norm": 0.15518885850906372, | |
| "learning_rate": 6.98560891934455e-07, | |
| "loss": 0.4568, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 2.5407074677147667, | |
| "grad_norm": 0.15409418940544128, | |
| "learning_rate": 6.852510722298761e-07, | |
| "loss": 0.4654, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 2.545199326221224, | |
| "grad_norm": 0.1519947648048401, | |
| "learning_rate": 6.72059938837184e-07, | |
| "loss": 0.4567, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 2.5496911847276813, | |
| "grad_norm": 0.13835862278938293, | |
| "learning_rate": 6.589878546064544e-07, | |
| "loss": 0.4312, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 2.554183043234138, | |
| "grad_norm": 0.1384487897157669, | |
| "learning_rate": 6.46035179113062e-07, | |
| "loss": 0.4264, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 2.558674901740595, | |
| "grad_norm": 0.16356123983860016, | |
| "learning_rate": 6.332022686477929e-07, | |
| "loss": 0.4936, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 2.5631667602470523, | |
| "grad_norm": 0.14139850437641144, | |
| "learning_rate": 6.204894762070407e-07, | |
| "loss": 0.4379, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.5676586187535095, | |
| "grad_norm": 0.14354942739009857, | |
| "learning_rate": 6.078971514830989e-07, | |
| "loss": 0.4396, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 2.5721504772599664, | |
| "grad_norm": 0.17011038959026337, | |
| "learning_rate": 5.954256408545417e-07, | |
| "loss": 0.4474, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 2.576642335766423, | |
| "grad_norm": 0.13311436772346497, | |
| "learning_rate": 5.830752873766948e-07, | |
| "loss": 0.4325, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 2.5811341942728805, | |
| "grad_norm": 0.14901982247829437, | |
| "learning_rate": 5.708464307722006e-07, | |
| "loss": 0.481, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 2.5856260527793373, | |
| "grad_norm": 0.14402733743190765, | |
| "learning_rate": 5.587394074216712e-07, | |
| "loss": 0.4279, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 2.5901179112857946, | |
| "grad_norm": 0.14105503261089325, | |
| "learning_rate": 5.467545503544381e-07, | |
| "loss": 0.4456, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 2.5946097697922514, | |
| "grad_norm": 0.15389291942119598, | |
| "learning_rate": 5.348921892393905e-07, | |
| "loss": 0.4538, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 2.5991016282987087, | |
| "grad_norm": 0.13729771971702576, | |
| "learning_rate": 5.231526503759055e-07, | |
| "loss": 0.4328, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 2.6035934868051656, | |
| "grad_norm": 0.1453271359205246, | |
| "learning_rate": 5.115362566848747e-07, | |
| "loss": 0.4522, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 2.608085345311623, | |
| "grad_norm": 0.1378541737794876, | |
| "learning_rate": 5.000433276998218e-07, | |
| "loss": 0.4461, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.6125772038180797, | |
| "grad_norm": 0.14585429430007935, | |
| "learning_rate": 4.886741795581101e-07, | |
| "loss": 0.4487, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 2.6170690623245365, | |
| "grad_norm": 0.13975073397159576, | |
| "learning_rate": 4.774291249922508e-07, | |
| "loss": 0.4603, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 2.621560920830994, | |
| "grad_norm": 0.130731463432312, | |
| "learning_rate": 4.663084733212958e-07, | |
| "loss": 0.4121, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 2.626052779337451, | |
| "grad_norm": 0.13513977825641632, | |
| "learning_rate": 4.5531253044233393e-07, | |
| "loss": 0.4589, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 2.630544637843908, | |
| "grad_norm": 0.13990791141986847, | |
| "learning_rate": 4.4444159882207406e-07, | |
| "loss": 0.4352, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 2.6350364963503647, | |
| "grad_norm": 0.1394883394241333, | |
| "learning_rate": 4.336959774885241e-07, | |
| "loss": 0.4662, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 2.639528354856822, | |
| "grad_norm": 0.14077690243721008, | |
| "learning_rate": 4.230759620227681e-07, | |
| "loss": 0.4468, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 2.6440202133632793, | |
| "grad_norm": 0.14740774035453796, | |
| "learning_rate": 4.125818445508351e-07, | |
| "loss": 0.4505, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 2.648512071869736, | |
| "grad_norm": 0.13872882723808289, | |
| "learning_rate": 4.022139137356623e-07, | |
| "loss": 0.432, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 2.653003930376193, | |
| "grad_norm": 0.1328064501285553, | |
| "learning_rate": 3.9197245476915556e-07, | |
| "loss": 0.4578, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.6574957888826503, | |
| "grad_norm": 0.1333392858505249, | |
| "learning_rate": 3.818577493643444e-07, | |
| "loss": 0.4516, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 2.661987647389107, | |
| "grad_norm": 0.13673578202724457, | |
| "learning_rate": 3.718700757476323e-07, | |
| "loss": 0.4443, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 2.6664795058955644, | |
| "grad_norm": 0.14025312662124634, | |
| "learning_rate": 3.6200970865114705e-07, | |
| "loss": 0.4367, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 2.670971364402021, | |
| "grad_norm": 0.13922296464443207, | |
| "learning_rate": 3.5227691930517895e-07, | |
| "loss": 0.4489, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 2.6754632229084785, | |
| "grad_norm": 0.1483432501554489, | |
| "learning_rate": 3.426719754307206e-07, | |
| "loss": 0.4535, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 2.6799550814149353, | |
| "grad_norm": 0.14136959612369537, | |
| "learning_rate": 3.331951412321066e-07, | |
| "loss": 0.4441, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 2.6844469399213926, | |
| "grad_norm": 0.14010727405548096, | |
| "learning_rate": 3.23846677389742e-07, | |
| "loss": 0.4508, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 2.6889387984278494, | |
| "grad_norm": 0.13651929795742035, | |
| "learning_rate": 3.14626841052933e-07, | |
| "loss": 0.4367, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 2.6934306569343067, | |
| "grad_norm": 0.1339380443096161, | |
| "learning_rate": 3.0553588583281446e-07, | |
| "loss": 0.4498, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 2.6979225154407636, | |
| "grad_norm": 0.14083103835582733, | |
| "learning_rate": 2.965740617953733e-07, | |
| "loss": 0.4276, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.702414373947221, | |
| "grad_norm": 0.13285264372825623, | |
| "learning_rate": 2.877416154545681e-07, | |
| "loss": 0.4547, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 2.7069062324536777, | |
| "grad_norm": 0.13597218692302704, | |
| "learning_rate": 2.7903878976555165e-07, | |
| "loss": 0.4384, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 2.7113980909601345, | |
| "grad_norm": 0.14728978276252747, | |
| "learning_rate": 2.704658241179847e-07, | |
| "loss": 0.4478, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 2.715889949466592, | |
| "grad_norm": 0.1403026282787323, | |
| "learning_rate": 2.620229543294528e-07, | |
| "loss": 0.4463, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 2.720381807973049, | |
| "grad_norm": 0.14910593628883362, | |
| "learning_rate": 2.5371041263897945e-07, | |
| "loss": 0.4636, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 2.724873666479506, | |
| "grad_norm": 0.13558971881866455, | |
| "learning_rate": 2.4552842770063756e-07, | |
| "loss": 0.4223, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 2.7293655249859627, | |
| "grad_norm": 0.13463318347930908, | |
| "learning_rate": 2.3747722457725996e-07, | |
| "loss": 0.4413, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 2.73385738349242, | |
| "grad_norm": 0.12877213954925537, | |
| "learning_rate": 2.2955702473424824e-07, | |
| "loss": 0.4399, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 2.7383492419988773, | |
| "grad_norm": 0.13977070152759552, | |
| "learning_rate": 2.217680460334809e-07, | |
| "loss": 0.4518, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 2.742841100505334, | |
| "grad_norm": 0.13352471590042114, | |
| "learning_rate": 2.141105027273227e-07, | |
| "loss": 0.4456, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.747332959011791, | |
| "grad_norm": 0.14282134175300598, | |
| "learning_rate": 2.0658460545272653e-07, | |
| "loss": 0.4272, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 2.7518248175182483, | |
| "grad_norm": 0.13226774334907532, | |
| "learning_rate": 1.9919056122544467e-07, | |
| "loss": 0.446, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 2.756316676024705, | |
| "grad_norm": 0.13558903336524963, | |
| "learning_rate": 1.9192857343433069e-07, | |
| "loss": 0.441, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 2.7608085345311624, | |
| "grad_norm": 0.13000066578388214, | |
| "learning_rate": 1.847988418357466e-07, | |
| "loss": 0.4406, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 2.765300393037619, | |
| "grad_norm": 0.13514713943004608, | |
| "learning_rate": 1.778015625480678e-07, | |
| "loss": 0.4497, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 2.7697922515440765, | |
| "grad_norm": 0.1365610659122467, | |
| "learning_rate": 1.7093692804628637e-07, | |
| "loss": 0.4279, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 2.7742841100505333, | |
| "grad_norm": 0.1374463140964508, | |
| "learning_rate": 1.642051271567213e-07, | |
| "loss": 0.4638, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 2.7787759685569906, | |
| "grad_norm": 0.12901850044727325, | |
| "learning_rate": 1.5760634505182004e-07, | |
| "loss": 0.4493, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 2.7832678270634474, | |
| "grad_norm": 0.13684892654418945, | |
| "learning_rate": 1.5114076324506567e-07, | |
| "loss": 0.4433, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 2.7877596855699043, | |
| "grad_norm": 0.13286025822162628, | |
| "learning_rate": 1.4480855958598716e-07, | |
| "loss": 0.4239, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.7922515440763616, | |
| "grad_norm": 0.1406407654285431, | |
| "learning_rate": 1.3860990825526334e-07, | |
| "loss": 0.4582, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 2.796743402582819, | |
| "grad_norm": 0.12750087678432465, | |
| "learning_rate": 1.3254497975993263e-07, | |
| "loss": 0.4476, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 2.8012352610892757, | |
| "grad_norm": 0.14279097318649292, | |
| "learning_rate": 1.266139409287054e-07, | |
| "loss": 0.4591, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 2.8057271195957325, | |
| "grad_norm": 0.13092957437038422, | |
| "learning_rate": 1.2081695490737177e-07, | |
| "loss": 0.4373, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 2.81021897810219, | |
| "grad_norm": 0.14031663537025452, | |
| "learning_rate": 1.1515418115431554e-07, | |
| "loss": 0.4418, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 2.814710836608647, | |
| "grad_norm": 0.1413896381855011, | |
| "learning_rate": 1.0962577543612796e-07, | |
| "loss": 0.447, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 2.819202695115104, | |
| "grad_norm": 0.13287171721458435, | |
| "learning_rate": 1.04231889823323e-07, | |
| "loss": 0.4297, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 2.8236945536215607, | |
| "grad_norm": 0.13622640073299408, | |
| "learning_rate": 9.897267268615285e-08, | |
| "loss": 0.46, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 2.828186412128018, | |
| "grad_norm": 0.13213184475898743, | |
| "learning_rate": 9.384826869052899e-08, | |
| "loss": 0.4498, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 2.832678270634475, | |
| "grad_norm": 0.13575822114944458, | |
| "learning_rate": 8.885881879404201e-08, | |
| "loss": 0.4398, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.837170129140932, | |
| "grad_norm": 0.13295438885688782, | |
| "learning_rate": 8.400446024208309e-08, | |
| "loss": 0.4562, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 2.841661987647389, | |
| "grad_norm": 0.13092169165611267, | |
| "learning_rate": 7.928532656407029e-08, | |
| "loss": 0.4405, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 2.8461538461538463, | |
| "grad_norm": 0.1367148458957672, | |
| "learning_rate": 7.470154756977544e-08, | |
| "loss": 0.4483, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 2.850645704660303, | |
| "grad_norm": 0.13294926285743713, | |
| "learning_rate": 7.02532493457514e-08, | |
| "loss": 0.4552, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 2.8551375631667604, | |
| "grad_norm": 0.13394087553024292, | |
| "learning_rate": 6.594055425186763e-08, | |
| "loss": 0.4607, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 2.859629421673217, | |
| "grad_norm": 0.12872961163520813, | |
| "learning_rate": 6.176358091794011e-08, | |
| "loss": 0.4353, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 2.8641212801796745, | |
| "grad_norm": 0.13416117429733276, | |
| "learning_rate": 5.772244424047169e-08, | |
| "loss": 0.437, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 2.8686131386861313, | |
| "grad_norm": 0.13002049922943115, | |
| "learning_rate": 5.3817255379488565e-08, | |
| "loss": 0.4512, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 2.8731049971925886, | |
| "grad_norm": 0.14305950701236725, | |
| "learning_rate": 5.004812175548657e-08, | |
| "loss": 0.4604, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 2.8775968556990454, | |
| "grad_norm": 0.1293281465768814, | |
| "learning_rate": 4.641514704647132e-08, | |
| "loss": 0.4368, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.8820887142055023, | |
| "grad_norm": 0.13169856369495392, | |
| "learning_rate": 4.2918431185110523e-08, | |
| "loss": 0.4241, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 2.8865805727119596, | |
| "grad_norm": 0.13361629843711853, | |
| "learning_rate": 3.9558070355983356e-08, | |
| "loss": 0.4689, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 2.891072431218417, | |
| "grad_norm": 0.13451144099235535, | |
| "learning_rate": 3.633415699293541e-08, | |
| "loss": 0.4431, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 2.8955642897248737, | |
| "grad_norm": 0.1316562294960022, | |
| "learning_rate": 3.324677977653401e-08, | |
| "loss": 0.4403, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 2.9000561482313305, | |
| "grad_norm": 0.12996108829975128, | |
| "learning_rate": 3.0296023631631866e-08, | |
| "loss": 0.4472, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 2.904548006737788, | |
| "grad_norm": 0.13493524491786957, | |
| "learning_rate": 2.7481969725028923e-08, | |
| "loss": 0.4486, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 2.909039865244245, | |
| "grad_norm": 0.12959401309490204, | |
| "learning_rate": 2.4804695463240825e-08, | |
| "loss": 0.435, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 2.913531723750702, | |
| "grad_norm": 0.13398133218288422, | |
| "learning_rate": 2.226427449036894e-08, | |
| "loss": 0.4465, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 2.9180235822571587, | |
| "grad_norm": 0.12995387613773346, | |
| "learning_rate": 1.9860776686075332e-08, | |
| "loss": 0.4491, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 2.922515440763616, | |
| "grad_norm": 0.1356271654367447, | |
| "learning_rate": 1.7594268163659277e-08, | |
| "loss": 0.4449, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.927007299270073, | |
| "grad_norm": 0.12999212741851807, | |
| "learning_rate": 1.546481126824151e-08, | |
| "loss": 0.4446, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 2.93149915777653, | |
| "grad_norm": 0.1264127790927887, | |
| "learning_rate": 1.347246457504503e-08, | |
| "loss": 0.4397, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 2.935991016282987, | |
| "grad_norm": 0.13195131719112396, | |
| "learning_rate": 1.1617282887787517e-08, | |
| "loss": 0.4434, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 2.9404828747894443, | |
| "grad_norm": 0.13151754438877106, | |
| "learning_rate": 9.899317237172524e-09, | |
| "loss": 0.432, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 2.944974733295901, | |
| "grad_norm": 0.1297924965620041, | |
| "learning_rate": 8.318614879485044e-09, | |
| "loss": 0.4271, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 2.9494665918023584, | |
| "grad_norm": 0.1355123370885849, | |
| "learning_rate": 6.8752192952931115e-09, | |
| "loss": 0.4666, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 2.953958450308815, | |
| "grad_norm": 0.13056360185146332, | |
| "learning_rate": 5.569170188250983e-09, | |
| "loss": 0.4374, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 2.958450308815272, | |
| "grad_norm": 0.15338468551635742, | |
| "learning_rate": 4.4005034840061135e-09, | |
| "loss": 0.4453, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 2.9629421673217293, | |
| "grad_norm": 0.13862362504005432, | |
| "learning_rate": 3.3692513292132855e-09, | |
| "loss": 0.4584, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 2.9674340258281866, | |
| "grad_norm": 0.13754971325397491, | |
| "learning_rate": 2.4754420906475396e-09, | |
| "loss": 0.4386, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.9719258843346434, | |
| "grad_norm": 0.13375526666641235, | |
| "learning_rate": 1.7191003544259067e-09, | |
| "loss": 0.4451, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 2.9764177428411003, | |
| "grad_norm": 0.13616608083248138, | |
| "learning_rate": 1.100246925331283e-09, | |
| "loss": 0.4446, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 2.9809096013475576, | |
| "grad_norm": 0.1345696747303009, | |
| "learning_rate": 6.188988262373353e-10, | |
| "loss": 0.4473, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 2.985401459854015, | |
| "grad_norm": 0.13510431349277496, | |
| "learning_rate": 2.750692976444258e-10, | |
| "loss": 0.4373, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 2.9898933183604717, | |
| "grad_norm": 0.13065224885940552, | |
| "learning_rate": 6.876779731213035e-11, | |
| "loss": 0.4428, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 2.9943851768669285, | |
| "grad_norm": 0.1353355348110199, | |
| "learning_rate": 0.0, | |
| "loss": 0.4488, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 2.9943851768669285, | |
| "step": 666, | |
| "total_flos": 4607052148899840.0, | |
| "train_loss": 0.4853614707429846, | |
| "train_runtime": 116828.6857, | |
| "train_samples_per_second": 2.926, | |
| "train_steps_per_second": 0.006 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 666, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4607052148899840.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |