ReForm-SFT-14B / trainer_state.json
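
This file is the Hugging Face Trainer state for ReForm-SFT-14B: a single JSON object whose "log_history" array records "epoch", "grad_norm", "learning_rate", "loss", and "step" for each logged optimizer step (618 global steps over roughly 2 epochs, per the header fields below). As a minimal sketch of how one might inspect it, assuming the file has been downloaded locally as trainer_state.json (the file path and the summary printed are illustrative, not part of this upload):

    import json

    # Load the trainer state (assumed to be saved locally as trainer_state.json).
    with open("trainer_state.json") as f:
        state = json.load(f)

    # Keep only per-step training entries; each holds epoch, grad_norm,
    # learning_rate, loss, and step.
    history = [e for e in state["log_history"] if "loss" in e]
    steps = [e["step"] for e in history]
    losses = [e["loss"] for e in history]
    lrs = [e["learning_rate"] for e in history]

    print(f"logged steps: {len(steps)} (last global step {state['global_step']})")
    print(f"first/last loss: {losses[0]:.4f} -> {losses[-1]:.4f}")
    print(f"peak learning rate: {max(lrs):.2e}")

Reading the "learning_rate" column directly shows a linear warmup to the 5e-06 peak at step 62 followed by what looks like a cosine-style decay.
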
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9951534733441034,
"eval_steps": 500,
"global_step": 618,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032310177705977385,
"grad_norm": 25.128438265728068,
"learning_rate": 8.064516129032259e-08,
"loss": 0.1803,
"step": 1
},
{
"epoch": 0.006462035541195477,
"grad_norm": 20.481937139410615,
"learning_rate": 1.6129032258064518e-07,
"loss": 0.1841,
"step": 2
},
{
"epoch": 0.009693053311793215,
"grad_norm": 40.482947214989046,
"learning_rate": 2.4193548387096775e-07,
"loss": 0.2517,
"step": 3
},
{
"epoch": 0.012924071082390954,
"grad_norm": 56.10659153508464,
"learning_rate": 3.2258064516129035e-07,
"loss": 0.289,
"step": 4
},
{
"epoch": 0.01615508885298869,
"grad_norm": 40.7368526949032,
"learning_rate": 4.032258064516129e-07,
"loss": 0.2432,
"step": 5
},
{
"epoch": 0.01938610662358643,
"grad_norm": 36.018819191325036,
"learning_rate": 4.838709677419355e-07,
"loss": 0.1939,
"step": 6
},
{
"epoch": 0.022617124394184167,
"grad_norm": 64.22644318288756,
"learning_rate": 5.645161290322581e-07,
"loss": 0.2186,
"step": 7
},
{
"epoch": 0.025848142164781908,
"grad_norm": 68.80190588802398,
"learning_rate": 6.451612903225807e-07,
"loss": 0.237,
"step": 8
},
{
"epoch": 0.029079159935379646,
"grad_norm": 53.87483186618541,
"learning_rate": 7.258064516129033e-07,
"loss": 0.1904,
"step": 9
},
{
"epoch": 0.03231017770597738,
"grad_norm": 27.250940737956068,
"learning_rate": 8.064516129032258e-07,
"loss": 0.1914,
"step": 10
},
{
"epoch": 0.035541195476575124,
"grad_norm": 19.13516705978285,
"learning_rate": 8.870967741935485e-07,
"loss": 0.21,
"step": 11
},
{
"epoch": 0.03877221324717286,
"grad_norm": 5.971043423480582,
"learning_rate": 9.67741935483871e-07,
"loss": 0.1893,
"step": 12
},
{
"epoch": 0.0420032310177706,
"grad_norm": 8.917076042584409,
"learning_rate": 1.0483870967741936e-06,
"loss": 0.1672,
"step": 13
},
{
"epoch": 0.045234248788368334,
"grad_norm": 6.564634516149644,
"learning_rate": 1.1290322580645162e-06,
"loss": 0.1639,
"step": 14
},
{
"epoch": 0.048465266558966075,
"grad_norm": 7.4524362304900995,
"learning_rate": 1.2096774193548388e-06,
"loss": 0.1904,
"step": 15
},
{
"epoch": 0.051696284329563816,
"grad_norm": 6.424814341877685,
"learning_rate": 1.2903225806451614e-06,
"loss": 0.1539,
"step": 16
},
{
"epoch": 0.05492730210016155,
"grad_norm": 8.919971119630558,
"learning_rate": 1.3709677419354838e-06,
"loss": 0.1276,
"step": 17
},
{
"epoch": 0.05815831987075929,
"grad_norm": 8.858198890334634,
"learning_rate": 1.4516129032258066e-06,
"loss": 0.148,
"step": 18
},
{
"epoch": 0.061389337641357025,
"grad_norm": 8.19809300927439,
"learning_rate": 1.5322580645161292e-06,
"loss": 0.1528,
"step": 19
},
{
"epoch": 0.06462035541195477,
"grad_norm": 9.263957823023876,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.1654,
"step": 20
},
{
"epoch": 0.06785137318255251,
"grad_norm": 5.211246085563888,
"learning_rate": 1.6935483870967742e-06,
"loss": 0.1478,
"step": 21
},
{
"epoch": 0.07108239095315025,
"grad_norm": 4.16568459472329,
"learning_rate": 1.774193548387097e-06,
"loss": 0.1548,
"step": 22
},
{
"epoch": 0.07431340872374798,
"grad_norm": 2.1941339248998233,
"learning_rate": 1.8548387096774196e-06,
"loss": 0.1349,
"step": 23
},
{
"epoch": 0.07754442649434572,
"grad_norm": 1.5770028567175525,
"learning_rate": 1.935483870967742e-06,
"loss": 0.174,
"step": 24
},
{
"epoch": 0.08077544426494346,
"grad_norm": 1.606149082877329,
"learning_rate": 2.0161290322580646e-06,
"loss": 0.1468,
"step": 25
},
{
"epoch": 0.0840064620355412,
"grad_norm": 1.4435377235758515,
"learning_rate": 2.096774193548387e-06,
"loss": 0.1482,
"step": 26
},
{
"epoch": 0.08723747980613894,
"grad_norm": 1.4360703380719761,
"learning_rate": 2.17741935483871e-06,
"loss": 0.1404,
"step": 27
},
{
"epoch": 0.09046849757673667,
"grad_norm": 0.8101385337166851,
"learning_rate": 2.2580645161290324e-06,
"loss": 0.1142,
"step": 28
},
{
"epoch": 0.09369951534733441,
"grad_norm": 1.381962668180297,
"learning_rate": 2.338709677419355e-06,
"loss": 0.1331,
"step": 29
},
{
"epoch": 0.09693053311793215,
"grad_norm": 1.2208818326390372,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.1196,
"step": 30
},
{
"epoch": 0.10016155088852989,
"grad_norm": 1.1209301057498027,
"learning_rate": 2.5e-06,
"loss": 0.0986,
"step": 31
},
{
"epoch": 0.10339256865912763,
"grad_norm": 1.9664855281381353,
"learning_rate": 2.580645161290323e-06,
"loss": 0.133,
"step": 32
},
{
"epoch": 0.10662358642972536,
"grad_norm": 1.2526567677482312,
"learning_rate": 2.6612903225806454e-06,
"loss": 0.0969,
"step": 33
},
{
"epoch": 0.1098546042003231,
"grad_norm": 1.0939478434588246,
"learning_rate": 2.7419354838709676e-06,
"loss": 0.115,
"step": 34
},
{
"epoch": 0.11308562197092084,
"grad_norm": 0.8286974625967464,
"learning_rate": 2.822580645161291e-06,
"loss": 0.0941,
"step": 35
},
{
"epoch": 0.11631663974151858,
"grad_norm": 1.1449426230450832,
"learning_rate": 2.903225806451613e-06,
"loss": 0.1182,
"step": 36
},
{
"epoch": 0.11954765751211632,
"grad_norm": 1.188437444192784,
"learning_rate": 2.983870967741936e-06,
"loss": 0.1107,
"step": 37
},
{
"epoch": 0.12277867528271405,
"grad_norm": 1.1499792776405726,
"learning_rate": 3.0645161290322584e-06,
"loss": 0.1044,
"step": 38
},
{
"epoch": 0.1260096930533118,
"grad_norm": 1.0118348591126385,
"learning_rate": 3.145161290322581e-06,
"loss": 0.0926,
"step": 39
},
{
"epoch": 0.12924071082390953,
"grad_norm": 2.2856482367185813,
"learning_rate": 3.225806451612903e-06,
"loss": 0.0935,
"step": 40
},
{
"epoch": 0.13247172859450726,
"grad_norm": 1.3141305507246495,
"learning_rate": 3.306451612903226e-06,
"loss": 0.1411,
"step": 41
},
{
"epoch": 0.13570274636510501,
"grad_norm": 1.5200078779321133,
"learning_rate": 3.3870967741935484e-06,
"loss": 0.1086,
"step": 42
},
{
"epoch": 0.13893376413570274,
"grad_norm": 1.199141198842882,
"learning_rate": 3.4677419354838714e-06,
"loss": 0.0963,
"step": 43
},
{
"epoch": 0.1421647819063005,
"grad_norm": 0.9118534178360143,
"learning_rate": 3.548387096774194e-06,
"loss": 0.0838,
"step": 44
},
{
"epoch": 0.14539579967689822,
"grad_norm": 0.8313523608482088,
"learning_rate": 3.6290322580645166e-06,
"loss": 0.0822,
"step": 45
},
{
"epoch": 0.14862681744749595,
"grad_norm": 1.0602464666470106,
"learning_rate": 3.7096774193548392e-06,
"loss": 0.1031,
"step": 46
},
{
"epoch": 0.1518578352180937,
"grad_norm": 1.4189660331134808,
"learning_rate": 3.7903225806451614e-06,
"loss": 0.0985,
"step": 47
},
{
"epoch": 0.15508885298869143,
"grad_norm": 0.7654402241394538,
"learning_rate": 3.870967741935484e-06,
"loss": 0.0759,
"step": 48
},
{
"epoch": 0.1583198707592892,
"grad_norm": 1.5213372179407378,
"learning_rate": 3.951612903225807e-06,
"loss": 0.1288,
"step": 49
},
{
"epoch": 0.16155088852988692,
"grad_norm": 1.2908078430353152,
"learning_rate": 4.032258064516129e-06,
"loss": 0.1117,
"step": 50
},
{
"epoch": 0.16478190630048464,
"grad_norm": 1.0569868777193876,
"learning_rate": 4.112903225806452e-06,
"loss": 0.0983,
"step": 51
},
{
"epoch": 0.1680129240710824,
"grad_norm": 0.9711071114660816,
"learning_rate": 4.193548387096774e-06,
"loss": 0.0911,
"step": 52
},
{
"epoch": 0.17124394184168013,
"grad_norm": 1.102405320517146,
"learning_rate": 4.274193548387097e-06,
"loss": 0.1037,
"step": 53
},
{
"epoch": 0.17447495961227788,
"grad_norm": 0.9937966694938649,
"learning_rate": 4.35483870967742e-06,
"loss": 0.0815,
"step": 54
},
{
"epoch": 0.1777059773828756,
"grad_norm": 1.0718680838436345,
"learning_rate": 4.435483870967742e-06,
"loss": 0.1004,
"step": 55
},
{
"epoch": 0.18093699515347333,
"grad_norm": 1.1717408640580886,
"learning_rate": 4.516129032258065e-06,
"loss": 0.1278,
"step": 56
},
{
"epoch": 0.1841680129240711,
"grad_norm": 0.9371050661628969,
"learning_rate": 4.596774193548387e-06,
"loss": 0.09,
"step": 57
},
{
"epoch": 0.18739903069466882,
"grad_norm": 0.7008244964012689,
"learning_rate": 4.67741935483871e-06,
"loss": 0.0704,
"step": 58
},
{
"epoch": 0.19063004846526657,
"grad_norm": 1.198678087514701,
"learning_rate": 4.758064516129033e-06,
"loss": 0.0966,
"step": 59
},
{
"epoch": 0.1938610662358643,
"grad_norm": 1.237062798014623,
"learning_rate": 4.838709677419355e-06,
"loss": 0.108,
"step": 60
},
{
"epoch": 0.19709208400646203,
"grad_norm": 0.9791118203985985,
"learning_rate": 4.919354838709678e-06,
"loss": 0.0991,
"step": 61
},
{
"epoch": 0.20032310177705978,
"grad_norm": 1.0841634147338435,
"learning_rate": 5e-06,
"loss": 0.0934,
"step": 62
},
{
"epoch": 0.2035541195476575,
"grad_norm": 1.258223416727101,
"learning_rate": 4.999960092086724e-06,
"loss": 0.1219,
"step": 63
},
{
"epoch": 0.20678513731825526,
"grad_norm": 1.0617428668947297,
"learning_rate": 4.999840369621011e-06,
"loss": 0.1124,
"step": 64
},
{
"epoch": 0.210016155088853,
"grad_norm": 1.213736182334779,
"learning_rate": 4.999640836425159e-06,
"loss": 0.1008,
"step": 65
},
{
"epoch": 0.21324717285945072,
"grad_norm": 1.100400949090305,
"learning_rate": 4.99936149886953e-06,
"loss": 0.0732,
"step": 66
},
{
"epoch": 0.21647819063004847,
"grad_norm": 1.0182260583019065,
"learning_rate": 4.999002365872348e-06,
"loss": 0.0818,
"step": 67
},
{
"epoch": 0.2197092084006462,
"grad_norm": 1.0758854772406632,
"learning_rate": 4.998563448899413e-06,
"loss": 0.1152,
"step": 68
},
{
"epoch": 0.22294022617124395,
"grad_norm": 0.7655531388319169,
"learning_rate": 4.998044761963731e-06,
"loss": 0.0629,
"step": 69
},
{
"epoch": 0.22617124394184168,
"grad_norm": 0.9063698842381128,
"learning_rate": 4.9974463216250735e-06,
"loss": 0.0947,
"step": 70
},
{
"epoch": 0.2294022617124394,
"grad_norm": 1.111926907799753,
"learning_rate": 4.996768146989446e-06,
"loss": 0.0869,
"step": 71
},
{
"epoch": 0.23263327948303716,
"grad_norm": 1.2171632327405464,
"learning_rate": 4.996010259708475e-06,
"loss": 0.0906,
"step": 72
},
{
"epoch": 0.2358642972536349,
"grad_norm": 1.1760501880854652,
"learning_rate": 4.99517268397872e-06,
"loss": 0.0905,
"step": 73
},
{
"epoch": 0.23909531502423265,
"grad_norm": 0.9925140924590656,
"learning_rate": 4.9942554465409e-06,
"loss": 0.0759,
"step": 74
},
{
"epoch": 0.24232633279483037,
"grad_norm": 1.1900673708039176,
"learning_rate": 4.993258576679043e-06,
"loss": 0.0959,
"step": 75
},
{
"epoch": 0.2455573505654281,
"grad_norm": 1.0525284959082881,
"learning_rate": 4.9921821062195445e-06,
"loss": 0.0854,
"step": 76
},
{
"epoch": 0.24878836833602586,
"grad_norm": 0.9782484128440981,
"learning_rate": 4.991026069530156e-06,
"loss": 0.0811,
"step": 77
},
{
"epoch": 0.2520193861066236,
"grad_norm": 1.099947189666444,
"learning_rate": 4.989790503518888e-06,
"loss": 0.1213,
"step": 78
},
{
"epoch": 0.2552504038772213,
"grad_norm": 0.9763034951592139,
"learning_rate": 4.988475447632829e-06,
"loss": 0.0978,
"step": 79
},
{
"epoch": 0.25848142164781907,
"grad_norm": 1.1059217088306985,
"learning_rate": 4.987080943856887e-06,
"loss": 0.1188,
"step": 80
},
{
"epoch": 0.2617124394184168,
"grad_norm": 0.8811301090737587,
"learning_rate": 4.985607036712453e-06,
"loss": 0.0959,
"step": 81
},
{
"epoch": 0.2649434571890145,
"grad_norm": 0.876445346062972,
"learning_rate": 4.984053773255971e-06,
"loss": 0.0942,
"step": 82
},
{
"epoch": 0.2681744749596123,
"grad_norm": 0.9532526347121169,
"learning_rate": 4.982421203077446e-06,
"loss": 0.0877,
"step": 83
},
{
"epoch": 0.27140549273021003,
"grad_norm": 1.123157927763249,
"learning_rate": 4.980709378298851e-06,
"loss": 0.1278,
"step": 84
},
{
"epoch": 0.27463651050080773,
"grad_norm": 1.0388291288831657,
"learning_rate": 4.978918353572471e-06,
"loss": 0.0711,
"step": 85
},
{
"epoch": 0.2778675282714055,
"grad_norm": 0.9767051395631638,
"learning_rate": 4.977048186079155e-06,
"loss": 0.0778,
"step": 86
},
{
"epoch": 0.28109854604200324,
"grad_norm": 0.796702172980499,
"learning_rate": 4.975098935526487e-06,
"loss": 0.0737,
"step": 87
},
{
"epoch": 0.284329563812601,
"grad_norm": 0.930052043825522,
"learning_rate": 4.973070664146885e-06,
"loss": 0.0797,
"step": 88
},
{
"epoch": 0.2875605815831987,
"grad_norm": 0.8599629277716169,
"learning_rate": 4.970963436695612e-06,
"loss": 0.0736,
"step": 89
},
{
"epoch": 0.29079159935379645,
"grad_norm": 1.2487651297160884,
"learning_rate": 4.968777320448707e-06,
"loss": 0.1234,
"step": 90
},
{
"epoch": 0.2940226171243942,
"grad_norm": 1.3265846927372396,
"learning_rate": 4.966512385200841e-06,
"loss": 0.097,
"step": 91
},
{
"epoch": 0.2972536348949919,
"grad_norm": 1.2653863711384867,
"learning_rate": 4.964168703263086e-06,
"loss": 0.0851,
"step": 92
},
{
"epoch": 0.30048465266558966,
"grad_norm": 1.05226163192955,
"learning_rate": 4.961746349460607e-06,
"loss": 0.0901,
"step": 93
},
{
"epoch": 0.3037156704361874,
"grad_norm": 0.7053264391935079,
"learning_rate": 4.959245401130269e-06,
"loss": 0.0472,
"step": 94
},
{
"epoch": 0.3069466882067851,
"grad_norm": 1.1693965687082317,
"learning_rate": 4.956665938118179e-06,
"loss": 0.0909,
"step": 95
},
{
"epoch": 0.31017770597738287,
"grad_norm": 1.0877301086902282,
"learning_rate": 4.954008042777125e-06,
"loss": 0.0897,
"step": 96
},
{
"epoch": 0.3134087237479806,
"grad_norm": 0.9145033111059097,
"learning_rate": 4.951271799963952e-06,
"loss": 0.0871,
"step": 97
},
{
"epoch": 0.3166397415185784,
"grad_norm": 0.8706912416502465,
"learning_rate": 4.9484572970368516e-06,
"loss": 0.0656,
"step": 98
},
{
"epoch": 0.3198707592891761,
"grad_norm": 1.1252067732287794,
"learning_rate": 4.945564623852577e-06,
"loss": 0.0916,
"step": 99
},
{
"epoch": 0.32310177705977383,
"grad_norm": 0.9809013164181603,
"learning_rate": 4.942593872763566e-06,
"loss": 0.0671,
"step": 100
},
{
"epoch": 0.3263327948303716,
"grad_norm": 0.9939061476745602,
"learning_rate": 4.939545138615003e-06,
"loss": 0.0796,
"step": 101
},
{
"epoch": 0.3295638126009693,
"grad_norm": 0.8729446605586859,
"learning_rate": 4.93641851874178e-06,
"loss": 0.1031,
"step": 102
},
{
"epoch": 0.33279483037156704,
"grad_norm": 0.8275137143927102,
"learning_rate": 4.933214112965399e-06,
"loss": 0.098,
"step": 103
},
{
"epoch": 0.3360258481421648,
"grad_norm": 1.1250877000414508,
"learning_rate": 4.929932023590776e-06,
"loss": 0.0818,
"step": 104
},
{
"epoch": 0.3392568659127625,
"grad_norm": 0.6635247677362335,
"learning_rate": 4.926572355402983e-06,
"loss": 0.069,
"step": 105
},
{
"epoch": 0.34248788368336025,
"grad_norm": 0.7585575652472656,
"learning_rate": 4.923135215663897e-06,
"loss": 0.0574,
"step": 106
},
{
"epoch": 0.345718901453958,
"grad_norm": 0.8301240151962479,
"learning_rate": 4.919620714108777e-06,
"loss": 0.0894,
"step": 107
},
{
"epoch": 0.34894991922455576,
"grad_norm": 0.7584921353525548,
"learning_rate": 4.916028962942763e-06,
"loss": 0.0693,
"step": 108
},
{
"epoch": 0.35218093699515346,
"grad_norm": 0.8546005601207664,
"learning_rate": 4.912360076837289e-06,
"loss": 0.0853,
"step": 109
},
{
"epoch": 0.3554119547657512,
"grad_norm": 0.7665244261956478,
"learning_rate": 4.908614172926426e-06,
"loss": 0.0555,
"step": 110
},
{
"epoch": 0.35864297253634897,
"grad_norm": 0.7090721453253053,
"learning_rate": 4.904791370803141e-06,
"loss": 0.0809,
"step": 111
},
{
"epoch": 0.36187399030694667,
"grad_norm": 0.9004480711031451,
"learning_rate": 4.9008917925154795e-06,
"loss": 0.0798,
"step": 112
},
{
"epoch": 0.3651050080775444,
"grad_norm": 0.6681566874499523,
"learning_rate": 4.896915562562665e-06,
"loss": 0.0567,
"step": 113
},
{
"epoch": 0.3683360258481422,
"grad_norm": 0.7831071753829433,
"learning_rate": 4.892862807891131e-06,
"loss": 0.0886,
"step": 114
},
{
"epoch": 0.3715670436187399,
"grad_norm": 0.8572105799904943,
"learning_rate": 4.888733657890463e-06,
"loss": 0.0941,
"step": 115
},
{
"epoch": 0.37479806138933763,
"grad_norm": 0.6839139362191016,
"learning_rate": 4.884528244389269e-06,
"loss": 0.0499,
"step": 116
},
{
"epoch": 0.3780290791599354,
"grad_norm": 0.7535937955610514,
"learning_rate": 4.8802467016509704e-06,
"loss": 0.0648,
"step": 117
},
{
"epoch": 0.38126009693053314,
"grad_norm": 0.7278973767612748,
"learning_rate": 4.8758891663695165e-06,
"loss": 0.0647,
"step": 118
},
{
"epoch": 0.38449111470113084,
"grad_norm": 0.8658527730630694,
"learning_rate": 4.87145577766502e-06,
"loss": 0.0694,
"step": 119
},
{
"epoch": 0.3877221324717286,
"grad_norm": 0.5993174371068741,
"learning_rate": 4.866946677079314e-06,
"loss": 0.0532,
"step": 120
},
{
"epoch": 0.39095315024232635,
"grad_norm": 0.9585898899909292,
"learning_rate": 4.862362008571434e-06,
"loss": 0.081,
"step": 121
},
{
"epoch": 0.39418416801292405,
"grad_norm": 0.8402658697480521,
"learning_rate": 4.857701918513023e-06,
"loss": 0.0661,
"step": 122
},
{
"epoch": 0.3974151857835218,
"grad_norm": 0.8695072904889961,
"learning_rate": 4.852966555683657e-06,
"loss": 0.0777,
"step": 123
},
{
"epoch": 0.40064620355411956,
"grad_norm": 0.8700878343519065,
"learning_rate": 4.848156071266095e-06,
"loss": 0.0669,
"step": 124
},
{
"epoch": 0.40387722132471726,
"grad_norm": 0.7573866683757483,
"learning_rate": 4.843270618841455e-06,
"loss": 0.0659,
"step": 125
},
{
"epoch": 0.407108239095315,
"grad_norm": 0.7144861602580213,
"learning_rate": 4.838310354384304e-06,
"loss": 0.0673,
"step": 126
},
{
"epoch": 0.41033925686591277,
"grad_norm": 0.6521772244924818,
"learning_rate": 4.833275436257684e-06,
"loss": 0.0487,
"step": 127
},
{
"epoch": 0.4135702746365105,
"grad_norm": 0.9937642189435811,
"learning_rate": 4.828166025208059e-06,
"loss": 0.0748,
"step": 128
},
{
"epoch": 0.4168012924071082,
"grad_norm": 0.838066231893655,
"learning_rate": 4.822982284360173e-06,
"loss": 0.0732,
"step": 129
},
{
"epoch": 0.420032310177706,
"grad_norm": 0.9685536935346779,
"learning_rate": 4.8177243792118515e-06,
"loss": 0.1053,
"step": 130
},
{
"epoch": 0.42326332794830374,
"grad_norm": 0.8362566096259404,
"learning_rate": 4.8123924776287115e-06,
"loss": 0.0767,
"step": 131
},
{
"epoch": 0.42649434571890144,
"grad_norm": 0.7707131560838676,
"learning_rate": 4.8069867498388066e-06,
"loss": 0.059,
"step": 132
},
{
"epoch": 0.4297253634894992,
"grad_norm": 0.8446766172300785,
"learning_rate": 4.80150736842719e-06,
"loss": 0.0715,
"step": 133
},
{
"epoch": 0.43295638126009695,
"grad_norm": 0.9888301615866294,
"learning_rate": 4.795954508330403e-06,
"loss": 0.0816,
"step": 134
},
{
"epoch": 0.43618739903069464,
"grad_norm": 0.5818611202414509,
"learning_rate": 4.790328346830893e-06,
"loss": 0.0579,
"step": 135
},
{
"epoch": 0.4394184168012924,
"grad_norm": 0.7358503512846166,
"learning_rate": 4.784629063551354e-06,
"loss": 0.0729,
"step": 136
},
{
"epoch": 0.44264943457189015,
"grad_norm": 0.6475363027245936,
"learning_rate": 4.778856840448985e-06,
"loss": 0.0595,
"step": 137
},
{
"epoch": 0.4458804523424879,
"grad_norm": 0.8615033443900738,
"learning_rate": 4.773011861809694e-06,
"loss": 0.0951,
"step": 138
},
{
"epoch": 0.4491114701130856,
"grad_norm": 0.6927048507197057,
"learning_rate": 4.7670943142421955e-06,
"loss": 0.0524,
"step": 139
},
{
"epoch": 0.45234248788368336,
"grad_norm": 1.051050914153071,
"learning_rate": 4.761104386672074e-06,
"loss": 0.0766,
"step": 140
},
{
"epoch": 0.4555735056542811,
"grad_norm": 0.5989675474056907,
"learning_rate": 4.7550422703357355e-06,
"loss": 0.0701,
"step": 141
},
{
"epoch": 0.4588045234248788,
"grad_norm": 0.7775916205990767,
"learning_rate": 4.748908158774312e-06,
"loss": 0.0587,
"step": 142
},
{
"epoch": 0.4620355411954766,
"grad_norm": 0.6567814970993029,
"learning_rate": 4.742702247827476e-06,
"loss": 0.0558,
"step": 143
},
{
"epoch": 0.46526655896607433,
"grad_norm": 0.8201443957683763,
"learning_rate": 4.736424735627193e-06,
"loss": 0.066,
"step": 144
},
{
"epoch": 0.46849757673667203,
"grad_norm": 0.767995936704651,
"learning_rate": 4.730075822591392e-06,
"loss": 0.0698,
"step": 145
},
{
"epoch": 0.4717285945072698,
"grad_norm": 0.8765270563094147,
"learning_rate": 4.7236557114175705e-06,
"loss": 0.0752,
"step": 146
},
{
"epoch": 0.47495961227786754,
"grad_norm": 0.8060072499383857,
"learning_rate": 4.71716460707632e-06,
"loss": 0.0776,
"step": 147
},
{
"epoch": 0.4781906300484653,
"grad_norm": 0.7249857982210036,
"learning_rate": 4.710602716804784e-06,
"loss": 0.0645,
"step": 148
},
{
"epoch": 0.481421647819063,
"grad_norm": 0.9856585082866961,
"learning_rate": 4.703970250100041e-06,
"loss": 0.0813,
"step": 149
},
{
"epoch": 0.48465266558966075,
"grad_norm": 0.7832993384259341,
"learning_rate": 4.697267418712415e-06,
"loss": 0.0855,
"step": 150
},
{
"epoch": 0.4878836833602585,
"grad_norm": 0.7537716514815709,
"learning_rate": 4.690494436638718e-06,
"loss": 0.0662,
"step": 151
},
{
"epoch": 0.4911147011308562,
"grad_norm": 0.6970576661817561,
"learning_rate": 4.683651520115414e-06,
"loss": 0.0526,
"step": 152
},
{
"epoch": 0.49434571890145396,
"grad_norm": 0.6025061116050546,
"learning_rate": 4.67673888761172e-06,
"loss": 0.0423,
"step": 153
},
{
"epoch": 0.4975767366720517,
"grad_norm": 0.8666593448997953,
"learning_rate": 4.669756759822625e-06,
"loss": 0.0769,
"step": 154
},
{
"epoch": 0.5008077544426495,
"grad_norm": 0.8822574390448034,
"learning_rate": 4.66270535966185e-06,
"loss": 0.0835,
"step": 155
},
{
"epoch": 0.5040387722132472,
"grad_norm": 0.7966361049316173,
"learning_rate": 4.655584912254727e-06,
"loss": 0.0702,
"step": 156
},
{
"epoch": 0.5072697899838449,
"grad_norm": 0.975295266397441,
"learning_rate": 4.6483956449310155e-06,
"loss": 0.0868,
"step": 157
},
{
"epoch": 0.5105008077544426,
"grad_norm": 0.6419609425416163,
"learning_rate": 4.64113778721764e-06,
"loss": 0.0563,
"step": 158
},
{
"epoch": 0.5137318255250404,
"grad_norm": 1.074461428386011,
"learning_rate": 4.633811570831367e-06,
"loss": 0.1019,
"step": 159
},
{
"epoch": 0.5169628432956381,
"grad_norm": 0.7470516638155429,
"learning_rate": 4.626417229671401e-06,
"loss": 0.0699,
"step": 160
},
{
"epoch": 0.5201938610662359,
"grad_norm": 0.6593218212471074,
"learning_rate": 4.6189549998119235e-06,
"loss": 0.0489,
"step": 161
},
{
"epoch": 0.5234248788368336,
"grad_norm": 0.9595803117243349,
"learning_rate": 4.611425119494552e-06,
"loss": 0.0784,
"step": 162
},
{
"epoch": 0.5266558966074314,
"grad_norm": 0.6236918909086313,
"learning_rate": 4.603827829120734e-06,
"loss": 0.0581,
"step": 163
},
{
"epoch": 0.529886914378029,
"grad_norm": 0.94480008993302,
"learning_rate": 4.596163371244076e-06,
"loss": 0.0672,
"step": 164
},
{
"epoch": 0.5331179321486268,
"grad_norm": 0.8140229354538353,
"learning_rate": 4.588431990562593e-06,
"loss": 0.0557,
"step": 165
},
{
"epoch": 0.5363489499192245,
"grad_norm": 0.6881752667822972,
"learning_rate": 4.580633933910901e-06,
"loss": 0.0499,
"step": 166
},
{
"epoch": 0.5395799676898223,
"grad_norm": 0.6835421331522927,
"learning_rate": 4.572769450252335e-06,
"loss": 0.0726,
"step": 167
},
{
"epoch": 0.5428109854604201,
"grad_norm": 0.6690598961735207,
"learning_rate": 4.564838790671e-06,
"loss": 0.0553,
"step": 168
},
{
"epoch": 0.5460420032310178,
"grad_norm": 1.0189532436328723,
"learning_rate": 4.556842208363756e-06,
"loss": 0.0865,
"step": 169
},
{
"epoch": 0.5492730210016155,
"grad_norm": 0.6275318178066402,
"learning_rate": 4.548779958632134e-06,
"loss": 0.0549,
"step": 170
},
{
"epoch": 0.5525040387722132,
"grad_norm": 0.6758827324437686,
"learning_rate": 4.540652298874183e-06,
"loss": 0.055,
"step": 171
},
{
"epoch": 0.555735056542811,
"grad_norm": 0.7969599802065607,
"learning_rate": 4.532459488576258e-06,
"loss": 0.0667,
"step": 172
},
{
"epoch": 0.5589660743134087,
"grad_norm": 0.8835274962394156,
"learning_rate": 4.524201789304727e-06,
"loss": 0.1042,
"step": 173
},
{
"epoch": 0.5621970920840065,
"grad_norm": 1.0052690224759626,
"learning_rate": 4.515879464697629e-06,
"loss": 0.0676,
"step": 174
},
{
"epoch": 0.5654281098546042,
"grad_norm": 1.2550940752148396,
"learning_rate": 4.507492780456249e-06,
"loss": 0.0873,
"step": 175
},
{
"epoch": 0.568659127625202,
"grad_norm": 0.7510769288869017,
"learning_rate": 4.499042004336642e-06,
"loss": 0.0593,
"step": 176
},
{
"epoch": 0.5718901453957996,
"grad_norm": 1.0704861891317858,
"learning_rate": 4.490527406141081e-06,
"loss": 0.1084,
"step": 177
},
{
"epoch": 0.5751211631663974,
"grad_norm": 0.6284718527814975,
"learning_rate": 4.481949257709442e-06,
"loss": 0.0508,
"step": 178
},
{
"epoch": 0.5783521809369951,
"grad_norm": 0.7735314263403176,
"learning_rate": 4.4733078329105296e-06,
"loss": 0.071,
"step": 179
},
{
"epoch": 0.5815831987075929,
"grad_norm": 0.7328207313228111,
"learning_rate": 4.464603407633326e-06,
"loss": 0.0616,
"step": 180
},
{
"epoch": 0.5848142164781907,
"grad_norm": 0.8860553425416037,
"learning_rate": 4.455836259778193e-06,
"loss": 0.0933,
"step": 181
},
{
"epoch": 0.5880452342487884,
"grad_norm": 0.7335813522711138,
"learning_rate": 4.44700666924799e-06,
"loss": 0.0778,
"step": 182
},
{
"epoch": 0.5912762520193862,
"grad_norm": 1.0224460571070495,
"learning_rate": 4.438114917939145e-06,
"loss": 0.0816,
"step": 183
},
{
"epoch": 0.5945072697899838,
"grad_norm": 0.5929045336796159,
"learning_rate": 4.42916128973265e-06,
"loss": 0.0544,
"step": 184
},
{
"epoch": 0.5977382875605816,
"grad_norm": 0.9356825212983167,
"learning_rate": 4.420146070484997e-06,
"loss": 0.0697,
"step": 185
},
{
"epoch": 0.6009693053311793,
"grad_norm": 0.9940780034167175,
"learning_rate": 4.41106954801906e-06,
"loss": 0.0849,
"step": 186
},
{
"epoch": 0.6042003231017771,
"grad_norm": 1.1206234673942042,
"learning_rate": 4.401932012114893e-06,
"loss": 0.0822,
"step": 187
},
{
"epoch": 0.6074313408723748,
"grad_norm": 0.5583759826731935,
"learning_rate": 4.39273375450049e-06,
"loss": 0.0516,
"step": 188
},
{
"epoch": 0.6106623586429726,
"grad_norm": 0.6454315459498958,
"learning_rate": 4.383475068842464e-06,
"loss": 0.0577,
"step": 189
},
{
"epoch": 0.6138933764135702,
"grad_norm": 1.1332732216792498,
"learning_rate": 4.3741562507366754e-06,
"loss": 0.0655,
"step": 190
},
{
"epoch": 0.617124394184168,
"grad_norm": 0.8318517770779469,
"learning_rate": 4.36477759769879e-06,
"loss": 0.0954,
"step": 191
},
{
"epoch": 0.6203554119547657,
"grad_norm": 1.3985194163380206,
"learning_rate": 4.355339409154788e-06,
"loss": 0.119,
"step": 192
},
{
"epoch": 0.6235864297253635,
"grad_norm": 0.8270239497445794,
"learning_rate": 4.345841986431396e-06,
"loss": 0.0637,
"step": 193
},
{
"epoch": 0.6268174474959612,
"grad_norm": 0.7760464412137208,
"learning_rate": 4.336285632746472e-06,
"loss": 0.0884,
"step": 194
},
{
"epoch": 0.630048465266559,
"grad_norm": 0.7692351681728313,
"learning_rate": 4.326670653199323e-06,
"loss": 0.0585,
"step": 195
},
{
"epoch": 0.6332794830371568,
"grad_norm": 0.66023769139009,
"learning_rate": 4.316997354760965e-06,
"loss": 0.0678,
"step": 196
},
{
"epoch": 0.6365105008077544,
"grad_norm": 0.6890195746804293,
"learning_rate": 4.307266046264323e-06,
"loss": 0.0664,
"step": 197
},
{
"epoch": 0.6397415185783522,
"grad_norm": 0.847865514008143,
"learning_rate": 4.297477038394368e-06,
"loss": 0.0825,
"step": 198
},
{
"epoch": 0.6429725363489499,
"grad_norm": 0.9550408154272941,
"learning_rate": 4.287630643678204e-06,
"loss": 0.0744,
"step": 199
},
{
"epoch": 0.6462035541195477,
"grad_norm": 0.857502218920431,
"learning_rate": 4.2777271764750805e-06,
"loss": 0.0671,
"step": 200
},
{
"epoch": 0.6494345718901454,
"grad_norm": 0.8052709789557946,
"learning_rate": 4.267766952966369e-06,
"loss": 0.0612,
"step": 201
},
{
"epoch": 0.6526655896607432,
"grad_norm": 1.083089645941659,
"learning_rate": 4.257750291145457e-06,
"loss": 0.0822,
"step": 202
},
{
"epoch": 0.6558966074313409,
"grad_norm": 0.984740295179814,
"learning_rate": 4.247677510807602e-06,
"loss": 0.056,
"step": 203
},
{
"epoch": 0.6591276252019386,
"grad_norm": 0.8835020576624913,
"learning_rate": 4.237548933539718e-06,
"loss": 0.0758,
"step": 204
},
{
"epoch": 0.6623586429725363,
"grad_norm": 0.6564577266758135,
"learning_rate": 4.227364882710114e-06,
"loss": 0.0563,
"step": 205
},
{
"epoch": 0.6655896607431341,
"grad_norm": 1.1066641198832807,
"learning_rate": 4.217125683458162e-06,
"loss": 0.083,
"step": 206
},
{
"epoch": 0.6688206785137318,
"grad_norm": 0.6596522236176221,
"learning_rate": 4.206831662683922e-06,
"loss": 0.0661,
"step": 207
},
{
"epoch": 0.6720516962843296,
"grad_norm": 1.3814266436856864,
"learning_rate": 4.196483149037707e-06,
"loss": 0.1065,
"step": 208
},
{
"epoch": 0.6752827140549273,
"grad_norm": 0.8566187355631336,
"learning_rate": 4.186080472909582e-06,
"loss": 0.0798,
"step": 209
},
{
"epoch": 0.678513731825525,
"grad_norm": 0.7295355541151259,
"learning_rate": 4.1756239664188275e-06,
"loss": 0.0575,
"step": 210
},
{
"epoch": 0.6817447495961227,
"grad_norm": 1.1292799421470592,
"learning_rate": 4.165113963403326e-06,
"loss": 0.0721,
"step": 211
},
{
"epoch": 0.6849757673667205,
"grad_norm": 0.6756065514670196,
"learning_rate": 4.154550799408906e-06,
"loss": 0.0627,
"step": 212
},
{
"epoch": 0.6882067851373183,
"grad_norm": 0.7404052802160156,
"learning_rate": 4.143934811678637e-06,
"loss": 0.0535,
"step": 213
},
{
"epoch": 0.691437802907916,
"grad_norm": 1.022807263526455,
"learning_rate": 4.1332663391420515e-06,
"loss": 0.0854,
"step": 214
},
{
"epoch": 0.6946688206785138,
"grad_norm": 0.8472576700436739,
"learning_rate": 4.1225457224043316e-06,
"loss": 0.0623,
"step": 215
},
{
"epoch": 0.6978998384491115,
"grad_norm": 0.682205907382696,
"learning_rate": 4.111773303735432e-06,
"loss": 0.0523,
"step": 216
},
{
"epoch": 0.7011308562197092,
"grad_norm": 0.8113418772511839,
"learning_rate": 4.100949427059151e-06,
"loss": 0.0789,
"step": 217
},
{
"epoch": 0.7043618739903069,
"grad_norm": 0.9832925631785678,
"learning_rate": 4.090074437942155e-06,
"loss": 0.0723,
"step": 218
},
{
"epoch": 0.7075928917609047,
"grad_norm": 0.8721900501399756,
"learning_rate": 4.079148683582943e-06,
"loss": 0.0882,
"step": 219
},
{
"epoch": 0.7108239095315024,
"grad_norm": 0.6650598159524921,
"learning_rate": 4.06817251280076e-06,
"loss": 0.0645,
"step": 220
},
{
"epoch": 0.7140549273021002,
"grad_norm": 0.5794028281825343,
"learning_rate": 4.0571462760244626e-06,
"loss": 0.0505,
"step": 221
},
{
"epoch": 0.7172859450726979,
"grad_norm": 0.766211455982521,
"learning_rate": 4.046070325281333e-06,
"loss": 0.0716,
"step": 222
},
{
"epoch": 0.7205169628432956,
"grad_norm": 0.6405405389062213,
"learning_rate": 4.034945014185836e-06,
"loss": 0.0682,
"step": 223
},
{
"epoch": 0.7237479806138933,
"grad_norm": 0.7302480630044077,
"learning_rate": 4.0237706979283306e-06,
"loss": 0.0813,
"step": 224
},
{
"epoch": 0.7269789983844911,
"grad_norm": 0.8456954793821594,
"learning_rate": 4.012547733263734e-06,
"loss": 0.0672,
"step": 225
},
{
"epoch": 0.7302100161550888,
"grad_norm": 0.6987533622613603,
"learning_rate": 4.001276478500127e-06,
"loss": 0.0606,
"step": 226
},
{
"epoch": 0.7334410339256866,
"grad_norm": 0.7583800796325042,
"learning_rate": 3.989957293487314e-06,
"loss": 0.048,
"step": 227
},
{
"epoch": 0.7366720516962844,
"grad_norm": 0.8822810810943023,
"learning_rate": 3.978590539605338e-06,
"loss": 0.0604,
"step": 228
},
{
"epoch": 0.7399030694668821,
"grad_norm": 0.8557818511714256,
"learning_rate": 3.967176579752943e-06,
"loss": 0.0589,
"step": 229
},
{
"epoch": 0.7431340872374798,
"grad_norm": 0.5503873101604418,
"learning_rate": 3.955715778335984e-06,
"loss": 0.0528,
"step": 230
},
{
"epoch": 0.7463651050080775,
"grad_norm": 0.8045709480783542,
"learning_rate": 3.944208501255796e-06,
"loss": 0.0821,
"step": 231
},
{
"epoch": 0.7495961227786753,
"grad_norm": 0.752743452245411,
"learning_rate": 3.932655115897513e-06,
"loss": 0.0916,
"step": 232
},
{
"epoch": 0.752827140549273,
"grad_norm": 0.9726656057832658,
"learning_rate": 3.9210559911183345e-06,
"loss": 0.0826,
"step": 233
},
{
"epoch": 0.7560581583198708,
"grad_norm": 0.8247240601878854,
"learning_rate": 3.909411497235752e-06,
"loss": 0.0712,
"step": 234
},
{
"epoch": 0.7592891760904685,
"grad_norm": 0.5846761529324903,
"learning_rate": 3.89772200601573e-06,
"loss": 0.0497,
"step": 235
},
{
"epoch": 0.7625201938610663,
"grad_norm": 1.215906602352585,
"learning_rate": 3.885987890660828e-06,
"loss": 0.096,
"step": 236
},
{
"epoch": 0.7657512116316639,
"grad_norm": 0.7228133512999668,
"learning_rate": 3.874209525798293e-06,
"loss": 0.0676,
"step": 237
},
{
"epoch": 0.7689822294022617,
"grad_norm": 0.628874981721426,
"learning_rate": 3.862387287468095e-06,
"loss": 0.0638,
"step": 238
},
{
"epoch": 0.7722132471728594,
"grad_norm": 0.8665871369668172,
"learning_rate": 3.850521553110924e-06,
"loss": 0.0676,
"step": 239
},
{
"epoch": 0.7754442649434572,
"grad_norm": 0.7726368504889838,
"learning_rate": 3.838612701556138e-06,
"loss": 0.0617,
"step": 240
},
{
"epoch": 0.778675282714055,
"grad_norm": 0.8259876486738208,
"learning_rate": 3.826661113009671e-06,
"loss": 0.0827,
"step": 241
},
{
"epoch": 0.7819063004846527,
"grad_norm": 0.5825223238179703,
"learning_rate": 3.814667169041887e-06,
"loss": 0.0536,
"step": 242
},
{
"epoch": 0.7851373182552503,
"grad_norm": 0.8337660440150237,
"learning_rate": 3.8026312525754095e-06,
"loss": 0.0805,
"step": 243
},
{
"epoch": 0.7883683360258481,
"grad_norm": 1.2051840693103852,
"learning_rate": 3.790553747872885e-06,
"loss": 0.0876,
"step": 244
},
{
"epoch": 0.7915993537964459,
"grad_norm": 1.0177651978897424,
"learning_rate": 3.778435040524722e-06,
"loss": 0.088,
"step": 245
},
{
"epoch": 0.7948303715670436,
"grad_norm": 0.5088711298667009,
"learning_rate": 3.766275517436779e-06,
"loss": 0.0436,
"step": 246
},
{
"epoch": 0.7980613893376414,
"grad_norm": 0.7256417368432645,
"learning_rate": 3.75407556681801e-06,
"loss": 0.0567,
"step": 247
},
{
"epoch": 0.8012924071082391,
"grad_norm": 0.6647991839006098,
"learning_rate": 3.741835578168071e-06,
"loss": 0.0682,
"step": 248
},
{
"epoch": 0.8045234248788369,
"grad_norm": 0.6160939622053359,
"learning_rate": 3.7295559422648874e-06,
"loss": 0.0471,
"step": 249
},
{
"epoch": 0.8077544426494345,
"grad_norm": 0.9179464461903228,
"learning_rate": 3.717237051152175e-06,
"loss": 0.0774,
"step": 250
},
{
"epoch": 0.8109854604200323,
"grad_norm": 1.0639899619677535,
"learning_rate": 3.7048792981269245e-06,
"loss": 0.076,
"step": 251
},
{
"epoch": 0.81421647819063,
"grad_norm": 0.6225213021803089,
"learning_rate": 3.692483077726843e-06,
"loss": 0.0499,
"step": 252
},
{
"epoch": 0.8174474959612278,
"grad_norm": 0.6507281120193775,
"learning_rate": 3.6800487857177636e-06,
"loss": 0.0542,
"step": 253
},
{
"epoch": 0.8206785137318255,
"grad_norm": 0.8342956252285098,
"learning_rate": 3.6675768190810023e-06,
"loss": 0.0563,
"step": 254
},
{
"epoch": 0.8239095315024233,
"grad_norm": 0.8643206085017306,
"learning_rate": 3.6550675760006904e-06,
"loss": 0.0739,
"step": 255
},
{
"epoch": 0.827140549273021,
"grad_norm": 0.7946001867850625,
"learning_rate": 3.642521455851058e-06,
"loss": 0.0714,
"step": 256
},
{
"epoch": 0.8303715670436187,
"grad_norm": 0.8926582386596207,
"learning_rate": 3.629938859183686e-06,
"loss": 0.0713,
"step": 257
},
{
"epoch": 0.8336025848142165,
"grad_norm": 0.7491907200888189,
"learning_rate": 3.6173201877147134e-06,
"loss": 0.068,
"step": 258
},
{
"epoch": 0.8368336025848142,
"grad_norm": 1.1183078136862983,
"learning_rate": 3.6046658443120196e-06,
"loss": 0.1055,
"step": 259
},
{
"epoch": 0.840064620355412,
"grad_norm": 0.7909225982481742,
"learning_rate": 3.5919762329823556e-06,
"loss": 0.0695,
"step": 260
},
{
"epoch": 0.8432956381260097,
"grad_norm": 0.8739075343916631,
"learning_rate": 3.579251758858447e-06,
"loss": 0.069,
"step": 261
},
{
"epoch": 0.8465266558966075,
"grad_norm": 0.7730576890810616,
"learning_rate": 3.566492828186063e-06,
"loss": 0.0666,
"step": 262
},
{
"epoch": 0.8497576736672051,
"grad_norm": 0.9066328657599784,
"learning_rate": 3.5536998483110418e-06,
"loss": 0.0707,
"step": 263
},
{
"epoch": 0.8529886914378029,
"grad_norm": 0.713312105314591,
"learning_rate": 3.5408732276662882e-06,
"loss": 0.0913,
"step": 264
},
{
"epoch": 0.8562197092084006,
"grad_norm": 0.7057871040611122,
"learning_rate": 3.5280133757587343e-06,
"loss": 0.0591,
"step": 265
},
{
"epoch": 0.8594507269789984,
"grad_norm": 1.2427283512085103,
"learning_rate": 3.515120703156264e-06,
"loss": 0.0839,
"step": 266
},
{
"epoch": 0.8626817447495961,
"grad_norm": 0.7653071583973694,
"learning_rate": 3.5021956214746046e-06,
"loss": 0.0645,
"step": 267
},
{
"epoch": 0.8659127625201939,
"grad_norm": 0.6634481800817704,
"learning_rate": 3.4892385433641875e-06,
"loss": 0.0459,
"step": 268
},
{
"epoch": 0.8691437802907916,
"grad_norm": 0.6953364982312946,
"learning_rate": 3.4762498824969726e-06,
"loss": 0.0632,
"step": 269
},
{
"epoch": 0.8723747980613893,
"grad_norm": 0.6532166155768556,
"learning_rate": 3.4632300535532415e-06,
"loss": 0.053,
"step": 270
},
{
"epoch": 0.875605815831987,
"grad_norm": 0.685089930364244,
"learning_rate": 3.450179472208356e-06,
"loss": 0.0586,
"step": 271
},
{
"epoch": 0.8788368336025848,
"grad_norm": 0.79809625541097,
"learning_rate": 3.437098555119493e-06,
"loss": 0.053,
"step": 272
},
{
"epoch": 0.8820678513731826,
"grad_norm": 1.001360047718671,
"learning_rate": 3.4239877199123343e-06,
"loss": 0.0856,
"step": 273
},
{
"epoch": 0.8852988691437803,
"grad_norm": 0.9793506266219179,
"learning_rate": 3.4108473851677408e-06,
"loss": 0.0603,
"step": 274
},
{
"epoch": 0.8885298869143781,
"grad_norm": 1.1216710753897985,
"learning_rate": 3.397677970408384e-06,
"loss": 0.091,
"step": 275
},
{
"epoch": 0.8917609046849758,
"grad_norm": 1.0349146688296054,
"learning_rate": 3.3844798960853533e-06,
"loss": 0.0712,
"step": 276
},
{
"epoch": 0.8949919224555735,
"grad_norm": 1.3307977354919367,
"learning_rate": 3.3712535835647326e-06,
"loss": 0.0617,
"step": 277
},
{
"epoch": 0.8982229402261712,
"grad_norm": 0.7132616053518929,
"learning_rate": 3.357999455114148e-06,
"loss": 0.0659,
"step": 278
},
{
"epoch": 0.901453957996769,
"grad_norm": 0.9273582042526977,
"learning_rate": 3.344717933889289e-06,
"loss": 0.0641,
"step": 279
},
{
"epoch": 0.9046849757673667,
"grad_norm": 0.9302335920003337,
"learning_rate": 3.3314094439203903e-06,
"loss": 0.0626,
"step": 280
},
{
"epoch": 0.9079159935379645,
"grad_norm": 0.45357719437194755,
"learning_rate": 3.3180744100987045e-06,
"loss": 0.0431,
"step": 281
},
{
"epoch": 0.9111470113085622,
"grad_norm": 0.8331062557625248,
"learning_rate": 3.3047132581629297e-06,
"loss": 0.0755,
"step": 282
},
{
"epoch": 0.9143780290791599,
"grad_norm": 0.7394449396136501,
"learning_rate": 3.29132641468562e-06,
"loss": 0.0566,
"step": 283
},
{
"epoch": 0.9176090468497576,
"grad_norm": 0.9737240770489647,
"learning_rate": 3.277914307059566e-06,
"loss": 0.0801,
"step": 284
},
{
"epoch": 0.9208400646203554,
"grad_norm": 0.7702804759383343,
"learning_rate": 3.264477363484151e-06,
"loss": 0.0573,
"step": 285
},
{
"epoch": 0.9240710823909531,
"grad_norm": 0.5835013641966551,
"learning_rate": 3.251016012951678e-06,
"loss": 0.0562,
"step": 286
},
{
"epoch": 0.9273021001615509,
"grad_norm": 0.5371255642799613,
"learning_rate": 3.237530685233673e-06,
"loss": 0.0413,
"step": 287
},
{
"epoch": 0.9305331179321487,
"grad_norm": 0.816748025986031,
"learning_rate": 3.2240218108671683e-06,
"loss": 0.0534,
"step": 288
},
{
"epoch": 0.9337641357027464,
"grad_norm": 0.712574290987934,
"learning_rate": 3.2104898211409546e-06,
"loss": 0.0635,
"step": 289
},
{
"epoch": 0.9369951534733441,
"grad_norm": 0.7596552800433577,
"learning_rate": 3.196935148081808e-06,
"loss": 0.0562,
"step": 290
},
{
"epoch": 0.9402261712439418,
"grad_norm": 0.6394269281528583,
"learning_rate": 3.1833582244407036e-06,
"loss": 0.0569,
"step": 291
},
{
"epoch": 0.9434571890145396,
"grad_norm": 0.8847550945975203,
"learning_rate": 3.1697594836789924e-06,
"loss": 0.0854,
"step": 292
},
{
"epoch": 0.9466882067851373,
"grad_norm": 0.6380616258246674,
"learning_rate": 3.156139359954569e-06,
"loss": 0.0603,
"step": 293
},
{
"epoch": 0.9499192245557351,
"grad_norm": 0.8054064646081707,
"learning_rate": 3.142498288108007e-06,
"loss": 0.0811,
"step": 294
},
{
"epoch": 0.9531502423263328,
"grad_norm": 0.6206438760553665,
"learning_rate": 3.128836703648676e-06,
"loss": 0.0702,
"step": 295
},
{
"epoch": 0.9563812600969306,
"grad_norm": 0.925371456511262,
"learning_rate": 3.1151550427408383e-06,
"loss": 0.0663,
"step": 296
},
{
"epoch": 0.9596122778675282,
"grad_norm": 0.8737367079669791,
"learning_rate": 3.1014537421897222e-06,
"loss": 0.078,
"step": 297
},
{
"epoch": 0.962843295638126,
"grad_norm": 0.8790452308237618,
"learning_rate": 3.0877332394275806e-06,
"loss": 0.0796,
"step": 298
},
{
"epoch": 0.9660743134087237,
"grad_norm": 0.8200476689531796,
"learning_rate": 3.0739939724997205e-06,
"loss": 0.0777,
"step": 299
},
{
"epoch": 0.9693053311793215,
"grad_norm": 0.91054817725634,
"learning_rate": 3.0602363800505198e-06,
"loss": 0.0823,
"step": 300
},
{
"epoch": 0.9725363489499192,
"grad_norm": 1.0055625552384646,
"learning_rate": 3.0464609013094244e-06,
"loss": 0.0723,
"step": 301
},
{
"epoch": 0.975767366720517,
"grad_norm": 0.8948435648737145,
"learning_rate": 3.032667976076923e-06,
"loss": 0.0776,
"step": 302
},
{
"epoch": 0.9789983844911146,
"grad_norm": 0.6947875633713175,
"learning_rate": 3.0188580447105055e-06,
"loss": 0.0576,
"step": 303
},
{
"epoch": 0.9822294022617124,
"grad_norm": 0.710748833707824,
"learning_rate": 3.0050315481106074e-06,
"loss": 0.0606,
"step": 304
},
{
"epoch": 0.9854604200323102,
"grad_norm": 0.4721520789494047,
"learning_rate": 2.9911889277065314e-06,
"loss": 0.043,
"step": 305
},
{
"epoch": 0.9886914378029079,
"grad_norm": 0.7878237728564794,
"learning_rate": 2.977330625442352e-06,
"loss": 0.0832,
"step": 306
},
{
"epoch": 0.9919224555735057,
"grad_norm": 0.818350538492891,
"learning_rate": 2.963457083762809e-06,
"loss": 0.0727,
"step": 307
},
{
"epoch": 0.9951534733441034,
"grad_norm": 0.8240104583346066,
"learning_rate": 2.949568745599182e-06,
"loss": 0.0698,
"step": 308
},
{
"epoch": 0.9983844911147012,
"grad_norm": 0.6820819240767714,
"learning_rate": 2.935666054355146e-06,
"loss": 0.0604,
"step": 309
},
{
"epoch": 1.0,
"grad_norm": 0.6820819240767714,
"learning_rate": 2.921749453892618e-06,
"loss": 0.0453,
"step": 310
},
{
"epoch": 1.0032310177705976,
"grad_norm": 0.8574995005049099,
"learning_rate": 2.9078193885175875e-06,
"loss": 0.0401,
"step": 311
},
{
"epoch": 1.0064620355411955,
"grad_norm": 0.7517375531460555,
"learning_rate": 2.893876302965925e-06,
"loss": 0.0646,
"step": 312
},
{
"epoch": 1.0096930533117932,
"grad_norm": 0.7030688450315203,
"learning_rate": 2.8799206423891895e-06,
"loss": 0.0428,
"step": 313
},
{
"epoch": 1.012924071082391,
"grad_norm": 0.6810197305641746,
"learning_rate": 2.865952852340417e-06,
"loss": 0.0491,
"step": 314
},
{
"epoch": 1.0161550888529887,
"grad_norm": 0.6457251139138377,
"learning_rate": 2.8519733787598887e-06,
"loss": 0.0527,
"step": 315
},
{
"epoch": 1.0193861066235865,
"grad_norm": 0.6750520782194651,
"learning_rate": 2.8379826679609e-06,
"loss": 0.0456,
"step": 316
},
{
"epoch": 1.0226171243941842,
"grad_norm": 0.7287551036819017,
"learning_rate": 2.8239811666155105e-06,
"loss": 0.0703,
"step": 317
},
{
"epoch": 1.0258481421647818,
"grad_norm": 0.5506534914240341,
"learning_rate": 2.8099693217402807e-06,
"loss": 0.0461,
"step": 318
},
{
"epoch": 1.0290791599353797,
"grad_norm": 0.6716945269140621,
"learning_rate": 2.795947580682003e-06,
"loss": 0.0424,
"step": 319
},
{
"epoch": 1.0323101777059773,
"grad_norm": 0.6029067986975312,
"learning_rate": 2.7819163911034175e-06,
"loss": 0.032,
"step": 320
},
{
"epoch": 1.0355411954765752,
"grad_norm": 0.5754461947410837,
"learning_rate": 2.767876200968923e-06,
"loss": 0.0417,
"step": 321
},
{
"epoch": 1.0387722132471728,
"grad_norm": 0.5670203427921292,
"learning_rate": 2.7538274585302707e-06,
"loss": 0.0676,
"step": 322
},
{
"epoch": 1.0420032310177707,
"grad_norm": 0.6371474674240052,
"learning_rate": 2.7397706123122563e-06,
"loss": 0.0401,
"step": 323
},
{
"epoch": 1.0452342487883683,
"grad_norm": 0.633563817351995,
"learning_rate": 2.7257061110984005e-06,
"loss": 0.0507,
"step": 324
},
{
"epoch": 1.048465266558966,
"grad_norm": 0.455254402627076,
"learning_rate": 2.7116344039166192e-06,
"loss": 0.0346,
"step": 325
},
{
"epoch": 1.0516962843295639,
"grad_norm": 0.46808216871215624,
"learning_rate": 2.6975559400248876e-06,
"loss": 0.0447,
"step": 326
},
{
"epoch": 1.0549273021001615,
"grad_norm": 0.5421960059682478,
"learning_rate": 2.683471168896899e-06,
"loss": 0.0412,
"step": 327
},
{
"epoch": 1.0581583198707594,
"grad_norm": 0.6365474014428788,
"learning_rate": 2.6693805402077123e-06,
"loss": 0.0482,
"step": 328
},
{
"epoch": 1.061389337641357,
"grad_norm": 0.5544336402631056,
"learning_rate": 2.6552845038193977e-06,
"loss": 0.0392,
"step": 329
},
{
"epoch": 1.0646203554119547,
"grad_norm": 0.49713090653008446,
"learning_rate": 2.641183509766675e-06,
"loss": 0.028,
"step": 330
},
{
"epoch": 1.0678513731825525,
"grad_norm": 0.42542651226656086,
"learning_rate": 2.627078008242541e-06,
"loss": 0.0361,
"step": 331
},
{
"epoch": 1.0710823909531502,
"grad_norm": 0.5295471778809274,
"learning_rate": 2.6129684495839013e-06,
"loss": 0.0657,
"step": 332
},
{
"epoch": 1.074313408723748,
"grad_norm": 0.5910271655601097,
"learning_rate": 2.5988552842571903e-06,
"loss": 0.0447,
"step": 333
},
{
"epoch": 1.0775444264943457,
"grad_norm": 0.5391865465723893,
"learning_rate": 2.5847389628439905e-06,
"loss": 0.0468,
"step": 334
},
{
"epoch": 1.0807754442649435,
"grad_norm": 0.7945092160241262,
"learning_rate": 2.570619936026647e-06,
"loss": 0.0526,
"step": 335
},
{
"epoch": 1.0840064620355412,
"grad_norm": 0.6384237762747144,
"learning_rate": 2.5564986545738767e-06,
"loss": 0.0435,
"step": 336
},
{
"epoch": 1.0872374798061388,
"grad_norm": 0.6166409813836169,
"learning_rate": 2.542375569326382e-06,
"loss": 0.0405,
"step": 337
},
{
"epoch": 1.0904684975767367,
"grad_norm": 0.5773480639364901,
"learning_rate": 2.52825113118245e-06,
"loss": 0.0534,
"step": 338
},
{
"epoch": 1.0936995153473343,
"grad_norm": 0.5202558074438228,
"learning_rate": 2.514125791083563e-06,
"loss": 0.0507,
"step": 339
},
{
"epoch": 1.0969305331179322,
"grad_norm": 0.6263121460011797,
"learning_rate": 2.5e-06,
"loss": 0.0512,
"step": 340
},
{
"epoch": 1.1001615508885298,
"grad_norm": 0.7123374131643287,
"learning_rate": 2.485874208916438e-06,
"loss": 0.0332,
"step": 341
},
{
"epoch": 1.1033925686591277,
"grad_norm": 0.4202479639460712,
"learning_rate": 2.4717488688175513e-06,
"loss": 0.0407,
"step": 342
},
{
"epoch": 1.1066235864297254,
"grad_norm": 0.642912112013506,
"learning_rate": 2.4576244306736193e-06,
"loss": 0.0454,
"step": 343
},
{
"epoch": 1.109854604200323,
"grad_norm": 0.5278433369392868,
"learning_rate": 2.4435013454261246e-06,
"loss": 0.0454,
"step": 344
},
{
"epoch": 1.1130856219709209,
"grad_norm": 0.49367590472244555,
"learning_rate": 2.4293800639733537e-06,
"loss": 0.0425,
"step": 345
},
{
"epoch": 1.1163166397415185,
"grad_norm": 0.663577683371772,
"learning_rate": 2.4152610371560095e-06,
"loss": 0.0502,
"step": 346
},
{
"epoch": 1.1195476575121164,
"grad_norm": 0.6258554932784628,
"learning_rate": 2.40114471574281e-06,
"loss": 0.0503,
"step": 347
},
{
"epoch": 1.122778675282714,
"grad_norm": 0.6117692229579965,
"learning_rate": 2.3870315504160995e-06,
"loss": 0.0357,
"step": 348
},
{
"epoch": 1.1260096930533119,
"grad_norm": 0.698709357123754,
"learning_rate": 2.3729219917574597e-06,
"loss": 0.0398,
"step": 349
},
{
"epoch": 1.1292407108239095,
"grad_norm": 0.5509108322599596,
"learning_rate": 2.358816490233326e-06,
"loss": 0.0504,
"step": 350
},
{
"epoch": 1.1324717285945072,
"grad_norm": 0.5944772383426618,
"learning_rate": 2.3447154961806027e-06,
"loss": 0.0417,
"step": 351
},
{
"epoch": 1.135702746365105,
"grad_norm": 0.7597386477794142,
"learning_rate": 2.330619459792289e-06,
"loss": 0.0469,
"step": 352
},
{
"epoch": 1.1389337641357027,
"grad_norm": 0.5719437487716487,
"learning_rate": 2.3165288311031024e-06,
"loss": 0.0294,
"step": 353
},
{
"epoch": 1.1421647819063006,
"grad_norm": 0.6213273588952939,
"learning_rate": 2.3024440599751132e-06,
"loss": 0.0384,
"step": 354
},
{
"epoch": 1.1453957996768982,
"grad_norm": 0.6354291435615703,
"learning_rate": 2.288365596083381e-06,
"loss": 0.0349,
"step": 355
},
{
"epoch": 1.148626817447496,
"grad_norm": 0.7486710035630336,
"learning_rate": 2.274293888901599e-06,
"loss": 0.0406,
"step": 356
},
{
"epoch": 1.1518578352180937,
"grad_norm": 0.46539243165120686,
"learning_rate": 2.260229387687744e-06,
"loss": 0.0406,
"step": 357
},
{
"epoch": 1.1550888529886914,
"grad_norm": 0.8582980219163947,
"learning_rate": 2.24617254146973e-06,
"loss": 0.0751,
"step": 358
},
{
"epoch": 1.1583198707592892,
"grad_norm": 0.5134947351957787,
"learning_rate": 2.232123799031078e-06,
"loss": 0.0257,
"step": 359
},
{
"epoch": 1.1615508885298869,
"grad_norm": 0.6414286514892072,
"learning_rate": 2.2180836088965833e-06,
"loss": 0.053,
"step": 360
},
{
"epoch": 1.1647819063004847,
"grad_norm": 0.6893327762289948,
"learning_rate": 2.204052419317998e-06,
"loss": 0.0518,
"step": 361
},
{
"epoch": 1.1680129240710824,
"grad_norm": 0.6315851441021518,
"learning_rate": 2.19003067825972e-06,
"loss": 0.0311,
"step": 362
},
{
"epoch": 1.1712439418416802,
"grad_norm": 0.506389589217495,
"learning_rate": 2.1760188333844907e-06,
"loss": 0.03,
"step": 363
},
{
"epoch": 1.1744749596122779,
"grad_norm": 0.48074703533770824,
"learning_rate": 2.1620173320391007e-06,
"loss": 0.0575,
"step": 364
},
{
"epoch": 1.1777059773828755,
"grad_norm": 0.7338858093688776,
"learning_rate": 2.1480266212401117e-06,
"loss": 0.0652,
"step": 365
},
{
"epoch": 1.1809369951534734,
"grad_norm": 0.5215250199335827,
"learning_rate": 2.1340471476595836e-06,
"loss": 0.0404,
"step": 366
},
{
"epoch": 1.184168012924071,
"grad_norm": 0.5101940305735346,
"learning_rate": 2.1200793576108105e-06,
"loss": 0.04,
"step": 367
},
{
"epoch": 1.187399030694669,
"grad_norm": 0.6072945357387388,
"learning_rate": 2.1061236970340756e-06,
"loss": 0.0471,
"step": 368
},
{
"epoch": 1.1906300484652665,
"grad_norm": 0.6485948720338923,
"learning_rate": 2.0921806114824134e-06,
"loss": 0.0402,
"step": 369
},
{
"epoch": 1.1938610662358644,
"grad_norm": 0.5618119097496929,
"learning_rate": 2.0782505461073822e-06,
"loss": 0.0309,
"step": 370
},
{
"epoch": 1.197092084006462,
"grad_norm": 0.5710364970799129,
"learning_rate": 2.0643339456448547e-06,
"loss": 0.0534,
"step": 371
},
{
"epoch": 1.2003231017770597,
"grad_norm": 0.5166825620149433,
"learning_rate": 2.0504312544008193e-06,
"loss": 0.0443,
"step": 372
},
{
"epoch": 1.2035541195476576,
"grad_norm": 0.6582279195349342,
"learning_rate": 2.0365429162371922e-06,
"loss": 0.042,
"step": 373
},
{
"epoch": 1.2067851373182552,
"grad_norm": 0.5628923740662667,
"learning_rate": 2.0226693745576494e-06,
"loss": 0.0338,
"step": 374
},
{
"epoch": 1.210016155088853,
"grad_norm": 0.5269299240421488,
"learning_rate": 2.008811072293469e-06,
"loss": 0.0333,
"step": 375
},
{
"epoch": 1.2132471728594507,
"grad_norm": 0.37904232204217875,
"learning_rate": 1.9949684518893926e-06,
"loss": 0.0321,
"step": 376
},
{
"epoch": 1.2164781906300486,
"grad_norm": 0.5084850707267878,
"learning_rate": 1.9811419552894953e-06,
"loss": 0.0408,
"step": 377
},
{
"epoch": 1.2197092084006462,
"grad_norm": 0.4746627443509293,
"learning_rate": 1.9673320239230783e-06,
"loss": 0.0359,
"step": 378
},
{
"epoch": 1.2229402261712439,
"grad_norm": 0.6346489110095358,
"learning_rate": 1.9535390986905764e-06,
"loss": 0.0505,
"step": 379
},
{
"epoch": 1.2261712439418417,
"grad_norm": 0.6162762793044799,
"learning_rate": 1.939763619949481e-06,
"loss": 0.0327,
"step": 380
},
{
"epoch": 1.2294022617124394,
"grad_norm": 0.7282286592219928,
"learning_rate": 1.92600602750028e-06,
"loss": 0.0664,
"step": 381
},
{
"epoch": 1.2326332794830372,
"grad_norm": 0.687246463772339,
"learning_rate": 1.9122667605724202e-06,
"loss": 0.0485,
"step": 382
},
{
"epoch": 1.235864297253635,
"grad_norm": 0.6284714240874483,
"learning_rate": 1.8985462578102786e-06,
"loss": 0.0505,
"step": 383
},
{
"epoch": 1.2390953150242328,
"grad_norm": 0.6820536891979064,
"learning_rate": 1.884844957259163e-06,
"loss": 0.0311,
"step": 384
},
{
"epoch": 1.2423263327948304,
"grad_norm": 0.549881437893592,
"learning_rate": 1.8711632963513237e-06,
"loss": 0.0375,
"step": 385
},
{
"epoch": 1.245557350565428,
"grad_norm": 0.6650268250589539,
"learning_rate": 1.857501711891993e-06,
"loss": 0.054,
"step": 386
},
{
"epoch": 1.248788368336026,
"grad_norm": 0.5113954409802595,
"learning_rate": 1.8438606400454312e-06,
"loss": 0.031,
"step": 387
},
{
"epoch": 1.2520193861066236,
"grad_norm": 0.5332778481364839,
"learning_rate": 1.830240516321008e-06,
"loss": 0.0463,
"step": 388
},
{
"epoch": 1.2552504038772212,
"grad_norm": 0.5928101900055703,
"learning_rate": 1.8166417755592975e-06,
"loss": 0.0353,
"step": 389
},
{
"epoch": 1.258481421647819,
"grad_norm": 0.4719442237818212,
"learning_rate": 1.8030648519181926e-06,
"loss": 0.037,
"step": 390
},
{
"epoch": 1.261712439418417,
"grad_norm": 0.6879135991203472,
"learning_rate": 1.789510178859046e-06,
"loss": 0.0523,
"step": 391
},
{
"epoch": 1.2649434571890146,
"grad_norm": 0.6862575200798275,
"learning_rate": 1.7759781891328321e-06,
"loss": 0.0344,
"step": 392
},
{
"epoch": 1.2681744749596122,
"grad_norm": 0.4157263515958759,
"learning_rate": 1.762469314766328e-06,
"loss": 0.0373,
"step": 393
},
{
"epoch": 1.27140549273021,
"grad_norm": 0.6888877331536797,
"learning_rate": 1.7489839870483236e-06,
"loss": 0.0493,
"step": 394
},
{
"epoch": 1.2746365105008077,
"grad_norm": 0.7225121433676083,
"learning_rate": 1.7355226365158489e-06,
"loss": 0.0685,
"step": 395
},
{
"epoch": 1.2778675282714054,
"grad_norm": 0.7380932165691942,
"learning_rate": 1.7220856929404342e-06,
"loss": 0.068,
"step": 396
},
{
"epoch": 1.2810985460420032,
"grad_norm": 0.8373452926500001,
"learning_rate": 1.7086735853143803e-06,
"loss": 0.0263,
"step": 397
},
{
"epoch": 1.284329563812601,
"grad_norm": 0.4057578810529795,
"learning_rate": 1.6952867418370707e-06,
"loss": 0.0435,
"step": 398
},
{
"epoch": 1.2875605815831987,
"grad_norm": 0.8172156898788309,
"learning_rate": 1.6819255899012963e-06,
"loss": 0.0521,
"step": 399
},
{
"epoch": 1.2907915993537964,
"grad_norm": 0.5540166018870007,
"learning_rate": 1.6685905560796101e-06,
"loss": 0.0301,
"step": 400
},
{
"epoch": 1.2940226171243943,
"grad_norm": 0.5152650366438487,
"learning_rate": 1.6552820661107119e-06,
"loss": 0.0357,
"step": 401
},
{
"epoch": 1.297253634894992,
"grad_norm": 0.5108482274841849,
"learning_rate": 1.6420005448858522e-06,
"loss": 0.0431,
"step": 402
},
{
"epoch": 1.3004846526655895,
"grad_norm": 0.4240316233127095,
"learning_rate": 1.6287464164352684e-06,
"loss": 0.0342,
"step": 403
},
{
"epoch": 1.3037156704361874,
"grad_norm": 0.5910340203210401,
"learning_rate": 1.6155201039146478e-06,
"loss": 0.0466,
"step": 404
},
{
"epoch": 1.306946688206785,
"grad_norm": 0.4716663558161469,
"learning_rate": 1.6023220295916162e-06,
"loss": 0.0247,
"step": 405
},
{
"epoch": 1.310177705977383,
"grad_norm": 0.5348908739290964,
"learning_rate": 1.5891526148322594e-06,
"loss": 0.0448,
"step": 406
},
{
"epoch": 1.3134087237479806,
"grad_norm": 0.7405657033049722,
"learning_rate": 1.576012280087666e-06,
"loss": 0.0425,
"step": 407
},
{
"epoch": 1.3166397415185784,
"grad_norm": 0.5749817118431164,
"learning_rate": 1.562901444880508e-06,
"loss": 0.0415,
"step": 408
},
{
"epoch": 1.319870759289176,
"grad_norm": 0.659084728824521,
"learning_rate": 1.5498205277916444e-06,
"loss": 0.0525,
"step": 409
},
{
"epoch": 1.3231017770597737,
"grad_norm": 0.4660035971953124,
"learning_rate": 1.5367699464467596e-06,
"loss": 0.0352,
"step": 410
},
{
"epoch": 1.3263327948303716,
"grad_norm": 0.6397348475129173,
"learning_rate": 1.523750117503028e-06,
"loss": 0.0456,
"step": 411
},
{
"epoch": 1.3295638126009692,
"grad_norm": 0.6385407976586193,
"learning_rate": 1.5107614566358136e-06,
"loss": 0.0562,
"step": 412
},
{
"epoch": 1.332794830371567,
"grad_norm": 0.5922977287717205,
"learning_rate": 1.4978043785253964e-06,
"loss": 0.0333,
"step": 413
},
{
"epoch": 1.3360258481421647,
"grad_norm": 0.47241930141595956,
"learning_rate": 1.4848792968437376e-06,
"loss": 0.0311,
"step": 414
},
{
"epoch": 1.3392568659127626,
"grad_norm": 0.43287945232089914,
"learning_rate": 1.4719866242412661e-06,
"loss": 0.0384,
"step": 415
},
{
"epoch": 1.3424878836833603,
"grad_norm": 0.5476421812276494,
"learning_rate": 1.4591267723337122e-06,
"loss": 0.0285,
"step": 416
},
{
"epoch": 1.345718901453958,
"grad_norm": 0.6654675392000585,
"learning_rate": 1.4463001516889597e-06,
"loss": 0.0546,
"step": 417
},
{
"epoch": 1.3489499192245558,
"grad_norm": 0.5265404309231011,
"learning_rate": 1.4335071718139379e-06,
"loss": 0.0563,
"step": 418
},
{
"epoch": 1.3521809369951534,
"grad_norm": 0.7273168490119778,
"learning_rate": 1.4207482411415532e-06,
"loss": 0.0319,
"step": 419
},
{
"epoch": 1.3554119547657513,
"grad_norm": 0.5515284785247014,
"learning_rate": 1.4080237670176456e-06,
"loss": 0.0465,
"step": 420
},
{
"epoch": 1.358642972536349,
"grad_norm": 0.6815587580127385,
"learning_rate": 1.395334155687981e-06,
"loss": 0.0396,
"step": 421
},
{
"epoch": 1.3618739903069468,
"grad_norm": 0.5046530996438253,
"learning_rate": 1.382679812285287e-06,
"loss": 0.0359,
"step": 422
},
{
"epoch": 1.3651050080775444,
"grad_norm": 0.636616018716514,
"learning_rate": 1.3700611408163158e-06,
"loss": 0.0364,
"step": 423
},
{
"epoch": 1.368336025848142,
"grad_norm": 0.8016482324734947,
"learning_rate": 1.357478544148943e-06,
"loss": 0.0548,
"step": 424
},
{
"epoch": 1.37156704361874,
"grad_norm": 0.7335385001009505,
"learning_rate": 1.3449324239993094e-06,
"loss": 0.0307,
"step": 425
},
{
"epoch": 1.3747980613893376,
"grad_norm": 0.5405071350061603,
"learning_rate": 1.3324231809189985e-06,
"loss": 0.0435,
"step": 426
},
{
"epoch": 1.3780290791599354,
"grad_norm": 0.5968950584538801,
"learning_rate": 1.3199512142822374e-06,
"loss": 0.0442,
"step": 427
},
{
"epoch": 1.381260096930533,
"grad_norm": 0.5586370191823509,
"learning_rate": 1.3075169222731573e-06,
"loss": 0.034,
"step": 428
},
{
"epoch": 1.384491114701131,
"grad_norm": 0.6462203526051947,
"learning_rate": 1.2951207018730772e-06,
"loss": 0.0591,
"step": 429
},
{
"epoch": 1.3877221324717286,
"grad_norm": 0.4892957539044203,
"learning_rate": 1.2827629488478254e-06,
"loss": 0.0345,
"step": 430
},
{
"epoch": 1.3909531502423262,
"grad_norm": 0.543453607383533,
"learning_rate": 1.270444057735113e-06,
"loss": 0.0275,
"step": 431
},
{
"epoch": 1.394184168012924,
"grad_norm": 0.6522770107038732,
"learning_rate": 1.25816442183193e-06,
"loss": 0.0446,
"step": 432
},
{
"epoch": 1.3974151857835218,
"grad_norm": 0.54367348046309,
"learning_rate": 1.2459244331819912e-06,
"loss": 0.0381,
"step": 433
},
{
"epoch": 1.4006462035541196,
"grad_norm": 0.6775444066593574,
"learning_rate": 1.2337244825632217e-06,
"loss": 0.0507,
"step": 434
},
{
"epoch": 1.4038772213247173,
"grad_norm": 0.7125046807656238,
"learning_rate": 1.2215649594752782e-06,
"loss": 0.0564,
"step": 435
},
{
"epoch": 1.4071082390953151,
"grad_norm": 0.6675281925628074,
"learning_rate": 1.2094462521271156e-06,
"loss": 0.0337,
"step": 436
},
{
"epoch": 1.4103392568659128,
"grad_norm": 0.5401273336581622,
"learning_rate": 1.197368747424592e-06,
"loss": 0.0398,
"step": 437
},
{
"epoch": 1.4135702746365104,
"grad_norm": 0.7065350638753161,
"learning_rate": 1.1853328309581139e-06,
"loss": 0.0435,
"step": 438
},
{
"epoch": 1.4168012924071083,
"grad_norm": 0.7095730881777876,
"learning_rate": 1.17333888699033e-06,
"loss": 0.0297,
"step": 439
},
{
"epoch": 1.420032310177706,
"grad_norm": 0.8419216282460489,
"learning_rate": 1.161387298443863e-06,
"loss": 0.0516,
"step": 440
},
{
"epoch": 1.4232633279483038,
"grad_norm": 0.6570671193798376,
"learning_rate": 1.149478446889077e-06,
"loss": 0.0499,
"step": 441
},
{
"epoch": 1.4264943457189014,
"grad_norm": 0.4516949531869673,
"learning_rate": 1.1376127125319065e-06,
"loss": 0.0332,
"step": 442
},
{
"epoch": 1.4297253634894993,
"grad_norm": 0.6804270748390296,
"learning_rate": 1.125790474201708e-06,
"loss": 0.0554,
"step": 443
},
{
"epoch": 1.432956381260097,
"grad_norm": 0.6237302316612817,
"learning_rate": 1.1140121093391736e-06,
"loss": 0.0406,
"step": 444
},
{
"epoch": 1.4361873990306946,
"grad_norm": 0.5492979077070937,
"learning_rate": 1.1022779939842704e-06,
"loss": 0.0359,
"step": 445
},
{
"epoch": 1.4394184168012925,
"grad_norm": 0.5435741772086434,
"learning_rate": 1.0905885027642484e-06,
"loss": 0.0524,
"step": 446
},
{
"epoch": 1.44264943457189,
"grad_norm": 0.6801306673227177,
"learning_rate": 1.0789440088816666e-06,
"loss": 0.0385,
"step": 447
},
{
"epoch": 1.445880452342488,
"grad_norm": 0.4434075942240935,
"learning_rate": 1.0673448841024875e-06,
"loss": 0.048,
"step": 448
},
{
"epoch": 1.4491114701130856,
"grad_norm": 0.6540515881701509,
"learning_rate": 1.0557914987442048e-06,
"loss": 0.0471,
"step": 449
},
{
"epoch": 1.4523424878836835,
"grad_norm": 0.6361305209643866,
"learning_rate": 1.0442842216640168e-06,
"loss": 0.0388,
"step": 450
},
{
"epoch": 1.4555735056542811,
"grad_norm": 0.5896519670655204,
"learning_rate": 1.0328234202470574e-06,
"loss": 0.0466,
"step": 451
},
{
"epoch": 1.4588045234248788,
"grad_norm": 0.49886003535682916,
"learning_rate": 1.021409460394663e-06,
"loss": 0.024,
"step": 452
},
{
"epoch": 1.4620355411954766,
"grad_norm": 0.4865979088073283,
"learning_rate": 1.0100427065126874e-06,
"loss": 0.0461,
"step": 453
},
{
"epoch": 1.4652665589660743,
"grad_norm": 0.6304447558055624,
"learning_rate": 9.987235214998741e-07,
"loss": 0.0303,
"step": 454
},
{
"epoch": 1.468497576736672,
"grad_norm": 0.5998572755132743,
"learning_rate": 9.87452266736266e-07,
"loss": 0.0413,
"step": 455
},
{
"epoch": 1.4717285945072698,
"grad_norm": 0.475261837629553,
"learning_rate": 9.762293020716696e-07,
"loss": 0.051,
"step": 456
},
{
"epoch": 1.4749596122778676,
"grad_norm": 0.5717008410694144,
"learning_rate": 9.650549858141646e-07,
"loss": 0.0325,
"step": 457
},
{
"epoch": 1.4781906300484653,
"grad_norm": 0.5180031308985593,
"learning_rate": 9.53929674718668e-07,
"loss": 0.0296,
"step": 458
},
{
"epoch": 1.481421647819063,
"grad_norm": 0.5815736580452745,
"learning_rate": 9.428537239755381e-07,
"loss": 0.0462,
"step": 459
},
{
"epoch": 1.4846526655896608,
"grad_norm": 0.6313068151131468,
"learning_rate": 9.318274871992408e-07,
"loss": 0.0726,
"step": 460
},
{
"epoch": 1.4878836833602584,
"grad_norm": 1.221847458822884,
"learning_rate": 9.208513164170579e-07,
"loss": 0.0405,
"step": 461
},
{
"epoch": 1.491114701130856,
"grad_norm": 0.5310642493149078,
"learning_rate": 9.099255620578451e-07,
"loss": 0.0296,
"step": 462
},
{
"epoch": 1.494345718901454,
"grad_norm": 0.5177627615454572,
"learning_rate": 8.990505729408494e-07,
"loss": 0.0651,
"step": 463
},
{
"epoch": 1.4975767366720518,
"grad_norm": 0.6991644226087359,
"learning_rate": 8.882266962645695e-07,
"loss": 0.0403,
"step": 464
},
{
"epoch": 1.5008077544426495,
"grad_norm": 0.5744181895807364,
"learning_rate": 8.774542775956679e-07,
"loss": 0.0512,
"step": 465
},
{
"epoch": 1.504038772213247,
"grad_norm": 0.5233586593423929,
"learning_rate": 8.667336608579488e-07,
"loss": 0.0408,
"step": 466
},
{
"epoch": 1.507269789983845,
"grad_norm": 0.569083608134158,
"learning_rate": 8.560651883213633e-07,
"loss": 0.0325,
"step": 467
},
{
"epoch": 1.5105008077544426,
"grad_norm": 0.5065403358211885,
"learning_rate": 8.454492005910942e-07,
"loss": 0.039,
"step": 468
},
{
"epoch": 1.5137318255250403,
"grad_norm": 0.5538367181328561,
"learning_rate": 8.34886036596676e-07,
"loss": 0.0461,
"step": 469
},
{
"epoch": 1.5169628432956381,
"grad_norm": 0.5214647871849324,
"learning_rate": 8.243760335811734e-07,
"loss": 0.0284,
"step": 470
},
{
"epoch": 1.520193861066236,
"grad_norm": 0.6202345630142632,
"learning_rate": 8.139195270904182e-07,
"loss": 0.0448,
"step": 471
},
{
"epoch": 1.5234248788368336,
"grad_norm": 0.596159859282425,
"learning_rate": 8.035168509622948e-07,
"loss": 0.0743,
"step": 472
},
{
"epoch": 1.5266558966074313,
"grad_norm": 0.5888141535757658,
"learning_rate": 7.931683373160789e-07,
"loss": 0.0379,
"step": 473
},
{
"epoch": 1.5298869143780292,
"grad_norm": 0.7088883925452755,
"learning_rate": 7.828743165418393e-07,
"loss": 0.0676,
"step": 474
},
{
"epoch": 1.5331179321486268,
"grad_norm": 0.7275077844858919,
"learning_rate": 7.726351172898869e-07,
"loss": 0.0372,
"step": 475
},
{
"epoch": 1.5363489499192244,
"grad_norm": 0.4568256055898612,
"learning_rate": 7.624510664602819e-07,
"loss": 0.037,
"step": 476
},
{
"epoch": 1.5395799676898223,
"grad_norm": 0.5900479097481395,
"learning_rate": 7.523224891923983e-07,
"loss": 0.0335,
"step": 477
},
{
"epoch": 1.5428109854604202,
"grad_norm": 0.5945624497711766,
"learning_rate": 7.422497088545436e-07,
"loss": 0.0473,
"step": 478
},
{
"epoch": 1.5460420032310178,
"grad_norm": 0.6053475731541442,
"learning_rate": 7.322330470336314e-07,
"loss": 0.0355,
"step": 479
},
{
"epoch": 1.5492730210016155,
"grad_norm": 0.5924482508513196,
"learning_rate": 7.222728235249196e-07,
"loss": 0.0395,
"step": 480
},
{
"epoch": 1.5525040387722133,
"grad_norm": 0.6450291567303781,
"learning_rate": 7.123693563217978e-07,
"loss": 0.0597,
"step": 481
},
{
"epoch": 1.555735056542811,
"grad_norm": 0.8328909187805514,
"learning_rate": 7.025229616056326e-07,
"loss": 0.0472,
"step": 482
},
{
"epoch": 1.5589660743134086,
"grad_norm": 0.6806153349540757,
"learning_rate": 6.927339537356778e-07,
"loss": 0.0484,
"step": 483
},
{
"epoch": 1.5621970920840065,
"grad_norm": 0.6602101596852485,
"learning_rate": 6.830026452390354e-07,
"loss": 0.0477,
"step": 484
},
{
"epoch": 1.5654281098546043,
"grad_norm": 0.5937543973198223,
"learning_rate": 6.733293468006774e-07,
"loss": 0.042,
"step": 485
},
{
"epoch": 1.568659127625202,
"grad_norm": 0.6210912711298314,
"learning_rate": 6.637143672535282e-07,
"loss": 0.0411,
"step": 486
},
{
"epoch": 1.5718901453957996,
"grad_norm": 0.563220202424066,
"learning_rate": 6.541580135686046e-07,
"loss": 0.0287,
"step": 487
},
{
"epoch": 1.5751211631663975,
"grad_norm": 0.4909721841694075,
"learning_rate": 6.446605908452122e-07,
"loss": 0.0341,
"step": 488
},
{
"epoch": 1.5783521809369951,
"grad_norm": 0.6948201563174986,
"learning_rate": 6.352224023012096e-07,
"loss": 0.046,
"step": 489
},
{
"epoch": 1.5815831987075928,
"grad_norm": 0.46668729207255394,
"learning_rate": 6.258437492633254e-07,
"loss": 0.0597,
"step": 490
},
{
"epoch": 1.5848142164781907,
"grad_norm": 0.7349838782964371,
"learning_rate": 6.165249311575361e-07,
"loss": 0.047,
"step": 491
},
{
"epoch": 1.5880452342487885,
"grad_norm": 0.6329932642072914,
"learning_rate": 6.072662454995101e-07,
"loss": 0.0402,
"step": 492
},
{
"epoch": 1.5912762520193862,
"grad_norm": 1.1075017060647618,
"learning_rate": 5.980679878851076e-07,
"loss": 0.0644,
"step": 493
},
{
"epoch": 1.5945072697899838,
"grad_norm": 0.6709210038908787,
"learning_rate": 5.889304519809402e-07,
"loss": 0.0405,
"step": 494
},
{
"epoch": 1.5977382875605817,
"grad_norm": 0.6039830090199084,
"learning_rate": 5.798539295150027e-07,
"loss": 0.0515,
"step": 495
},
{
"epoch": 1.6009693053311793,
"grad_norm": 0.47306097411821496,
"learning_rate": 5.708387102673507e-07,
"loss": 0.0452,
"step": 496
},
{
"epoch": 1.604200323101777,
"grad_norm": 0.7299041964334542,
"learning_rate": 5.618850820608548e-07,
"loss": 0.0386,
"step": 497
},
{
"epoch": 1.6074313408723748,
"grad_norm": 0.527708763111458,
"learning_rate": 5.529933307520102e-07,
"loss": 0.037,
"step": 498
},
{
"epoch": 1.6106623586429727,
"grad_norm": 0.6276271955216034,
"learning_rate": 5.441637402218077e-07,
"loss": 0.0566,
"step": 499
},
{
"epoch": 1.6138933764135701,
"grad_norm": 0.5498853546865076,
"learning_rate": 5.353965923666743e-07,
"loss": 0.0373,
"step": 500
},
{
"epoch": 1.617124394184168,
"grad_norm": 0.6058328263346706,
"learning_rate": 5.26692167089472e-07,
"loss": 0.0269,
"step": 501
},
{
"epoch": 1.6203554119547658,
"grad_norm": 0.4158541379346223,
"learning_rate": 5.180507422905585e-07,
"loss": 0.0393,
"step": 502
},
{
"epoch": 1.6235864297253635,
"grad_norm": 0.5063280186973653,
"learning_rate": 5.094725938589193e-07,
"loss": 0.0299,
"step": 503
},
{
"epoch": 1.6268174474959611,
"grad_norm": 0.42841992008265245,
"learning_rate": 5.009579956633578e-07,
"loss": 0.0301,
"step": 504
},
{
"epoch": 1.630048465266559,
"grad_norm": 0.7945718206145054,
"learning_rate": 4.925072195437511e-07,
"loss": 0.0488,
"step": 505
},
{
"epoch": 1.6332794830371569,
"grad_norm": 0.507794598401741,
"learning_rate": 4.841205353023715e-07,
"loss": 0.0617,
"step": 506
},
{
"epoch": 1.6365105008077543,
"grad_norm": 0.5915868711150505,
"learning_rate": 4.757982106952735e-07,
"loss": 0.0357,
"step": 507
},
{
"epoch": 1.6397415185783522,
"grad_norm": 0.5886571451007635,
"learning_rate": 4.6754051142374275e-07,
"loss": 0.0383,
"step": 508
},
{
"epoch": 1.64297253634895,
"grad_norm": 0.7114424474339083,
"learning_rate": 4.5934770112581713e-07,
"loss": 0.0299,
"step": 509
},
{
"epoch": 1.6462035541195477,
"grad_norm": 0.4631064089704488,
"learning_rate": 4.512200413678672e-07,
"loss": 0.04,
"step": 510
},
{
"epoch": 1.6494345718901453,
"grad_norm": 0.4403449444039548,
"learning_rate": 4.4315779163624476e-07,
"loss": 0.0329,
"step": 511
},
{
"epoch": 1.6526655896607432,
"grad_norm": 0.5420749756119222,
"learning_rate": 4.351612093290006e-07,
"loss": 0.0423,
"step": 512
},
{
"epoch": 1.655896607431341,
"grad_norm": 0.6010914237823689,
"learning_rate": 4.2723054974766585e-07,
"loss": 0.0381,
"step": 513
},
{
"epoch": 1.6591276252019385,
"grad_norm": 0.7377468114121554,
"learning_rate": 4.1936606608909887e-07,
"loss": 0.0515,
"step": 514
},
{
"epoch": 1.6623586429725363,
"grad_norm": 0.3875573660249705,
"learning_rate": 4.115680094374075e-07,
"loss": 0.0375,
"step": 515
},
{
"epoch": 1.6655896607431342,
"grad_norm": 0.605903732271668,
"learning_rate": 4.038366287559245e-07,
"loss": 0.0359,
"step": 516
},
{
"epoch": 1.6688206785137318,
"grad_norm": 0.5676552999273701,
"learning_rate": 3.961721708792662e-07,
"loss": 0.0352,
"step": 517
},
{
"epoch": 1.6720516962843295,
"grad_norm": 0.49258629677672283,
"learning_rate": 3.8857488050544903e-07,
"loss": 0.0427,
"step": 518
},
{
"epoch": 1.6752827140549273,
"grad_norm": 0.6179633227247718,
"learning_rate": 3.8104500018807806e-07,
"loss": 0.0436,
"step": 519
},
{
"epoch": 1.678513731825525,
"grad_norm": 0.6093007583811607,
"learning_rate": 3.7358277032860016e-07,
"loss": 0.0285,
"step": 520
},
{
"epoch": 1.6817447495961226,
"grad_norm": 0.5650933346174037,
"learning_rate": 3.6618842916863377e-07,
"loss": 0.062,
"step": 521
},
{
"epoch": 1.6849757673667205,
"grad_norm": 0.938334017983649,
"learning_rate": 3.5886221278236045e-07,
"loss": 0.0533,
"step": 522
},
{
"epoch": 1.6882067851373184,
"grad_norm": 0.5004878719352018,
"learning_rate": 3.5160435506898514e-07,
"loss": 0.0357,
"step": 523
},
{
"epoch": 1.691437802907916,
"grad_norm": 0.5331645289198377,
"learning_rate": 3.4441508774527345e-07,
"loss": 0.049,
"step": 524
},
{
"epoch": 1.6946688206785137,
"grad_norm": 0.6457764842086132,
"learning_rate": 3.3729464033815077e-07,
"loss": 0.0428,
"step": 525
},
{
"epoch": 1.6978998384491115,
"grad_norm": 0.5850353162840484,
"learning_rate": 3.3024324017737555e-07,
"loss": 0.0353,
"step": 526
},
{
"epoch": 1.7011308562197092,
"grad_norm": 0.5309296730801445,
"learning_rate": 3.232611123882809e-07,
"loss": 0.0423,
"step": 527
},
{
"epoch": 1.7043618739903068,
"grad_norm": 0.5673210523979683,
"learning_rate": 3.163484798845862e-07,
"loss": 0.0426,
"step": 528
},
{
"epoch": 1.7075928917609047,
"grad_norm": 0.6324074485656012,
"learning_rate": 3.0950556336128255e-07,
"loss": 0.0306,
"step": 529
},
{
"epoch": 1.7108239095315025,
"grad_norm": 0.44362940233898746,
"learning_rate": 3.0273258128758585e-07,
"loss": 0.0356,
"step": 530
},
{
"epoch": 1.7140549273021002,
"grad_norm": 0.6204310781031833,
"learning_rate": 2.960297498999601e-07,
"loss": 0.0375,
"step": 531
},
{
"epoch": 1.7172859450726978,
"grad_norm": 0.5062348215221578,
"learning_rate": 2.893972831952166e-07,
"loss": 0.0345,
"step": 532
},
{
"epoch": 1.7205169628432957,
"grad_norm": 0.5125908206118261,
"learning_rate": 2.82835392923681e-07,
"loss": 0.0284,
"step": 533
},
{
"epoch": 1.7237479806138933,
"grad_norm": 0.4768669694675723,
"learning_rate": 2.7634428858242995e-07,
"loss": 0.0466,
"step": 534
},
{
"epoch": 1.726978998384491,
"grad_norm": 0.6399021669002876,
"learning_rate": 2.699241774086081e-07,
"loss": 0.0561,
"step": 535
},
{
"epoch": 1.7302100161550888,
"grad_norm": 0.6404329572646896,
"learning_rate": 2.6357526437280764e-07,
"loss": 0.0322,
"step": 536
},
{
"epoch": 1.7334410339256867,
"grad_norm": 0.594233151602921,
"learning_rate": 2.572977521725242e-07,
"loss": 0.0343,
"step": 537
},
{
"epoch": 1.7366720516962844,
"grad_norm": 0.6970207221495152,
"learning_rate": 2.5109184122568797e-07,
"loss": 0.0676,
"step": 538
},
{
"epoch": 1.739903069466882,
"grad_norm": 0.5196706248162594,
"learning_rate": 2.449577296642647e-07,
"loss": 0.0306,
"step": 539
},
{
"epoch": 1.7431340872374799,
"grad_norm": 0.4727031774493694,
"learning_rate": 2.388956133279266e-07,
"loss": 0.0318,
"step": 540
},
{
"epoch": 1.7463651050080775,
"grad_norm": 0.5101584744111383,
"learning_rate": 2.329056857578049e-07,
"loss": 0.0475,
"step": 541
},
{
"epoch": 1.7495961227786752,
"grad_norm": 0.47288331779975473,
"learning_rate": 2.2698813819030802e-07,
"loss": 0.0349,
"step": 542
},
{
"epoch": 1.752827140549273,
"grad_norm": 0.5306522527872187,
"learning_rate": 2.2114315955101495e-07,
"loss": 0.0404,
"step": 543
},
{
"epoch": 1.7560581583198709,
"grad_norm": 0.6700784665030379,
"learning_rate": 2.153709364486467e-07,
"loss": 0.0535,
"step": 544
},
{
"epoch": 1.7592891760904685,
"grad_norm": 0.5532660459489857,
"learning_rate": 2.0967165316910675e-07,
"loss": 0.0362,
"step": 545
},
{
"epoch": 1.7625201938610662,
"grad_norm": 0.5690415151498087,
"learning_rate": 2.040454916695972e-07,
"loss": 0.0416,
"step": 546
},
{
"epoch": 1.765751211631664,
"grad_norm": 0.581032791692991,
"learning_rate": 1.9849263157281057e-07,
"loss": 0.0489,
"step": 547
},
{
"epoch": 1.7689822294022617,
"grad_norm": 0.5121050632247067,
"learning_rate": 1.9301325016119338e-07,
"loss": 0.0326,
"step": 548
},
{
"epoch": 1.7722132471728593,
"grad_norm": 0.4844742553742546,
"learning_rate": 1.8760752237128864e-07,
"loss": 0.0495,
"step": 549
},
{
"epoch": 1.7754442649434572,
"grad_norm": 0.6114242878876927,
"learning_rate": 1.8227562078814903e-07,
"loss": 0.0332,
"step": 550
},
{
"epoch": 1.778675282714055,
"grad_norm": 0.5118426883532984,
"learning_rate": 1.7701771563982757e-07,
"loss": 0.0369,
"step": 551
},
{
"epoch": 1.7819063004846527,
"grad_norm": 0.4094343968094117,
"learning_rate": 1.7183397479194175e-07,
"loss": 0.0449,
"step": 552
},
{
"epoch": 1.7851373182552503,
"grad_norm": 1.1161654981261413,
"learning_rate": 1.667245637423162e-07,
"loss": 0.0461,
"step": 553
},
{
"epoch": 1.7883683360258482,
"grad_norm": 0.5663414514839669,
"learning_rate": 1.6168964561569716e-07,
"loss": 0.0341,
"step": 554
},
{
"epoch": 1.7915993537964459,
"grad_norm": 0.5593424866536396,
"learning_rate": 1.5672938115854546e-07,
"loss": 0.0415,
"step": 555
},
{
"epoch": 1.7948303715670435,
"grad_norm": 0.6317941412989588,
"learning_rate": 1.5184392873390463e-07,
"loss": 0.0393,
"step": 556
},
{
"epoch": 1.7980613893376414,
"grad_norm": 0.4921503369201142,
"learning_rate": 1.470334443163432e-07,
"loss": 0.0338,
"step": 557
},
{
"epoch": 1.8012924071082392,
"grad_norm": 0.45350637218576434,
"learning_rate": 1.4229808148697732e-07,
"loss": 0.0307,
"step": 558
},
{
"epoch": 1.8045234248788369,
"grad_norm": 0.5558616504683307,
"learning_rate": 1.3763799142856693e-07,
"loss": 0.0394,
"step": 559
},
{
"epoch": 1.8077544426494345,
"grad_norm": 0.5602513622205471,
"learning_rate": 1.3305332292068706e-07,
"loss": 0.0461,
"step": 560
},
{
"epoch": 1.8109854604200324,
"grad_norm": 0.4909817316983247,
"learning_rate": 1.285442223349806e-07,
"loss": 0.0391,
"step": 561
},
{
"epoch": 1.81421647819063,
"grad_norm": 0.6251726656596086,
"learning_rate": 1.2411083363048386e-07,
"loss": 0.0384,
"step": 562
},
{
"epoch": 1.8174474959612277,
"grad_norm": 0.4604846376779599,
"learning_rate": 1.1975329834903017e-07,
"loss": 0.0271,
"step": 563
},
{
"epoch": 1.8206785137318255,
"grad_norm": 0.5538018269789088,
"learning_rate": 1.1547175561073154e-07,
"loss": 0.0606,
"step": 564
},
{
"epoch": 1.8239095315024234,
"grad_norm": 0.716319643356168,
"learning_rate": 1.1126634210953751e-07,
"loss": 0.0455,
"step": 565
},
{
"epoch": 1.827140549273021,
"grad_norm": 0.5640740061628162,
"learning_rate": 1.071371921088693e-07,
"loss": 0.0243,
"step": 566
},
{
"epoch": 1.8303715670436187,
"grad_norm": 0.31893744573237565,
"learning_rate": 1.0308443743733548e-07,
"loss": 0.0239,
"step": 567
},
{
"epoch": 1.8336025848142166,
"grad_norm": 0.5743187176077259,
"learning_rate": 9.91082074845215e-08,
"loss": 0.0399,
"step": 568
},
{
"epoch": 1.8368336025848142,
"grad_norm": 0.555173849824351,
"learning_rate": 9.520862919685903e-08,
"loss": 0.0565,
"step": 569
},
{
"epoch": 1.8400646203554119,
"grad_norm": 0.5247191749483705,
"learning_rate": 9.138582707357429e-08,
"loss": 0.0317,
"step": 570
},
{
"epoch": 1.8432956381260097,
"grad_norm": 0.6462019405718445,
"learning_rate": 8.763992316271175e-08,
"loss": 0.0443,
"step": 571
},
{
"epoch": 1.8465266558966076,
"grad_norm": 0.5199310882556472,
"learning_rate": 8.397103705723774e-08,
"loss": 0.0496,
"step": 572
},
{
"epoch": 1.849757673667205,
"grad_norm": 0.5797413283647508,
"learning_rate": 8.037928589122306e-08,
"loss": 0.0362,
"step": 573
},
{
"epoch": 1.8529886914378029,
"grad_norm": 0.5916717165607657,
"learning_rate": 7.686478433610339e-08,
"loss": 0.036,
"step": 574
},
{
"epoch": 1.8562197092084007,
"grad_norm": 0.6872719525671662,
"learning_rate": 7.342764459701723e-08,
"loss": 0.0433,
"step": 575
},
{
"epoch": 1.8594507269789984,
"grad_norm": 0.5719774164295217,
"learning_rate": 7.006797640922436e-08,
"loss": 0.045,
"step": 576
},
{
"epoch": 1.862681744749596,
"grad_norm": 0.4607000516397503,
"learning_rate": 6.678588703460165e-08,
"loss": 0.0308,
"step": 577
},
{
"epoch": 1.865912762520194,
"grad_norm": 0.47224203890648525,
"learning_rate": 6.358148125822e-08,
"loss": 0.0263,
"step": 578
},
{
"epoch": 1.8691437802907918,
"grad_norm": 0.4659556647523011,
"learning_rate": 6.045486138499756e-08,
"loss": 0.0331,
"step": 579
},
{
"epoch": 1.8723747980613892,
"grad_norm": 0.48850435913378176,
"learning_rate": 5.7406127236434016e-08,
"loss": 0.0385,
"step": 580
},
{
"epoch": 1.875605815831987,
"grad_norm": 0.527755833512547,
"learning_rate": 5.4435376147423945e-08,
"loss": 0.0361,
"step": 581
},
{
"epoch": 1.878836833602585,
"grad_norm": 0.5906290141554674,
"learning_rate": 5.154270296314878e-08,
"loss": 0.0341,
"step": 582
},
{
"epoch": 1.8820678513731826,
"grad_norm": 0.5773632179611871,
"learning_rate": 4.872820003604922e-08,
"loss": 0.0436,
"step": 583
},
{
"epoch": 1.8852988691437802,
"grad_norm": 0.5457918200937637,
"learning_rate": 4.599195722287536e-08,
"loss": 0.0316,
"step": 584
},
{
"epoch": 1.888529886914378,
"grad_norm": 0.48490285444923714,
"learning_rate": 4.3334061881820934e-08,
"loss": 0.0367,
"step": 585
},
{
"epoch": 1.891760904684976,
"grad_norm": 0.6250593429748459,
"learning_rate": 4.0754598869730824e-08,
"loss": 0.048,
"step": 586
},
{
"epoch": 1.8949919224555734,
"grad_norm": 0.7932835837622026,
"learning_rate": 3.825365053939406e-08,
"loss": 0.0444,
"step": 587
},
{
"epoch": 1.8982229402261712,
"grad_norm": 0.694824220848745,
"learning_rate": 3.583129673691427e-08,
"loss": 0.046,
"step": 588
},
{
"epoch": 1.901453957996769,
"grad_norm": 0.6400347943270668,
"learning_rate": 3.3487614799159186e-08,
"loss": 0.0447,
"step": 589
},
{
"epoch": 1.9046849757673667,
"grad_norm": 0.6428387216930922,
"learning_rate": 3.1222679551293486e-08,
"loss": 0.0378,
"step": 590
},
{
"epoch": 1.9079159935379644,
"grad_norm": 0.5181562999964454,
"learning_rate": 2.9036563304389032e-08,
"loss": 0.0447,
"step": 591
},
{
"epoch": 1.9111470113085622,
"grad_norm": 0.626996440091958,
"learning_rate": 2.6929335853115302e-08,
"loss": 0.0549,
"step": 592
},
{
"epoch": 1.9143780290791599,
"grad_norm": 0.46884001067775155,
"learning_rate": 2.490106447351315e-08,
"loss": 0.0277,
"step": 593
},
{
"epoch": 1.9176090468497575,
"grad_norm": 0.6158232854353491,
"learning_rate": 2.295181392084511e-08,
"loss": 0.0601,
"step": 594
},
{
"epoch": 1.9208400646203554,
"grad_norm": 0.9359381094535966,
"learning_rate": 2.1081646427528468e-08,
"loss": 0.0318,
"step": 595
},
{
"epoch": 1.9240710823909533,
"grad_norm": 0.49020001937141494,
"learning_rate": 1.9290621701149315e-08,
"loss": 0.0426,
"step": 596
},
{
"epoch": 1.927302100161551,
"grad_norm": 0.6612576122540997,
"learning_rate": 1.757879692255493e-08,
"loss": 0.0425,
"step": 597
},
{
"epoch": 1.9305331179321485,
"grad_norm": 0.6686158894140516,
"learning_rate": 1.5946226744029402e-08,
"loss": 0.0513,
"step": 598
},
{
"epoch": 1.9337641357027464,
"grad_norm": 0.570200319629059,
"learning_rate": 1.43929632875478e-08,
"loss": 0.0433,
"step": 599
},
{
"epoch": 1.936995153473344,
"grad_norm": 0.7735949555352855,
"learning_rate": 1.2919056143113062e-08,
"loss": 0.0626,
"step": 600
},
{
"epoch": 1.9402261712439417,
"grad_norm": 0.870645903135215,
"learning_rate": 1.1524552367171705e-08,
"loss": 0.0325,
"step": 601
},
{
"epoch": 1.9434571890145396,
"grad_norm": 0.7457181182787564,
"learning_rate": 1.0209496481112247e-08,
"loss": 0.0429,
"step": 602
},
{
"epoch": 1.9466882067851374,
"grad_norm": 0.4589602142690698,
"learning_rate": 8.973930469844127e-09,
"loss": 0.0456,
"step": 603
},
{
"epoch": 1.949919224555735,
"grad_norm": 0.6365759321501641,
"learning_rate": 7.81789378045572e-09,
"loss": 0.0444,
"step": 604
},
{
"epoch": 1.9531502423263327,
"grad_norm": 0.5685501801373803,
"learning_rate": 6.741423320957286e-09,
"loss": 0.0389,
"step": 605
},
{
"epoch": 1.9563812600969306,
"grad_norm": 0.6248328079624079,
"learning_rate": 5.7445534591002435e-09,
"loss": 0.0359,
"step": 606
},
{
"epoch": 1.9596122778675282,
"grad_norm": 0.5081816835827963,
"learning_rate": 4.8273160212811145e-09,
"loss": 0.0372,
"step": 607
},
{
"epoch": 1.9628432956381259,
"grad_norm": 0.6052821334268623,
"learning_rate": 3.989740291526212e-09,
"loss": 0.0342,
"step": 608
},
{
"epoch": 1.9660743134087237,
"grad_norm": 0.6069914651621039,
"learning_rate": 3.2318530105546198e-09,
"loss": 0.0562,
"step": 609
},
{
"epoch": 1.9693053311793216,
"grad_norm": 0.6247816362826526,
"learning_rate": 2.553678374926649e-09,
"loss": 0.0345,
"step": 610
},
{
"epoch": 1.9725363489499192,
"grad_norm": 0.4803333487137274,
"learning_rate": 1.9552380362697355e-09,
"loss": 0.0389,
"step": 611
},
{
"epoch": 1.975767366720517,
"grad_norm": 0.4064456733315355,
"learning_rate": 1.4365511005878796e-09,
"loss": 0.038,
"step": 612
},
{
"epoch": 1.9789983844911148,
"grad_norm": 0.6627884418012588,
"learning_rate": 9.976341276521361e-10,
"loss": 0.0499,
"step": 613
},
{
"epoch": 1.9822294022617124,
"grad_norm": 0.5188386174906152,
"learning_rate": 6.385011304704814e-10,
"loss": 0.0504,
"step": 614
},
{
"epoch": 1.98546042003231,
"grad_norm": 0.4923517024917629,
"learning_rate": 3.59163574841781e-10,
"loss": 0.0313,
"step": 615
},
{
"epoch": 1.988691437802908,
"grad_norm": 1.0015315278245285,
"learning_rate": 1.5963037898913957e-10,
"loss": 0.0512,
"step": 616
},
{
"epoch": 1.9919224555735058,
"grad_norm": 0.6617805237667193,
"learning_rate": 3.9907913275683e-11,
"loss": 0.0487,
"step": 617
},
{
"epoch": 1.9951534733441034,
"grad_norm": 0.5338181066973476,
"learning_rate": 0.0,
"loss": 0.0393,
"step": 618
},
{
"epoch": 1.9951534733441034,
"step": 618,
"total_flos": 52320870432768.0,
"train_loss": 0.06418457114014135,
"train_runtime": 2400.3631,
"train_samples_per_second": 4.126,
"train_steps_per_second": 0.257
}
],
"logging_steps": 1,
"max_steps": 618,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 52320870432768.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}