| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.99510603588907, | |
| "eval_steps": 500, | |
| "global_step": 918, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.03262642740619902, | |
| "grad_norm": 0.5258967280387878, | |
| "learning_rate": 5e-06, | |
| "loss": 1.8319, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06525285481239804, | |
| "grad_norm": 0.5200194120407104, | |
| "learning_rate": 9.5e-06, | |
| "loss": 1.7689, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.09787928221859707, | |
| "grad_norm": 0.56816166639328, | |
| "learning_rate": 1.45e-05, | |
| "loss": 1.7576, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.13050570962479607, | |
| "grad_norm": 0.5110853910446167, | |
| "learning_rate": 1.9500000000000003e-05, | |
| "loss": 1.6579, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1631321370309951, | |
| "grad_norm": 0.4745779037475586, | |
| "learning_rate": 2.45e-05, | |
| "loss": 1.5154, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.19575856443719414, | |
| "grad_norm": 0.5604385137557983, | |
| "learning_rate": 2.95e-05, | |
| "loss": 1.3892, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.22838499184339314, | |
| "grad_norm": 0.7449052333831787, | |
| "learning_rate": 3.45e-05, | |
| "loss": 1.2256, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.26101141924959215, | |
| "grad_norm": 0.7363050580024719, | |
| "learning_rate": 3.9500000000000005e-05, | |
| "loss": 0.982, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2936378466557912, | |
| "grad_norm": 0.5178012847900391, | |
| "learning_rate": 4.4500000000000004e-05, | |
| "loss": 0.7681, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.3262642740619902, | |
| "grad_norm": 0.6690722703933716, | |
| "learning_rate": 4.9500000000000004e-05, | |
| "loss": 0.7226, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.35889070146818924, | |
| "grad_norm": 0.4482414424419403, | |
| "learning_rate": 4.944987775061125e-05, | |
| "loss": 0.6749, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3915171288743883, | |
| "grad_norm": 0.5283921957015991, | |
| "learning_rate": 4.883863080684597e-05, | |
| "loss": 0.6465, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.42414355628058725, | |
| "grad_norm": 0.5526198744773865, | |
| "learning_rate": 4.822738386308069e-05, | |
| "loss": 0.594, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.4567699836867863, | |
| "grad_norm": 0.5214644074440002, | |
| "learning_rate": 4.761613691931541e-05, | |
| "loss": 0.5579, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4893964110929853, | |
| "grad_norm": 0.5322310924530029, | |
| "learning_rate": 4.7004889975550123e-05, | |
| "loss": 0.5305, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.5220228384991843, | |
| "grad_norm": 0.7314450740814209, | |
| "learning_rate": 4.6393643031784844e-05, | |
| "loss": 0.4756, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5546492659053833, | |
| "grad_norm": 0.8103362917900085, | |
| "learning_rate": 4.5782396088019564e-05, | |
| "loss": 0.4592, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5872756933115824, | |
| "grad_norm": 0.6366791725158691, | |
| "learning_rate": 4.5171149144254284e-05, | |
| "loss": 0.4786, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.6199021207177814, | |
| "grad_norm": 0.6715788245201111, | |
| "learning_rate": 4.4559902200489e-05, | |
| "loss": 0.4216, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6525285481239804, | |
| "grad_norm": 0.6936819553375244, | |
| "learning_rate": 4.394865525672372e-05, | |
| "loss": 0.3964, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6851549755301795, | |
| "grad_norm": 0.8318383693695068, | |
| "learning_rate": 4.333740831295844e-05, | |
| "loss": 0.4043, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.7177814029363785, | |
| "grad_norm": 0.656146228313446, | |
| "learning_rate": 4.272616136919316e-05, | |
| "loss": 0.3881, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7504078303425775, | |
| "grad_norm": 0.9217523336410522, | |
| "learning_rate": 4.211491442542788e-05, | |
| "loss": 0.3378, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7830342577487766, | |
| "grad_norm": 0.7269819378852844, | |
| "learning_rate": 4.150366748166259e-05, | |
| "loss": 0.3052, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.8156606851549756, | |
| "grad_norm": 0.7469998002052307, | |
| "learning_rate": 4.089242053789731e-05, | |
| "loss": 0.301, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.8482871125611745, | |
| "grad_norm": 0.7021219730377197, | |
| "learning_rate": 4.028117359413203e-05, | |
| "loss": 0.32, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8809135399673735, | |
| "grad_norm": 0.6621549725532532, | |
| "learning_rate": 3.966992665036675e-05, | |
| "loss": 0.283, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.9135399673735726, | |
| "grad_norm": 1.2602078914642334, | |
| "learning_rate": 3.905867970660147e-05, | |
| "loss": 0.3075, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.9461663947797716, | |
| "grad_norm": 0.8152816295623779, | |
| "learning_rate": 3.8447432762836186e-05, | |
| "loss": 0.2951, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.9787928221859706, | |
| "grad_norm": 0.7038506865501404, | |
| "learning_rate": 3.783618581907091e-05, | |
| "loss": 0.2964, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.0114192495921697, | |
| "grad_norm": 1.1956801414489746, | |
| "learning_rate": 3.722493887530563e-05, | |
| "loss": 0.2707, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.0440456769983686, | |
| "grad_norm": 0.6589512228965759, | |
| "learning_rate": 3.661369193154035e-05, | |
| "loss": 0.2689, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.0766721044045677, | |
| "grad_norm": 0.953842043876648, | |
| "learning_rate": 3.600244498777506e-05, | |
| "loss": 0.2898, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.1092985318107667, | |
| "grad_norm": 0.6870063543319702, | |
| "learning_rate": 3.539119804400978e-05, | |
| "loss": 0.2836, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.1419249592169658, | |
| "grad_norm": 0.9847205877304077, | |
| "learning_rate": 3.47799511002445e-05, | |
| "loss": 0.2707, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.1745513866231647, | |
| "grad_norm": 1.0384355783462524, | |
| "learning_rate": 3.416870415647922e-05, | |
| "loss": 0.2578, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.2071778140293639, | |
| "grad_norm": 0.7339671850204468, | |
| "learning_rate": 3.355745721271394e-05, | |
| "loss": 0.2622, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.2398042414355628, | |
| "grad_norm": 0.8134469389915466, | |
| "learning_rate": 3.2946210268948655e-05, | |
| "loss": 0.2365, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.272430668841762, | |
| "grad_norm": 0.9134344458580017, | |
| "learning_rate": 3.2334963325183375e-05, | |
| "loss": 0.2586, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.3050570962479608, | |
| "grad_norm": 0.7556074261665344, | |
| "learning_rate": 3.1723716381418096e-05, | |
| "loss": 0.236, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.3376835236541598, | |
| "grad_norm": 1.0541133880615234, | |
| "learning_rate": 3.1112469437652816e-05, | |
| "loss": 0.2483, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.370309951060359, | |
| "grad_norm": 1.0119078159332275, | |
| "learning_rate": 3.0501222493887533e-05, | |
| "loss": 0.2462, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.4029363784665578, | |
| "grad_norm": 0.9008921980857849, | |
| "learning_rate": 2.988997555012225e-05, | |
| "loss": 0.2373, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.435562805872757, | |
| "grad_norm": 0.9207481741905212, | |
| "learning_rate": 2.927872860635697e-05, | |
| "loss": 0.2504, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.468189233278956, | |
| "grad_norm": 0.8675833344459534, | |
| "learning_rate": 2.866748166259169e-05, | |
| "loss": 0.2463, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.5008156606851548, | |
| "grad_norm": 1.7861591577529907, | |
| "learning_rate": 2.8056234718826407e-05, | |
| "loss": 0.2468, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.533442088091354, | |
| "grad_norm": 0.8898101449012756, | |
| "learning_rate": 2.7444987775061127e-05, | |
| "loss": 0.2254, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.566068515497553, | |
| "grad_norm": 0.9002168774604797, | |
| "learning_rate": 2.6833740831295844e-05, | |
| "loss": 0.2265, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.598694942903752, | |
| "grad_norm": 1.0384007692337036, | |
| "learning_rate": 2.6222493887530564e-05, | |
| "loss": 0.214, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.631321370309951, | |
| "grad_norm": 0.7849037647247314, | |
| "learning_rate": 2.561124694376528e-05, | |
| "loss": 0.2325, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.66394779771615, | |
| "grad_norm": 0.7699252367019653, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.2171, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.6965742251223492, | |
| "grad_norm": 0.9055914878845215, | |
| "learning_rate": 2.438875305623472e-05, | |
| "loss": 0.2322, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.7292006525285482, | |
| "grad_norm": 1.4799339771270752, | |
| "learning_rate": 2.3777506112469438e-05, | |
| "loss": 0.2165, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.761827079934747, | |
| "grad_norm": 0.9675979018211365, | |
| "learning_rate": 2.316625916870416e-05, | |
| "loss": 0.2218, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.7944535073409462, | |
| "grad_norm": 1.1401549577713013, | |
| "learning_rate": 2.2555012224938875e-05, | |
| "loss": 0.2123, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.8270799347471451, | |
| "grad_norm": 1.1033681631088257, | |
| "learning_rate": 2.1943765281173596e-05, | |
| "loss": 0.2285, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.8597063621533443, | |
| "grad_norm": 1.0548712015151978, | |
| "learning_rate": 2.1332518337408312e-05, | |
| "loss": 0.2136, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.8923327895595432, | |
| "grad_norm": 1.5485633611679077, | |
| "learning_rate": 2.0721271393643033e-05, | |
| "loss": 0.2055, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.9249592169657421, | |
| "grad_norm": 0.9844083786010742, | |
| "learning_rate": 2.0110024449877753e-05, | |
| "loss": 0.2142, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.9575856443719413, | |
| "grad_norm": 1.1740403175354004, | |
| "learning_rate": 1.949877750611247e-05, | |
| "loss": 0.1948, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.9902120717781404, | |
| "grad_norm": 0.9265509843826294, | |
| "learning_rate": 1.888753056234719e-05, | |
| "loss": 0.2142, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.0228384991843393, | |
| "grad_norm": 0.8141701221466064, | |
| "learning_rate": 1.8276283618581907e-05, | |
| "loss": 0.2023, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.0554649265905383, | |
| "grad_norm": 0.7996273040771484, | |
| "learning_rate": 1.7665036674816627e-05, | |
| "loss": 0.1924, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.088091353996737, | |
| "grad_norm": 0.9325422048568726, | |
| "learning_rate": 1.7053789731051344e-05, | |
| "loss": 0.1887, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.1207177814029365, | |
| "grad_norm": 0.9285069108009338, | |
| "learning_rate": 1.6442542787286064e-05, | |
| "loss": 0.1891, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.1533442088091355, | |
| "grad_norm": 1.1411644220352173, | |
| "learning_rate": 1.583129584352078e-05, | |
| "loss": 0.1898, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.1859706362153344, | |
| "grad_norm": 0.8324933052062988, | |
| "learning_rate": 1.5220048899755501e-05, | |
| "loss": 0.1854, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.2185970636215333, | |
| "grad_norm": 1.0230185985565186, | |
| "learning_rate": 1.460880195599022e-05, | |
| "loss": 0.1909, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.2512234910277327, | |
| "grad_norm": 1.1167818307876587, | |
| "learning_rate": 1.3997555012224938e-05, | |
| "loss": 0.2023, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.2838499184339316, | |
| "grad_norm": 0.8201693892478943, | |
| "learning_rate": 1.3386308068459657e-05, | |
| "loss": 0.1928, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.3164763458401305, | |
| "grad_norm": 1.2748645544052124, | |
| "learning_rate": 1.2775061124694377e-05, | |
| "loss": 0.2043, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.3491027732463294, | |
| "grad_norm": 0.9501346945762634, | |
| "learning_rate": 1.2163814180929096e-05, | |
| "loss": 0.1914, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.3817292006525284, | |
| "grad_norm": 0.8706419467926025, | |
| "learning_rate": 1.1552567237163816e-05, | |
| "loss": 0.1763, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.4143556280587277, | |
| "grad_norm": 1.8394954204559326, | |
| "learning_rate": 1.0941320293398534e-05, | |
| "loss": 0.1856, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.4469820554649266, | |
| "grad_norm": 0.9215448498725891, | |
| "learning_rate": 1.0330073349633253e-05, | |
| "loss": 0.181, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.4796084828711256, | |
| "grad_norm": 0.8978357911109924, | |
| "learning_rate": 9.718826405867972e-06, | |
| "loss": 0.18, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.5122349102773245, | |
| "grad_norm": 0.9564265608787537, | |
| "learning_rate": 9.10757946210269e-06, | |
| "loss": 0.1961, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.544861337683524, | |
| "grad_norm": 1.1424188613891602, | |
| "learning_rate": 8.496332518337409e-06, | |
| "loss": 0.1812, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.5774877650897228, | |
| "grad_norm": 1.0267794132232666, | |
| "learning_rate": 7.885085574572127e-06, | |
| "loss": 0.1861, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.6101141924959217, | |
| "grad_norm": 0.8952123522758484, | |
| "learning_rate": 7.273838630806847e-06, | |
| "loss": 0.1716, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.6427406199021206, | |
| "grad_norm": 1.2367416620254517, | |
| "learning_rate": 6.662591687041565e-06, | |
| "loss": 0.1863, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.6753670473083195, | |
| "grad_norm": 1.0904302597045898, | |
| "learning_rate": 6.051344743276284e-06, | |
| "loss": 0.1823, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.707993474714519, | |
| "grad_norm": 1.0542256832122803, | |
| "learning_rate": 5.440097799511003e-06, | |
| "loss": 0.1764, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.740619902120718, | |
| "grad_norm": 0.8970702886581421, | |
| "learning_rate": 4.828850855745722e-06, | |
| "loss": 0.1866, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.7732463295269167, | |
| "grad_norm": 0.8508768081665039, | |
| "learning_rate": 4.21760391198044e-06, | |
| "loss": 0.1791, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.8058727569331157, | |
| "grad_norm": 0.9417251944541931, | |
| "learning_rate": 3.606356968215159e-06, | |
| "loss": 0.18, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.838499184339315, | |
| "grad_norm": 0.8804249167442322, | |
| "learning_rate": 2.9951100244498777e-06, | |
| "loss": 0.1844, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.871125611745514, | |
| "grad_norm": 0.8565665483474731, | |
| "learning_rate": 2.3838630806845967e-06, | |
| "loss": 0.1781, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.903752039151713, | |
| "grad_norm": 0.9489305019378662, | |
| "learning_rate": 1.7726161369193154e-06, | |
| "loss": 0.1809, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.936378466557912, | |
| "grad_norm": 0.7006880044937134, | |
| "learning_rate": 1.1613691931540342e-06, | |
| "loss": 0.1712, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.9690048939641107, | |
| "grad_norm": 0.8745304942131042, | |
| "learning_rate": 5.501222493887531e-07, | |
| "loss": 0.1773, | |
| "step": 910 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 918, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.1375621636321444e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |