| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.9548387096774196, | |
| "eval_steps": 500, | |
| "global_step": 308, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.012903225806451613, | |
| "grad_norm": 3.865084409713745, | |
| "learning_rate": 1.6129032258064516e-06, | |
| "loss": 0.2085, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.025806451612903226, | |
| "grad_norm": 4.324934482574463, | |
| "learning_rate": 3.225806451612903e-06, | |
| "loss": 0.2026, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.03870967741935484, | |
| "grad_norm": 3.4087750911712646, | |
| "learning_rate": 4.838709677419355e-06, | |
| "loss": 0.2162, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.05161290322580645, | |
| "grad_norm": 1.6109209060668945, | |
| "learning_rate": 6.451612903225806e-06, | |
| "loss": 0.1839, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.06451612903225806, | |
| "grad_norm": 1.3727400302886963, | |
| "learning_rate": 8.064516129032258e-06, | |
| "loss": 0.1952, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.07741935483870968, | |
| "grad_norm": 1.283871054649353, | |
| "learning_rate": 9.67741935483871e-06, | |
| "loss": 0.1879, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.09032258064516129, | |
| "grad_norm": 1.0045589208602905, | |
| "learning_rate": 1.129032258064516e-05, | |
| "loss": 0.1501, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.1032258064516129, | |
| "grad_norm": 1.3846447467803955, | |
| "learning_rate": 1.2903225806451613e-05, | |
| "loss": 0.1489, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.11612903225806452, | |
| "grad_norm": 1.364700198173523, | |
| "learning_rate": 1.4516129032258066e-05, | |
| "loss": 0.1484, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.12903225806451613, | |
| "grad_norm": 0.9269119501113892, | |
| "learning_rate": 1.6129032258064517e-05, | |
| "loss": 0.1388, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.14193548387096774, | |
| "grad_norm": 2.6850790977478027, | |
| "learning_rate": 1.774193548387097e-05, | |
| "loss": 0.1319, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.15483870967741936, | |
| "grad_norm": 1.2811543941497803, | |
| "learning_rate": 1.935483870967742e-05, | |
| "loss": 0.1246, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.16774193548387098, | |
| "grad_norm": 1.0256060361862183, | |
| "learning_rate": 2.0967741935483873e-05, | |
| "loss": 0.1319, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.18064516129032257, | |
| "grad_norm": 0.9394522905349731, | |
| "learning_rate": 2.258064516129032e-05, | |
| "loss": 0.1191, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.1935483870967742, | |
| "grad_norm": 0.8834514021873474, | |
| "learning_rate": 2.4193548387096777e-05, | |
| "loss": 0.1248, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.2064516129032258, | |
| "grad_norm": 1.0034328699111938, | |
| "learning_rate": 2.5806451612903226e-05, | |
| "loss": 0.124, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.21935483870967742, | |
| "grad_norm": 0.8772470951080322, | |
| "learning_rate": 2.7419354838709678e-05, | |
| "loss": 0.1068, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.23225806451612904, | |
| "grad_norm": 0.9788472652435303, | |
| "learning_rate": 2.9032258064516133e-05, | |
| "loss": 0.1245, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.24516129032258063, | |
| "grad_norm": 0.8098297715187073, | |
| "learning_rate": 3.0645161290322585e-05, | |
| "loss": 0.1099, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.25806451612903225, | |
| "grad_norm": 0.8270663619041443, | |
| "learning_rate": 3.2258064516129034e-05, | |
| "loss": 0.0959, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2709677419354839, | |
| "grad_norm": 0.6456682682037354, | |
| "learning_rate": 3.387096774193548e-05, | |
| "loss": 0.0923, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.2838709677419355, | |
| "grad_norm": 0.8719914555549622, | |
| "learning_rate": 3.548387096774194e-05, | |
| "loss": 0.1088, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.2967741935483871, | |
| "grad_norm": 0.6625027060508728, | |
| "learning_rate": 3.7096774193548386e-05, | |
| "loss": 0.0882, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.3096774193548387, | |
| "grad_norm": 0.6829620003700256, | |
| "learning_rate": 3.870967741935484e-05, | |
| "loss": 0.0992, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.3225806451612903, | |
| "grad_norm": 0.8227680921554565, | |
| "learning_rate": 4.032258064516129e-05, | |
| "loss": 0.1224, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.33548387096774196, | |
| "grad_norm": 0.5915015339851379, | |
| "learning_rate": 4.1935483870967746e-05, | |
| "loss": 0.0846, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.34838709677419355, | |
| "grad_norm": 0.6318536400794983, | |
| "learning_rate": 4.3548387096774194e-05, | |
| "loss": 0.0864, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.36129032258064514, | |
| "grad_norm": 0.6643650531768799, | |
| "learning_rate": 4.516129032258064e-05, | |
| "loss": 0.103, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.3741935483870968, | |
| "grad_norm": 0.6708557605743408, | |
| "learning_rate": 4.67741935483871e-05, | |
| "loss": 0.1148, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.3870967741935484, | |
| "grad_norm": 0.5628843307495117, | |
| "learning_rate": 4.8387096774193554e-05, | |
| "loss": 0.0789, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.5701764225959778, | |
| "learning_rate": 5e-05, | |
| "loss": 0.0844, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.4129032258064516, | |
| "grad_norm": 0.565487265586853, | |
| "learning_rate": 4.999839214987031e-05, | |
| "loss": 0.0813, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.4258064516129032, | |
| "grad_norm": 0.5022628903388977, | |
| "learning_rate": 4.9993568806295786e-05, | |
| "loss": 0.0818, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.43870967741935485, | |
| "grad_norm": 0.5096875429153442, | |
| "learning_rate": 4.9985530589693516e-05, | |
| "loss": 0.0844, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.45161290322580644, | |
| "grad_norm": 0.6019471883773804, | |
| "learning_rate": 4.997427853400333e-05, | |
| "loss": 0.0903, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.4645161290322581, | |
| "grad_norm": 0.4620855152606964, | |
| "learning_rate": 4.995981408655473e-05, | |
| "loss": 0.0772, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.4774193548387097, | |
| "grad_norm": 0.46718630194664, | |
| "learning_rate": 4.9942139107880844e-05, | |
| "loss": 0.0786, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.49032258064516127, | |
| "grad_norm": 0.6058530211448669, | |
| "learning_rate": 4.9921255871479e-05, | |
| "loss": 0.0948, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.5032258064516129, | |
| "grad_norm": 0.5089570879936218, | |
| "learning_rate": 4.989716706351835e-05, | |
| "loss": 0.0805, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.5161290322580645, | |
| "grad_norm": 0.4403476417064667, | |
| "learning_rate": 4.986987578249432e-05, | |
| "loss": 0.0899, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.5290322580645161, | |
| "grad_norm": 0.640173077583313, | |
| "learning_rate": 4.9839385538830104e-05, | |
| "loss": 0.0849, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.5419354838709678, | |
| "grad_norm": 0.5224989056587219, | |
| "learning_rate": 4.980570025442507e-05, | |
| "loss": 0.0845, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.5548387096774193, | |
| "grad_norm": 0.5968887805938721, | |
| "learning_rate": 4.9768824262150344e-05, | |
| "loss": 0.0912, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.567741935483871, | |
| "grad_norm": 0.5556889772415161, | |
| "learning_rate": 4.972876230529142e-05, | |
| "loss": 0.0907, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.5806451612903226, | |
| "grad_norm": 0.4258240759372711, | |
| "learning_rate": 4.968551953693813e-05, | |
| "loss": 0.063, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.5935483870967742, | |
| "grad_norm": 0.5726487636566162, | |
| "learning_rate": 4.963910151932171e-05, | |
| "loss": 0.0816, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.6064516129032258, | |
| "grad_norm": 0.5164178609848022, | |
| "learning_rate": 4.958951422309942e-05, | |
| "loss": 0.0815, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.6193548387096774, | |
| "grad_norm": 0.5187662839889526, | |
| "learning_rate": 4.95367640265865e-05, | |
| "loss": 0.0834, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.632258064516129, | |
| "grad_norm": 0.4768002927303314, | |
| "learning_rate": 4.948085771493579e-05, | |
| "loss": 0.0818, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 1.3129404783248901, | |
| "learning_rate": 4.942180247926492e-05, | |
| "loss": 0.077, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.6580645161290323, | |
| "grad_norm": 0.5702396631240845, | |
| "learning_rate": 4.935960591573135e-05, | |
| "loss": 0.0869, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.6709677419354839, | |
| "grad_norm": 0.5151292085647583, | |
| "learning_rate": 4.929427602455532e-05, | |
| "loss": 0.091, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.6838709677419355, | |
| "grad_norm": 0.4023990035057068, | |
| "learning_rate": 4.922582120899072e-05, | |
| "loss": 0.0772, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.6967741935483871, | |
| "grad_norm": 0.5167486667633057, | |
| "learning_rate": 4.91542502742443e-05, | |
| "loss": 0.0815, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.7096774193548387, | |
| "grad_norm": 0.4807862341403961, | |
| "learning_rate": 4.907957242634299e-05, | |
| "loss": 0.0819, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.7225806451612903, | |
| "grad_norm": 0.5813907384872437, | |
| "learning_rate": 4.900179727094978e-05, | |
| "loss": 0.0852, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.7354838709677419, | |
| "grad_norm": 0.44726839661598206, | |
| "learning_rate": 4.8920934812128164e-05, | |
| "loss": 0.0846, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.7483870967741936, | |
| "grad_norm": 0.4237375557422638, | |
| "learning_rate": 4.8836995451055344e-05, | |
| "loss": 0.0918, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.7612903225806451, | |
| "grad_norm": 0.3706034719944, | |
| "learning_rate": 4.874998998468433e-05, | |
| "loss": 0.0713, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.7741935483870968, | |
| "grad_norm": 0.5643853545188904, | |
| "learning_rate": 4.865992960435514e-05, | |
| "loss": 0.0838, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.7870967741935484, | |
| "grad_norm": 0.4395618140697479, | |
| "learning_rate": 4.856682589435531e-05, | |
| "loss": 0.0948, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.4633603096008301, | |
| "learning_rate": 4.8470690830429823e-05, | |
| "loss": 0.076, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.8129032258064516, | |
| "grad_norm": 0.44225528836250305, | |
| "learning_rate": 4.837153677824067e-05, | |
| "loss": 0.0887, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.8258064516129032, | |
| "grad_norm": 0.3647634983062744, | |
| "learning_rate": 4.826937649177632e-05, | |
| "loss": 0.0752, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.8387096774193549, | |
| "grad_norm": 0.4087846577167511, | |
| "learning_rate": 4.816422311171115e-05, | |
| "loss": 0.0701, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.8516129032258064, | |
| "grad_norm": 0.3962288796901703, | |
| "learning_rate": 4.805609016371522e-05, | |
| "loss": 0.072, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.864516129032258, | |
| "grad_norm": 0.35116565227508545, | |
| "learning_rate": 4.7944991556714496e-05, | |
| "loss": 0.0639, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.8774193548387097, | |
| "grad_norm": 0.37532997131347656, | |
| "learning_rate": 4.783094158110174e-05, | |
| "loss": 0.0804, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.8903225806451613, | |
| "grad_norm": 0.4580182433128357, | |
| "learning_rate": 4.7713954906898415e-05, | |
| "loss": 0.0883, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.9032258064516129, | |
| "grad_norm": 0.46939095854759216, | |
| "learning_rate": 4.759404658186765e-05, | |
| "loss": 0.086, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.9161290322580645, | |
| "grad_norm": 0.38567599654197693, | |
| "learning_rate": 4.747123202957872e-05, | |
| "loss": 0.0627, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.9290322580645162, | |
| "grad_norm": 0.5291863083839417, | |
| "learning_rate": 4.7345527047423135e-05, | |
| "loss": 0.087, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.9419354838709677, | |
| "grad_norm": 0.5343989133834839, | |
| "learning_rate": 4.721694780458266e-05, | |
| "loss": 0.0817, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.9548387096774194, | |
| "grad_norm": 0.5286762118339539, | |
| "learning_rate": 4.7085510839949444e-05, | |
| "loss": 0.0824, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.967741935483871, | |
| "grad_norm": 0.6843681931495667, | |
| "learning_rate": 4.695123305999877e-05, | |
| "loss": 0.0757, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.9806451612903225, | |
| "grad_norm": 0.48566552996635437, | |
| "learning_rate": 4.681413173661429e-05, | |
| "loss": 0.0793, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.9935483870967742, | |
| "grad_norm": 0.5594713091850281, | |
| "learning_rate": 4.667422450486646e-05, | |
| "loss": 0.0919, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.5594713091850281, | |
| "learning_rate": 4.653152936074413e-05, | |
| "loss": 0.0812, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 1.0129032258064516, | |
| "grad_norm": 0.6600771546363831, | |
| "learning_rate": 4.638606465883979e-05, | |
| "loss": 0.0528, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 1.0258064516129033, | |
| "grad_norm": 0.4466298520565033, | |
| "learning_rate": 4.623784910998862e-05, | |
| "loss": 0.0562, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.038709677419355, | |
| "grad_norm": 0.46932584047317505, | |
| "learning_rate": 4.608690177886177e-05, | |
| "loss": 0.052, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 1.0516129032258064, | |
| "grad_norm": 0.3475061357021332, | |
| "learning_rate": 4.593324208151412e-05, | |
| "loss": 0.0477, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 1.064516129032258, | |
| "grad_norm": 0.3392031788825989, | |
| "learning_rate": 4.577688978288681e-05, | |
| "loss": 0.0552, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 1.0774193548387097, | |
| "grad_norm": 0.36944520473480225, | |
| "learning_rate": 4.5617864994264936e-05, | |
| "loss": 0.0623, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.0903225806451613, | |
| "grad_norm": 0.3976334035396576, | |
| "learning_rate": 4.545618817069066e-05, | |
| "loss": 0.055, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.103225806451613, | |
| "grad_norm": 0.3573772609233856, | |
| "learning_rate": 4.529188010833212e-05, | |
| "loss": 0.0652, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 1.1161290322580646, | |
| "grad_norm": 0.37986183166503906, | |
| "learning_rate": 4.512496194180846e-05, | |
| "loss": 0.0506, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.129032258064516, | |
| "grad_norm": 0.45936280488967896, | |
| "learning_rate": 4.495545514147134e-05, | |
| "loss": 0.0671, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.1419354838709677, | |
| "grad_norm": 0.3391832113265991, | |
| "learning_rate": 4.478338151064323e-05, | |
| "loss": 0.0537, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.1548387096774193, | |
| "grad_norm": 0.30911317467689514, | |
| "learning_rate": 4.460876318281291e-05, | |
| "loss": 0.0531, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.167741935483871, | |
| "grad_norm": 0.4350668489933014, | |
| "learning_rate": 4.443162261878846e-05, | |
| "loss": 0.051, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.1806451612903226, | |
| "grad_norm": 0.34030795097351074, | |
| "learning_rate": 4.425198260380818e-05, | |
| "loss": 0.0509, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.1935483870967742, | |
| "grad_norm": 0.31266239285469055, | |
| "learning_rate": 4.406986624460979e-05, | |
| "loss": 0.0493, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.206451612903226, | |
| "grad_norm": 0.3814278244972229, | |
| "learning_rate": 4.3885296966458224e-05, | |
| "loss": 0.0575, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.2193548387096773, | |
| "grad_norm": 0.2861946225166321, | |
| "learning_rate": 4.369829851013251e-05, | |
| "loss": 0.0518, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.232258064516129, | |
| "grad_norm": 0.35761791467666626, | |
| "learning_rate": 4.350889492887203e-05, | |
| "loss": 0.0509, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.2451612903225806, | |
| "grad_norm": 0.321609765291214, | |
| "learning_rate": 4.3317110585282595e-05, | |
| "loss": 0.0468, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.2580645161290323, | |
| "grad_norm": 0.33591514825820923, | |
| "learning_rate": 4.3122970148202744e-05, | |
| "loss": 0.055, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 1.270967741935484, | |
| "grad_norm": 0.38554760813713074, | |
| "learning_rate": 4.292649858953063e-05, | |
| "loss": 0.07, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.2838709677419355, | |
| "grad_norm": 0.5272489786148071, | |
| "learning_rate": 4.272772118101195e-05, | |
| "loss": 0.0564, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.2967741935483872, | |
| "grad_norm": 0.4388972818851471, | |
| "learning_rate": 4.252666349098926e-05, | |
| "loss": 0.0536, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.3096774193548386, | |
| "grad_norm": 0.33931440114974976, | |
| "learning_rate": 4.2323351381113206e-05, | |
| "loss": 0.0472, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.3225806451612903, | |
| "grad_norm": 0.2954395115375519, | |
| "learning_rate": 4.211781100301596e-05, | |
| "loss": 0.0645, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.335483870967742, | |
| "grad_norm": 0.5045889019966125, | |
| "learning_rate": 4.1910068794947404e-05, | |
| "loss": 0.056, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.3483870967741935, | |
| "grad_norm": 0.3853508532047272, | |
| "learning_rate": 4.170015147837444e-05, | |
| "loss": 0.0518, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.3612903225806452, | |
| "grad_norm": 0.27302005887031555, | |
| "learning_rate": 4.148808605454385e-05, | |
| "loss": 0.0487, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.3741935483870968, | |
| "grad_norm": 0.41000276803970337, | |
| "learning_rate": 4.127389980100915e-05, | |
| "loss": 0.0528, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.3870967741935485, | |
| "grad_norm": 0.3967251479625702, | |
| "learning_rate": 4.1057620268122004e-05, | |
| "loss": 0.0485, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.30169478058815, | |
| "learning_rate": 4.08392752754884e-05, | |
| "loss": 0.0485, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.4129032258064516, | |
| "grad_norm": 0.3874256908893585, | |
| "learning_rate": 4.0618892908390316e-05, | |
| "loss": 0.0514, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.4258064516129032, | |
| "grad_norm": 0.29030513763427734, | |
| "learning_rate": 4.039650151417316e-05, | |
| "loss": 0.0442, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.4387096774193548, | |
| "grad_norm": 0.3206923007965088, | |
| "learning_rate": 4.0172129698599484e-05, | |
| "loss": 0.052, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.4516129032258065, | |
| "grad_norm": 0.3883202075958252, | |
| "learning_rate": 3.994580632216952e-05, | |
| "loss": 0.057, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.4645161290322581, | |
| "grad_norm": 0.38505852222442627, | |
| "learning_rate": 3.971756049640888e-05, | |
| "loss": 0.0515, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.4774193548387098, | |
| "grad_norm": 0.27141115069389343, | |
| "learning_rate": 3.9487421580124e-05, | |
| "loss": 0.0524, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.4903225806451612, | |
| "grad_norm": 0.4098750054836273, | |
| "learning_rate": 3.92554191756258e-05, | |
| "loss": 0.0544, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.5032258064516129, | |
| "grad_norm": 0.3381025791168213, | |
| "learning_rate": 3.902158312492196e-05, | |
| "loss": 0.0468, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.5161290322580645, | |
| "grad_norm": 0.3622021973133087, | |
| "learning_rate": 3.878594350587844e-05, | |
| "loss": 0.0451, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.5290322580645161, | |
| "grad_norm": 0.31938228011131287, | |
| "learning_rate": 3.8548530628350623e-05, | |
| "loss": 0.0509, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.5419354838709678, | |
| "grad_norm": 0.3087027370929718, | |
| "learning_rate": 3.8309375030284566e-05, | |
| "loss": 0.0616, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.5548387096774192, | |
| "grad_norm": 0.36196205019950867, | |
| "learning_rate": 3.806850747378902e-05, | |
| "loss": 0.0514, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.567741935483871, | |
| "grad_norm": 0.31175723671913147, | |
| "learning_rate": 3.782595894117854e-05, | |
| "loss": 0.0478, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.5806451612903225, | |
| "grad_norm": 0.3437715768814087, | |
| "learning_rate": 3.758176063098829e-05, | |
| "loss": 0.0489, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.5935483870967742, | |
| "grad_norm": 0.3346206247806549, | |
| "learning_rate": 3.733594395396106e-05, | |
| "loss": 0.0516, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.6064516129032258, | |
| "grad_norm": 0.38583481311798096, | |
| "learning_rate": 3.7088540529006924e-05, | |
| "loss": 0.0461, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.6193548387096774, | |
| "grad_norm": 0.25379714369773865, | |
| "learning_rate": 3.68395821791362e-05, | |
| "loss": 0.0414, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.632258064516129, | |
| "grad_norm": 0.42217546701431274, | |
| "learning_rate": 3.65891009273661e-05, | |
| "loss": 0.0556, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.6451612903225805, | |
| "grad_norm": 0.36978331208229065, | |
| "learning_rate": 3.633712899260166e-05, | |
| "loss": 0.0575, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.6580645161290324, | |
| "grad_norm": 0.3139955401420593, | |
| "learning_rate": 3.608369878549154e-05, | |
| "loss": 0.0451, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.6709677419354838, | |
| "grad_norm": 0.3436100482940674, | |
| "learning_rate": 3.582884290425903e-05, | |
| "loss": 0.0522, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.6838709677419355, | |
| "grad_norm": 0.2942555248737335, | |
| "learning_rate": 3.557259413050907e-05, | |
| "loss": 0.049, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.696774193548387, | |
| "grad_norm": 0.32971078157424927, | |
| "learning_rate": 3.531498542501161e-05, | |
| "loss": 0.0495, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.7096774193548387, | |
| "grad_norm": 0.2842581272125244, | |
| "learning_rate": 3.505604992346187e-05, | |
| "loss": 0.0494, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.7225806451612904, | |
| "grad_norm": 0.37220752239227295, | |
| "learning_rate": 3.4795820932218235e-05, | |
| "loss": 0.0489, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.7354838709677418, | |
| "grad_norm": 0.3396855592727661, | |
| "learning_rate": 3.4534331924018095e-05, | |
| "loss": 0.058, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.7483870967741937, | |
| "grad_norm": 0.30913567543029785, | |
| "learning_rate": 3.427161653367231e-05, | |
| "loss": 0.0505, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.761290322580645, | |
| "grad_norm": 0.3557908535003662, | |
| "learning_rate": 3.4007708553738836e-05, | |
| "loss": 0.0454, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.7741935483870968, | |
| "grad_norm": 0.32114386558532715, | |
| "learning_rate": 3.3742641930176045e-05, | |
| "loss": 0.0462, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.7870967741935484, | |
| "grad_norm": 0.264426589012146, | |
| "learning_rate": 3.347645075797635e-05, | |
| "loss": 0.0436, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.3021955192089081, | |
| "learning_rate": 3.320916927678061e-05, | |
| "loss": 0.0386, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.8129032258064517, | |
| "grad_norm": 0.35488826036453247, | |
| "learning_rate": 3.2940831866473966e-05, | |
| "loss": 0.0462, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.8258064516129031, | |
| "grad_norm": 0.2876163125038147, | |
| "learning_rate": 3.2671473042763604e-05, | |
| "loss": 0.0545, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.838709677419355, | |
| "grad_norm": 0.3992978036403656, | |
| "learning_rate": 3.240112745273911e-05, | |
| "loss": 0.0434, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.8516129032258064, | |
| "grad_norm": 0.3140002489089966, | |
| "learning_rate": 3.212982987041582e-05, | |
| "loss": 0.0459, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.864516129032258, | |
| "grad_norm": 0.357721209526062, | |
| "learning_rate": 3.1857615192261984e-05, | |
| "loss": 0.0548, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.8774193548387097, | |
| "grad_norm": 0.32279306650161743, | |
| "learning_rate": 3.158451843271005e-05, | |
| "loss": 0.0593, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.8903225806451613, | |
| "grad_norm": 0.3754781484603882, | |
| "learning_rate": 3.131057471965283e-05, | |
| "loss": 0.0508, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.903225806451613, | |
| "grad_norm": 0.42713454365730286, | |
| "learning_rate": 3.10358192899251e-05, | |
| "loss": 0.0423, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.9161290322580644, | |
| "grad_norm": 0.28894898295402527, | |
| "learning_rate": 3.076028748477112e-05, | |
| "loss": 0.0442, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.9290322580645163, | |
| "grad_norm": 0.32115331292152405, | |
| "learning_rate": 3.0484014745298784e-05, | |
| "loss": 0.0571, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.9419354838709677, | |
| "grad_norm": 0.36180686950683594, | |
| "learning_rate": 3.0207036607920892e-05, | |
| "loss": 0.0498, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.9548387096774194, | |
| "grad_norm": 0.36315590143203735, | |
| "learning_rate": 2.9929388699784163e-05, | |
| "loss": 0.0548, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.967741935483871, | |
| "grad_norm": 0.24509944021701813, | |
| "learning_rate": 2.9651106734186618e-05, | |
| "loss": 0.0473, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.9806451612903224, | |
| "grad_norm": 0.30309274792671204, | |
| "learning_rate": 2.9372226505983802e-05, | |
| "loss": 0.0515, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.9935483870967743, | |
| "grad_norm": 0.31110507249832153, | |
| "learning_rate": 2.9092783886984615e-05, | |
| "loss": 0.049, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.5940459966659546, | |
| "learning_rate": 2.8812814821337147e-05, | |
| "loss": 0.0358, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 2.0129032258064514, | |
| "grad_norm": 0.23582255840301514, | |
| "learning_rate": 2.8532355320905274e-05, | |
| "loss": 0.0252, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 2.0258064516129033, | |
| "grad_norm": 0.2490931898355484, | |
| "learning_rate": 2.825144146063654e-05, | |
| "loss": 0.0242, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 2.0387096774193547, | |
| "grad_norm": 0.300824373960495, | |
| "learning_rate": 2.7970109373921878e-05, | |
| "loss": 0.0278, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 2.0516129032258066, | |
| "grad_norm": 0.2460525780916214, | |
| "learning_rate": 2.7688395247947842e-05, | |
| "loss": 0.0292, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.064516129032258, | |
| "grad_norm": 0.2941141724586487, | |
| "learning_rate": 2.740633531904196e-05, | |
| "loss": 0.0257, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 2.07741935483871, | |
| "grad_norm": 0.27139392495155334, | |
| "learning_rate": 2.712396586801169e-05, | |
| "loss": 0.0246, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 2.0903225806451613, | |
| "grad_norm": 0.3047543168067932, | |
| "learning_rate": 2.6841323215477714e-05, | |
| "loss": 0.0313, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 2.1032258064516127, | |
| "grad_norm": 0.3697696924209595, | |
| "learning_rate": 2.6558443717202076e-05, | |
| "loss": 0.0311, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 2.1161290322580646, | |
| "grad_norm": 0.4939499497413635, | |
| "learning_rate": 2.6275363759411815e-05, | |
| "loss": 0.036, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 2.129032258064516, | |
| "grad_norm": 0.2502232789993286, | |
| "learning_rate": 2.599211975411867e-05, | |
| "loss": 0.0194, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 2.141935483870968, | |
| "grad_norm": 0.2647090554237366, | |
| "learning_rate": 2.57087481344355e-05, | |
| "loss": 0.0259, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 2.1548387096774193, | |
| "grad_norm": 0.2818278968334198, | |
| "learning_rate": 2.5425285349889938e-05, | |
| "loss": 0.0239, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 2.167741935483871, | |
| "grad_norm": 0.28670820593833923, | |
| "learning_rate": 2.5141767861735976e-05, | |
| "loss": 0.0248, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 2.1806451612903226, | |
| "grad_norm": 0.2521524727344513, | |
| "learning_rate": 2.485823213826403e-05, | |
| "loss": 0.0259, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.193548387096774, | |
| "grad_norm": 0.2689652740955353, | |
| "learning_rate": 2.457471465011007e-05, | |
| "loss": 0.026, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 2.206451612903226, | |
| "grad_norm": 0.24064841866493225, | |
| "learning_rate": 2.4291251865564505e-05, | |
| "loss": 0.0252, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 2.2193548387096773, | |
| "grad_norm": 0.26753664016723633, | |
| "learning_rate": 2.400788024588134e-05, | |
| "loss": 0.0236, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 2.232258064516129, | |
| "grad_norm": 0.30342286825180054, | |
| "learning_rate": 2.3724636240588194e-05, | |
| "loss": 0.0346, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 2.2451612903225806, | |
| "grad_norm": 0.36332225799560547, | |
| "learning_rate": 2.3441556282797934e-05, | |
| "loss": 0.0351, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 2.258064516129032, | |
| "grad_norm": 0.2872660458087921, | |
| "learning_rate": 2.315867678452229e-05, | |
| "loss": 0.0257, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 2.270967741935484, | |
| "grad_norm": 0.33576029539108276, | |
| "learning_rate": 2.287603413198831e-05, | |
| "loss": 0.0307, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 2.2838709677419353, | |
| "grad_norm": 0.42973652482032776, | |
| "learning_rate": 2.2593664680958045e-05, | |
| "loss": 0.0325, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 2.296774193548387, | |
| "grad_norm": 0.28366559743881226, | |
| "learning_rate": 2.231160475205216e-05, | |
| "loss": 0.0279, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 2.3096774193548386, | |
| "grad_norm": 0.23520596325397491, | |
| "learning_rate": 2.202989062607813e-05, | |
| "loss": 0.0242, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.3225806451612905, | |
| "grad_norm": 0.23934811353683472, | |
| "learning_rate": 2.1748558539363458e-05, | |
| "loss": 0.0213, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 2.335483870967742, | |
| "grad_norm": 0.2189173400402069, | |
| "learning_rate": 2.1467644679094728e-05, | |
| "loss": 0.0202, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 2.3483870967741938, | |
| "grad_norm": 0.42822569608688354, | |
| "learning_rate": 2.118718517866286e-05, | |
| "loss": 0.0361, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 2.361290322580645, | |
| "grad_norm": 0.2864365875720978, | |
| "learning_rate": 2.0907216113015397e-05, | |
| "loss": 0.0303, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 2.3741935483870966, | |
| "grad_norm": 0.24325411021709442, | |
| "learning_rate": 2.0627773494016207e-05, | |
| "loss": 0.0265, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 2.3870967741935485, | |
| "grad_norm": 0.23324590921401978, | |
| "learning_rate": 2.0348893265813394e-05, | |
| "loss": 0.0233, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.2864069640636444, | |
| "learning_rate": 2.0070611300215843e-05, | |
| "loss": 0.0259, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 2.412903225806452, | |
| "grad_norm": 0.2672640085220337, | |
| "learning_rate": 1.979296339207912e-05, | |
| "loss": 0.0267, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 2.425806451612903, | |
| "grad_norm": 0.22086627781391144, | |
| "learning_rate": 1.951598525470122e-05, | |
| "loss": 0.02, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 2.4387096774193546, | |
| "grad_norm": 0.24434693157672882, | |
| "learning_rate": 1.923971251522888e-05, | |
| "loss": 0.0235, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.4516129032258065, | |
| "grad_norm": 0.2901481091976166, | |
| "learning_rate": 1.8964180710074905e-05, | |
| "loss": 0.0242, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 2.464516129032258, | |
| "grad_norm": 0.2797795236110687, | |
| "learning_rate": 1.8689425280347166e-05, | |
| "loss": 0.0246, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 2.47741935483871, | |
| "grad_norm": 0.22296951711177826, | |
| "learning_rate": 1.8415481567289954e-05, | |
| "loss": 0.0223, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 2.490322580645161, | |
| "grad_norm": 0.2699308395385742, | |
| "learning_rate": 1.8142384807738022e-05, | |
| "loss": 0.027, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 2.5032258064516126, | |
| "grad_norm": 0.23768910765647888, | |
| "learning_rate": 1.787017012958419e-05, | |
| "loss": 0.0241, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 2.5161290322580645, | |
| "grad_norm": 0.2236274927854538, | |
| "learning_rate": 1.75988725472609e-05, | |
| "loss": 0.0241, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 2.5290322580645164, | |
| "grad_norm": 0.31210729479789734, | |
| "learning_rate": 1.7328526957236406e-05, | |
| "loss": 0.0302, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 2.541935483870968, | |
| "grad_norm": 0.22504620254039764, | |
| "learning_rate": 1.7059168133526043e-05, | |
| "loss": 0.0236, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 2.554838709677419, | |
| "grad_norm": 0.24498535692691803, | |
| "learning_rate": 1.6790830723219398e-05, | |
| "loss": 0.0228, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 2.567741935483871, | |
| "grad_norm": 0.2544174790382385, | |
| "learning_rate": 1.6523549242023655e-05, | |
| "loss": 0.027, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.5806451612903225, | |
| "grad_norm": 0.2597277760505676, | |
| "learning_rate": 1.6257358069823965e-05, | |
| "loss": 0.0304, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 2.5935483870967744, | |
| "grad_norm": 0.2677987515926361, | |
| "learning_rate": 1.5992291446261166e-05, | |
| "loss": 0.0254, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 2.606451612903226, | |
| "grad_norm": 0.27171266078948975, | |
| "learning_rate": 1.5728383466327684e-05, | |
| "loss": 0.0222, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 2.6193548387096772, | |
| "grad_norm": 0.2346351146697998, | |
| "learning_rate": 1.5465668075981904e-05, | |
| "loss": 0.0219, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 2.632258064516129, | |
| "grad_norm": 0.25438180565834045, | |
| "learning_rate": 1.520417906778176e-05, | |
| "loss": 0.0264, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 2.6451612903225805, | |
| "grad_norm": 0.2690618336200714, | |
| "learning_rate": 1.4943950076538135e-05, | |
| "loss": 0.0237, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 2.6580645161290324, | |
| "grad_norm": 0.29505690932273865, | |
| "learning_rate": 1.4685014574988393e-05, | |
| "loss": 0.0313, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 2.670967741935484, | |
| "grad_norm": 0.26153627038002014, | |
| "learning_rate": 1.4427405869490923e-05, | |
| "loss": 0.0236, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 2.6838709677419352, | |
| "grad_norm": 0.2462136447429657, | |
| "learning_rate": 1.4171157095740976e-05, | |
| "loss": 0.027, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 2.696774193548387, | |
| "grad_norm": 0.33909931778907776, | |
| "learning_rate": 1.391630121450847e-05, | |
| "loss": 0.0257, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.709677419354839, | |
| "grad_norm": 0.2736709415912628, | |
| "learning_rate": 1.3662871007398348e-05, | |
| "loss": 0.0274, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 2.7225806451612904, | |
| "grad_norm": 0.3460533022880554, | |
| "learning_rate": 1.3410899072633915e-05, | |
| "loss": 0.0292, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 2.735483870967742, | |
| "grad_norm": 0.28637826442718506, | |
| "learning_rate": 1.3160417820863807e-05, | |
| "loss": 0.0249, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.7483870967741937, | |
| "grad_norm": 0.21176108717918396, | |
| "learning_rate": 1.2911459470993085e-05, | |
| "loss": 0.019, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 2.761290322580645, | |
| "grad_norm": 0.22695419192314148, | |
| "learning_rate": 1.2664056046038942e-05, | |
| "loss": 0.0248, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.774193548387097, | |
| "grad_norm": 0.3826678693294525, | |
| "learning_rate": 1.2418239369011712e-05, | |
| "loss": 0.0206, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.7870967741935484, | |
| "grad_norm": 0.21120679378509521, | |
| "learning_rate": 1.217404105882147e-05, | |
| "loss": 0.02, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.266303151845932, | |
| "learning_rate": 1.1931492526210988e-05, | |
| "loss": 0.0217, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.8129032258064517, | |
| "grad_norm": 0.2666541635990143, | |
| "learning_rate": 1.1690624969715441e-05, | |
| "loss": 0.0248, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.825806451612903, | |
| "grad_norm": 0.2325655072927475, | |
| "learning_rate": 1.145146937164938e-05, | |
| "loss": 0.0226, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.838709677419355, | |
| "grad_norm": 0.277413934469223, | |
| "learning_rate": 1.121405649412156e-05, | |
| "loss": 0.0272, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.8516129032258064, | |
| "grad_norm": 0.23814360797405243, | |
| "learning_rate": 1.0978416875078042e-05, | |
| "loss": 0.0195, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 2.864516129032258, | |
| "grad_norm": 0.3257719576358795, | |
| "learning_rate": 1.0744580824374217e-05, | |
| "loss": 0.0336, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.8774193548387097, | |
| "grad_norm": 0.24910488724708557, | |
| "learning_rate": 1.0512578419876004e-05, | |
| "loss": 0.0236, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 2.8903225806451616, | |
| "grad_norm": 0.27725309133529663, | |
| "learning_rate": 1.0282439503591135e-05, | |
| "loss": 0.0272, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.903225806451613, | |
| "grad_norm": 0.23216702044010162, | |
| "learning_rate": 1.0054193677830481e-05, | |
| "loss": 0.0244, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.9161290322580644, | |
| "grad_norm": 0.27499204874038696, | |
| "learning_rate": 9.827870301400527e-06, | |
| "loss": 0.0214, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 2.9290322580645163, | |
| "grad_norm": 0.2503328323364258, | |
| "learning_rate": 9.603498485826848e-06, | |
| "loss": 0.0223, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 2.9419354838709677, | |
| "grad_norm": 0.23868584632873535, | |
| "learning_rate": 9.38110709160969e-06, | |
| "loss": 0.0227, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 2.9548387096774196, | |
| "grad_norm": 0.3298948407173157, | |
| "learning_rate": 9.160724724511608e-06, | |
| "loss": 0.0288, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.967741935483871, | |
| "grad_norm": 0.23431891202926636, | |
| "learning_rate": 8.942379731877992e-06, | |
| "loss": 0.023, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 2.9806451612903224, | |
| "grad_norm": 0.2671021819114685, | |
| "learning_rate": 8.726100198990853e-06, | |
| "loss": 0.0213, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 2.9935483870967743, | |
| "grad_norm": 0.29010385274887085, | |
| "learning_rate": 8.51191394545615e-06, | |
| "loss": 0.0314, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.29010385274887085, | |
| "learning_rate": 8.299848521625563e-06, | |
| "loss": 0.0159, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 3.0129032258064514, | |
| "grad_norm": 0.28189539909362793, | |
| "learning_rate": 8.089931205052597e-06, | |
| "loss": 0.0106, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 3.0258064516129033, | |
| "grad_norm": 0.1651533842086792, | |
| "learning_rate": 7.882188996984046e-06, | |
| "loss": 0.0122, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 3.0387096774193547, | |
| "grad_norm": 0.1792869120836258, | |
| "learning_rate": 7.676648618886798e-06, | |
| "loss": 0.0129, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 3.0516129032258066, | |
| "grad_norm": 0.16915558278560638, | |
| "learning_rate": 7.473336509010742e-06, | |
| "loss": 0.0099, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 3.064516129032258, | |
| "grad_norm": 0.15584175288677216, | |
| "learning_rate": 7.2722788189880545e-06, | |
| "loss": 0.0123, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 3.07741935483871, | |
| "grad_norm": 0.1726021021604538, | |
| "learning_rate": 7.073501410469371e-06, | |
| "loss": 0.0114, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 3.0903225806451613, | |
| "grad_norm": 0.15609051287174225, | |
| "learning_rate": 6.877029851797265e-06, | |
| "loss": 0.0091, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 3.1032258064516127, | |
| "grad_norm": 0.14097200334072113, | |
| "learning_rate": 6.6828894147174165e-06, | |
| "loss": 0.0084, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 3.1161290322580646, | |
| "grad_norm": 0.17083188891410828, | |
| "learning_rate": 6.491105071127984e-06, | |
| "loss": 0.0118, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 3.129032258064516, | |
| "grad_norm": 0.18476150929927826, | |
| "learning_rate": 6.3017014898674955e-06, | |
| "loss": 0.0079, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 3.141935483870968, | |
| "grad_norm": 0.22733564674854279, | |
| "learning_rate": 6.114703033541783e-06, | |
| "loss": 0.013, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 3.1548387096774193, | |
| "grad_norm": 0.15913838148117065, | |
| "learning_rate": 5.930133755390216e-06, | |
| "loss": 0.0068, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 3.167741935483871, | |
| "grad_norm": 0.1369076371192932, | |
| "learning_rate": 5.7480173961918266e-06, | |
| "loss": 0.0077, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 3.1806451612903226, | |
| "grad_norm": 0.2318694293498993, | |
| "learning_rate": 5.568377381211548e-06, | |
| "loss": 0.0095, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 3.193548387096774, | |
| "grad_norm": 0.21019090712070465, | |
| "learning_rate": 5.391236817187095e-06, | |
| "loss": 0.0095, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 3.206451612903226, | |
| "grad_norm": 0.1967868059873581, | |
| "learning_rate": 5.216618489356773e-06, | |
| "loss": 0.0111, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 3.2193548387096773, | |
| "grad_norm": 0.24731586873531342, | |
| "learning_rate": 5.0445448585286684e-06, | |
| "loss": 0.0072, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 3.232258064516129, | |
| "grad_norm": 0.19581767916679382, | |
| "learning_rate": 4.87503805819155e-06, | |
| "loss": 0.0055, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 3.2451612903225806, | |
| "grad_norm": 0.1791073977947235, | |
| "learning_rate": 4.708119891667892e-06, | |
| "loss": 0.0088, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 3.258064516129032, | |
| "grad_norm": 0.25945040583610535, | |
| "learning_rate": 4.5438118293093416e-06, | |
| "loss": 0.0071, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 3.270967741935484, | |
| "grad_norm": 0.21385855972766876, | |
| "learning_rate": 4.382135005735072e-06, | |
| "loss": 0.0173, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 3.2838709677419353, | |
| "grad_norm": 0.5789936780929565, | |
| "learning_rate": 4.223110217113191e-06, | |
| "loss": 0.0079, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 3.296774193548387, | |
| "grad_norm": 0.23074735701084137, | |
| "learning_rate": 4.066757918485886e-06, | |
| "loss": 0.0069, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 3.3096774193548386, | |
| "grad_norm": 0.20934291183948517, | |
| "learning_rate": 3.913098221138237e-06, | |
| "loss": 0.0088, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 3.3225806451612905, | |
| "grad_norm": 0.2551226019859314, | |
| "learning_rate": 3.762150890011387e-06, | |
| "loss": 0.0104, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 3.335483870967742, | |
| "grad_norm": 0.22543351352214813, | |
| "learning_rate": 3.6139353411602154e-06, | |
| "loss": 0.009, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 3.3483870967741938, | |
| "grad_norm": 0.22513934969902039, | |
| "learning_rate": 3.468470639255872e-06, | |
| "loss": 0.0103, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 3.361290322580645, | |
| "grad_norm": 0.23426640033721924, | |
| "learning_rate": 3.325775495133546e-06, | |
| "loss": 0.0086, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 3.3741935483870966, | |
| "grad_norm": 0.17604398727416992, | |
| "learning_rate": 3.1858682633857105e-06, | |
| "loss": 0.0118, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 3.3870967741935485, | |
| "grad_norm": 0.22662527859210968, | |
| "learning_rate": 3.0487669400012382e-06, | |
| "loss": 0.0089, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 0.1959221363067627, | |
| "learning_rate": 2.914489160050554e-06, | |
| "loss": 0.0101, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 3.412903225806452, | |
| "grad_norm": 0.20400796830654144, | |
| "learning_rate": 2.7830521954173543e-06, | |
| "loss": 0.0088, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 3.425806451612903, | |
| "grad_norm": 0.21109241247177124, | |
| "learning_rate": 2.654472952576864e-06, | |
| "loss": 0.0083, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 3.4387096774193546, | |
| "grad_norm": 0.20984111726284027, | |
| "learning_rate": 2.5287679704212836e-06, | |
| "loss": 0.0091, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 3.4516129032258065, | |
| "grad_norm": 0.22513136267662048, | |
| "learning_rate": 2.4059534181323588e-06, | |
| "loss": 0.0102, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 3.464516129032258, | |
| "grad_norm": 0.23501525819301605, | |
| "learning_rate": 2.2860450931015853e-06, | |
| "loss": 0.0095, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.47741935483871, | |
| "grad_norm": 0.19732145965099335, | |
| "learning_rate": 2.1690584188982592e-06, | |
| "loss": 0.0087, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 3.490322580645161, | |
| "grad_norm": 0.19582946598529816, | |
| "learning_rate": 2.0550084432855056e-06, | |
| "loss": 0.0074, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 3.5032258064516126, | |
| "grad_norm": 0.19379809498786926, | |
| "learning_rate": 1.9439098362847825e-06, | |
| "loss": 0.008, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 3.5161290322580645, | |
| "grad_norm": 0.20334380865097046, | |
| "learning_rate": 1.835776888288851e-06, | |
| "loss": 0.0104, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 3.5290322580645164, | |
| "grad_norm": 0.1963813155889511, | |
| "learning_rate": 1.730623508223686e-06, | |
| "loss": 0.0114, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 3.541935483870968, | |
| "grad_norm": 0.2684271037578583, | |
| "learning_rate": 1.62846322175933e-06, | |
| "loss": 0.0106, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 3.554838709677419, | |
| "grad_norm": 0.19259046018123627, | |
| "learning_rate": 1.5293091695701839e-06, | |
| "loss": 0.0083, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 3.567741935483871, | |
| "grad_norm": 0.19738364219665527, | |
| "learning_rate": 1.4331741056446968e-06, | |
| "loss": 0.0166, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 3.5806451612903225, | |
| "grad_norm": 0.22224943339824677, | |
| "learning_rate": 1.3400703956448684e-06, | |
| "loss": 0.0074, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 3.5935483870967744, | |
| "grad_norm": 0.23735593259334564, | |
| "learning_rate": 1.2500100153156779e-06, | |
| "loss": 0.0091, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.606451612903226, | |
| "grad_norm": 0.1716444343328476, | |
| "learning_rate": 1.1630045489446572e-06, | |
| "loss": 0.0078, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 3.6193548387096772, | |
| "grad_norm": 0.1820307821035385, | |
| "learning_rate": 1.079065187871839e-06, | |
| "loss": 0.009, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 3.632258064516129, | |
| "grad_norm": 0.1972561627626419, | |
| "learning_rate": 9.982027290502238e-07, | |
| "loss": 0.0122, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 3.6451612903225805, | |
| "grad_norm": 0.21676687896251678, | |
| "learning_rate": 9.204275736570151e-07, | |
| "loss": 0.0082, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 3.6580645161290324, | |
| "grad_norm": 0.1852135956287384, | |
| "learning_rate": 8.45749725755704e-07, | |
| "loss": 0.0078, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 3.670967741935484, | |
| "grad_norm": 0.15526923537254333, | |
| "learning_rate": 7.741787910092807e-07, | |
| "loss": 0.0058, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 3.6838709677419352, | |
| "grad_norm": 0.25308313965797424, | |
| "learning_rate": 7.05723975444686e-07, | |
| "loss": 0.0072, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 3.696774193548387, | |
| "grad_norm": 0.16028578579425812, | |
| "learning_rate": 6.403940842686474e-07, | |
| "loss": 0.0086, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 3.709677419354839, | |
| "grad_norm": 0.164954274892807, | |
| "learning_rate": 5.781975207350826e-07, | |
| "loss": 0.0074, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 3.7225806451612904, | |
| "grad_norm": 0.21616578102111816, | |
| "learning_rate": 5.191422850642113e-07, | |
| "loss": 0.0111, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.735483870967742, | |
| "grad_norm": 0.23164188861846924, | |
| "learning_rate": 4.6323597341350054e-07, | |
| "loss": 0.0093, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 3.7483870967741937, | |
| "grad_norm": 0.2091163843870163, | |
| "learning_rate": 4.1048577690058766e-07, | |
| "loss": 0.008, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 3.761290322580645, | |
| "grad_norm": 0.17184507846832275, | |
| "learning_rate": 3.608984806782928e-07, | |
| "loss": 0.0089, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 3.774193548387097, | |
| "grad_norm": 0.2208150029182434, | |
| "learning_rate": 3.144804630618725e-07, | |
| "loss": 0.0098, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 3.7870967741935484, | |
| "grad_norm": 0.18591627478599548, | |
| "learning_rate": 2.71237694708576e-07, | |
| "loss": 0.0091, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.22179189324378967, | |
| "learning_rate": 2.3117573784966206e-07, | |
| "loss": 0.0096, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 3.8129032258064517, | |
| "grad_norm": 0.247935950756073, | |
| "learning_rate": 1.9429974557493014e-07, | |
| "loss": 0.0075, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 3.825806451612903, | |
| "grad_norm": 0.19308875501155853, | |
| "learning_rate": 1.6061446116990008e-07, | |
| "loss": 0.0097, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 3.838709677419355, | |
| "grad_norm": 0.28245624899864197, | |
| "learning_rate": 1.3012421750568105e-07, | |
| "loss": 0.0105, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 3.8516129032258064, | |
| "grad_norm": 0.15781083703041077, | |
| "learning_rate": 1.0283293648165605e-07, | |
| "loss": 0.0087, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.864516129032258, | |
| "grad_norm": 0.20132358372211456, | |
| "learning_rate": 7.874412852099944e-08, | |
| "loss": 0.0081, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 3.8774193548387097, | |
| "grad_norm": 0.19697970151901245, | |
| "learning_rate": 5.786089211915202e-08, | |
| "loss": 0.0116, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 3.8903225806451616, | |
| "grad_norm": 0.189448282122612, | |
| "learning_rate": 4.018591344526479e-08, | |
| "loss": 0.0079, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 3.903225806451613, | |
| "grad_norm": 0.1814439296722412, | |
| "learning_rate": 2.5721465996675355e-08, | |
| "loss": 0.0061, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 3.9161290322580644, | |
| "grad_norm": 0.17018909752368927, | |
| "learning_rate": 1.4469410306480746e-08, | |
| "loss": 0.0074, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 3.9290322580645163, | |
| "grad_norm": 0.2005801647901535, | |
| "learning_rate": 6.431193704217741e-09, | |
| "loss": 0.0084, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 3.9419354838709677, | |
| "grad_norm": 0.2100239396095276, | |
| "learning_rate": 1.6078501296951099e-09, | |
| "loss": 0.0118, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 3.9548387096774196, | |
| "grad_norm": 0.17311210930347443, | |
| "learning_rate": 0.0, | |
| "loss": 0.0059, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 3.9548387096774196, | |
| "step": 308, | |
| "total_flos": 5.413829559357276e+17, | |
| "train_loss": 0.04737277891541311, | |
| "train_runtime": 1301.7465, | |
| "train_samples_per_second": 15.216, | |
| "train_steps_per_second": 0.237 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 308, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.413829559357276e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |