{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8666183924692252, "eval_steps": 50, "global_step": 187, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004634322954380883, "grad_norm": 1.1059749126434326, "learning_rate": 7.499999999999999e-06, "loss": 2.7798, "step": 1 }, { "epoch": 0.004634322954380883, "eval_loss": 2.9304847717285156, "eval_runtime": 146.6617, "eval_samples_per_second": 4.957, "eval_steps_per_second": 2.482, "step": 1 }, { "epoch": 0.009268645908761766, "grad_norm": 1.1082885265350342, "learning_rate": 1.4999999999999999e-05, "loss": 2.681, "step": 2 }, { "epoch": 0.013902968863142651, "grad_norm": 1.2035174369812012, "learning_rate": 2.2499999999999998e-05, "loss": 2.9068, "step": 3 }, { "epoch": 0.018537291817523532, "grad_norm": 1.1457459926605225, "learning_rate": 2.9999999999999997e-05, "loss": 2.9637, "step": 4 }, { "epoch": 0.023171614771904415, "grad_norm": 1.1010493040084839, "learning_rate": 3.75e-05, "loss": 2.7247, "step": 5 }, { "epoch": 0.027805937726285302, "grad_norm": 1.2098731994628906, "learning_rate": 4.4999999999999996e-05, "loss": 2.8731, "step": 6 }, { "epoch": 0.03244026068066618, "grad_norm": 1.0666383504867554, "learning_rate": 5.2499999999999995e-05, "loss": 2.6556, "step": 7 }, { "epoch": 0.037074583635047065, "grad_norm": 0.8229893445968628, "learning_rate": 5.9999999999999995e-05, "loss": 2.5, "step": 8 }, { "epoch": 0.04170890658942795, "grad_norm": 1.1555801630020142, "learning_rate": 6.75e-05, "loss": 2.663, "step": 9 }, { "epoch": 0.04634322954380883, "grad_norm": 1.2436870336532593, "learning_rate": 7.5e-05, "loss": 2.3639, "step": 10 }, { "epoch": 0.05097755249818972, "grad_norm": 1.1677322387695312, "learning_rate": 8.25e-05, "loss": 2.3472, "step": 11 }, { "epoch": 0.055611875452570604, "grad_norm": 1.1217212677001953, "learning_rate": 8.999999999999999e-05, "loss": 2.2676, "step": 12 }, { "epoch": 0.06024619840695149, "grad_norm": 0.7882691621780396, "learning_rate": 9.75e-05, "loss": 2.1962, "step": 13 }, { "epoch": 0.06488052136133236, "grad_norm": 0.8494845032691956, "learning_rate": 0.00010499999999999999, "loss": 2.1433, "step": 14 }, { "epoch": 0.06951484431571325, "grad_norm": 0.980268120765686, "learning_rate": 0.0001125, "loss": 2.152, "step": 15 }, { "epoch": 0.07414916727009413, "grad_norm": 0.9087497591972351, "learning_rate": 0.00011999999999999999, "loss": 2.0726, "step": 16 }, { "epoch": 0.07878349022447502, "grad_norm": 0.6476942300796509, "learning_rate": 0.00012749999999999998, "loss": 1.9891, "step": 17 }, { "epoch": 0.0834178131788559, "grad_norm": 0.7011496424674988, "learning_rate": 0.000135, "loss": 1.9417, "step": 18 }, { "epoch": 0.08805213613323679, "grad_norm": 0.7312366962432861, "learning_rate": 0.0001425, "loss": 2.0363, "step": 19 }, { "epoch": 0.09268645908761766, "grad_norm": 0.6832443475723267, "learning_rate": 0.00015, "loss": 1.9493, "step": 20 }, { "epoch": 0.09732078204199855, "grad_norm": 0.5798928737640381, "learning_rate": 0.00014998857713672935, "loss": 1.9732, "step": 21 }, { "epoch": 0.10195510499637944, "grad_norm": 0.5472275018692017, "learning_rate": 0.00014995431202643217, "loss": 1.8398, "step": 22 }, { "epoch": 0.10658942795076032, "grad_norm": 0.7329992055892944, "learning_rate": 0.000149897215106593, "loss": 1.8098, "step": 23 }, { "epoch": 0.11122375090514121, "grad_norm": 0.6771075129508972, "learning_rate": 0.0001498173037694868, "loss": 1.7746, "step": 24 }, { "epoch": 0.11585807385952208, "grad_norm": 0.8174684643745422, "learning_rate": 0.0001497146023568809, "loss": 1.7504, "step": 25 }, { "epoch": 0.12049239681390297, "grad_norm": 0.6408036351203918, "learning_rate": 0.00014958914215262048, "loss": 1.7606, "step": 26 }, { "epoch": 0.12512671976828385, "grad_norm": 0.7159773111343384, "learning_rate": 0.00014944096137309914, "loss": 1.7529, "step": 27 }, { "epoch": 0.12976104272266473, "grad_norm": 0.8010567426681519, "learning_rate": 0.00014927010515561776, "loss": 1.9922, "step": 28 }, { "epoch": 0.13439536567704563, "grad_norm": 0.6913322806358337, "learning_rate": 0.00014907662554463532, "loss": 1.6162, "step": 29 }, { "epoch": 0.1390296886314265, "grad_norm": 0.6494315266609192, "learning_rate": 0.0001488605814759156, "loss": 1.6571, "step": 30 }, { "epoch": 0.14366401158580738, "grad_norm": 0.6595631241798401, "learning_rate": 0.00014862203875857477, "loss": 1.5886, "step": 31 }, { "epoch": 0.14829833454018826, "grad_norm": 0.7777145504951477, "learning_rate": 0.0001483610700550354, "loss": 1.5729, "step": 32 }, { "epoch": 0.15293265749456916, "grad_norm": 0.7189599871635437, "learning_rate": 0.00014807775485889264, "loss": 1.6492, "step": 33 }, { "epoch": 0.15756698044895004, "grad_norm": 0.7974843978881836, "learning_rate": 0.0001477721794706997, "loss": 1.5415, "step": 34 }, { "epoch": 0.16220130340333092, "grad_norm": 0.8167005181312561, "learning_rate": 0.0001474444369716801, "loss": 1.4696, "step": 35 }, { "epoch": 0.1668356263577118, "grad_norm": 0.7308511137962341, "learning_rate": 0.0001470946271953739, "loss": 1.6394, "step": 36 }, { "epoch": 0.1714699493120927, "grad_norm": 0.808505654335022, "learning_rate": 0.00014672285669722765, "loss": 1.4614, "step": 37 }, { "epoch": 0.17610427226647357, "grad_norm": 0.7852684259414673, "learning_rate": 0.00014632923872213652, "loss": 1.4592, "step": 38 }, { "epoch": 0.18073859522085445, "grad_norm": 0.7765493988990784, "learning_rate": 0.00014591389316994876, "loss": 1.3337, "step": 39 }, { "epoch": 0.18537291817523532, "grad_norm": 0.9292432069778442, "learning_rate": 0.0001454769465589431, "loss": 1.4594, "step": 40 }, { "epoch": 0.19000724112961623, "grad_norm": 0.9222384095191956, "learning_rate": 0.00014501853198729012, "loss": 1.5136, "step": 41 }, { "epoch": 0.1946415640839971, "grad_norm": 0.8896085619926453, "learning_rate": 0.00014453878909250904, "loss": 1.4321, "step": 42 }, { "epoch": 0.19927588703837798, "grad_norm": 0.9805691242218018, "learning_rate": 0.00014403786400893302, "loss": 1.2422, "step": 43 }, { "epoch": 0.20391020999275888, "grad_norm": 0.9406644701957703, "learning_rate": 0.00014351590932319504, "loss": 1.3904, "step": 44 }, { "epoch": 0.20854453294713976, "grad_norm": 1.2533518075942993, "learning_rate": 0.00014297308402774875, "loss": 1.2121, "step": 45 }, { "epoch": 0.21317885590152064, "grad_norm": 1.020983338356018, "learning_rate": 0.0001424095534724375, "loss": 1.148, "step": 46 }, { "epoch": 0.2178131788559015, "grad_norm": 0.895693302154541, "learning_rate": 0.00014182548931412757, "loss": 1.3044, "step": 47 }, { "epoch": 0.22244750181028242, "grad_norm": 1.1574026346206665, "learning_rate": 0.0001412210694644195, "loss": 1.2857, "step": 48 }, { "epoch": 0.2270818247646633, "grad_norm": 1.032139539718628, "learning_rate": 0.00014059647803545467, "loss": 1.2987, "step": 49 }, { "epoch": 0.23171614771904417, "grad_norm": 0.9623427391052246, "learning_rate": 0.0001399519052838329, "loss": 1.1087, "step": 50 }, { "epoch": 0.23171614771904417, "eval_loss": 1.1839566230773926, "eval_runtime": 146.108, "eval_samples_per_second": 4.976, "eval_steps_per_second": 2.491, "step": 50 }, { "epoch": 0.23635047067342504, "grad_norm": 1.263940691947937, "learning_rate": 0.00013928754755265842, "loss": 1.0973, "step": 51 }, { "epoch": 0.24098479362780595, "grad_norm": 1.0416457653045654, "learning_rate": 0.00013860360721173193, "loss": 1.1655, "step": 52 }, { "epoch": 0.24561911658218682, "grad_norm": 1.1260313987731934, "learning_rate": 0.0001379002925959068, "loss": 1.0404, "step": 53 }, { "epoch": 0.2502534395365677, "grad_norm": 0.9762699604034424, "learning_rate": 0.0001371778179416281, "loss": 1.0744, "step": 54 }, { "epoch": 0.2548877624909486, "grad_norm": 1.2078801393508911, "learning_rate": 0.00013643640332167438, "loss": 1.0079, "step": 55 }, { "epoch": 0.25952208544532945, "grad_norm": 1.075649380683899, "learning_rate": 0.00013567627457812106, "loss": 1.0864, "step": 56 }, { "epoch": 0.26415640839971033, "grad_norm": 1.1446782350540161, "learning_rate": 0.00013489766325354695, "loss": 1.0853, "step": 57 }, { "epoch": 0.26879073135409126, "grad_norm": 1.1303826570510864, "learning_rate": 0.00013410080652050412, "loss": 1.0711, "step": 58 }, { "epoch": 0.27342505430847214, "grad_norm": 1.216537356376648, "learning_rate": 0.0001332859471092728, "loss": 0.9193, "step": 59 }, { "epoch": 0.278059377262853, "grad_norm": 1.4602874517440796, "learning_rate": 0.00013245333323392333, "loss": 0.9987, "step": 60 }, { "epoch": 0.2826937002172339, "grad_norm": 1.0730090141296387, "learning_rate": 0.0001316032185167079, "loss": 1.0272, "step": 61 }, { "epoch": 0.28732802317161477, "grad_norm": 1.0437711477279663, "learning_rate": 0.00013073586191080457, "loss": 1.0511, "step": 62 }, { "epoch": 0.29196234612599564, "grad_norm": 1.0983786582946777, "learning_rate": 0.00012985152762143778, "loss": 0.9506, "step": 63 }, { "epoch": 0.2965966690803765, "grad_norm": 1.3723191022872925, "learning_rate": 0.00012895048502539882, "loss": 1.0643, "step": 64 }, { "epoch": 0.30123099203475745, "grad_norm": 1.2514058351516724, "learning_rate": 0.00012803300858899104, "loss": 1.0204, "step": 65 }, { "epoch": 0.3058653149891383, "grad_norm": 1.038238525390625, "learning_rate": 0.0001270993777844248, "loss": 0.7876, "step": 66 }, { "epoch": 0.3104996379435192, "grad_norm": 1.313706874847412, "learning_rate": 0.0001261498770046874, "loss": 1.0337, "step": 67 }, { "epoch": 0.3151339608979001, "grad_norm": 1.1323295831680298, "learning_rate": 0.00012518479547691435, "loss": 0.8007, "step": 68 }, { "epoch": 0.31976828385228095, "grad_norm": 1.0297540426254272, "learning_rate": 0.00012420442717428804, "loss": 0.9792, "step": 69 }, { "epoch": 0.32440260680666183, "grad_norm": 1.201749563217163, "learning_rate": 0.00012320907072649044, "loss": 1.0579, "step": 70 }, { "epoch": 0.3290369297610427, "grad_norm": 1.0487987995147705, "learning_rate": 0.0001221990293287378, "loss": 0.8679, "step": 71 }, { "epoch": 0.3336712527154236, "grad_norm": 0.9669104814529419, "learning_rate": 0.00012117461064942435, "loss": 0.9195, "step": 72 }, { "epoch": 0.3383055756698045, "grad_norm": 1.1065351963043213, "learning_rate": 0.00012013612673640363, "loss": 0.9479, "step": 73 }, { "epoch": 0.3429398986241854, "grad_norm": 1.1072312593460083, "learning_rate": 0.00011908389392193547, "loss": 0.8467, "step": 74 }, { "epoch": 0.34757422157856627, "grad_norm": 1.1270943880081177, "learning_rate": 0.00011801823272632844, "loss": 0.7588, "step": 75 }, { "epoch": 0.35220854453294714, "grad_norm": 1.2270523309707642, "learning_rate": 0.00011693946776030599, "loss": 0.9326, "step": 76 }, { "epoch": 0.356842867487328, "grad_norm": 1.2635189294815063, "learning_rate": 0.00011584792762612703, "loss": 0.9417, "step": 77 }, { "epoch": 0.3614771904417089, "grad_norm": 1.3055872917175293, "learning_rate": 0.00011474394481749035, "loss": 0.9683, "step": 78 }, { "epoch": 0.36611151339608977, "grad_norm": 1.0037323236465454, "learning_rate": 0.00011362785561825406, "loss": 0.6438, "step": 79 }, { "epoch": 0.37074583635047065, "grad_norm": 1.0581717491149902, "learning_rate": 0.0001125, "loss": 0.8729, "step": 80 }, { "epoch": 0.3753801593048516, "grad_norm": 1.194458246231079, "learning_rate": 0.00011136072151847529, "loss": 0.7414, "step": 81 }, { "epoch": 0.38001448225923246, "grad_norm": 1.3376874923706055, "learning_rate": 0.00011021036720894179, "loss": 0.7101, "step": 82 }, { "epoch": 0.38464880521361333, "grad_norm": 1.5701568126678467, "learning_rate": 0.00010904928748046599, "loss": 0.6721, "step": 83 }, { "epoch": 0.3892831281679942, "grad_norm": 1.53782057762146, "learning_rate": 0.0001078778360091808, "loss": 0.7689, "step": 84 }, { "epoch": 0.3939174511223751, "grad_norm": 1.1315703392028809, "learning_rate": 0.00010669636963055245, "loss": 0.6933, "step": 85 }, { "epoch": 0.39855177407675596, "grad_norm": 1.1853336095809937, "learning_rate": 0.00010550524823068502, "loss": 0.7163, "step": 86 }, { "epoch": 0.40318609703113684, "grad_norm": 1.1081809997558594, "learning_rate": 0.00010430483463669551, "loss": 0.8206, "step": 87 }, { "epoch": 0.40782041998551777, "grad_norm": 1.1602833271026611, "learning_rate": 0.0001030954945061934, "loss": 0.632, "step": 88 }, { "epoch": 0.41245474293989864, "grad_norm": 1.3947807550430298, "learning_rate": 0.0001018775962158975, "loss": 0.668, "step": 89 }, { "epoch": 0.4170890658942795, "grad_norm": 1.2622536420822144, "learning_rate": 0.00010065151074942516, "loss": 0.5766, "step": 90 }, { "epoch": 0.4217233888486604, "grad_norm": 1.39005708694458, "learning_rate": 9.941761158428674e-05, "loss": 0.5234, "step": 91 }, { "epoch": 0.4263577118030413, "grad_norm": 1.3181982040405273, "learning_rate": 9.817627457812105e-05, "loss": 0.575, "step": 92 }, { "epoch": 0.43099203475742215, "grad_norm": 1.385236144065857, "learning_rate": 9.692787785420525e-05, "loss": 0.7406, "step": 93 }, { "epoch": 0.435626357711803, "grad_norm": 1.2025413513183594, "learning_rate": 9.567280168627493e-05, "loss": 0.565, "step": 94 }, { "epoch": 0.4402606806661839, "grad_norm": 1.1163705587387085, "learning_rate": 9.441142838268905e-05, "loss": 0.5613, "step": 95 }, { "epoch": 0.44489500362056483, "grad_norm": 1.2544317245483398, "learning_rate": 9.314414216997507e-05, "loss": 0.7618, "step": 96 }, { "epoch": 0.4495293265749457, "grad_norm": 1.1881450414657593, "learning_rate": 9.187132907578987e-05, "loss": 0.7518, "step": 97 }, { "epoch": 0.4541636495293266, "grad_norm": 1.1118865013122559, "learning_rate": 9.059337681133192e-05, "loss": 0.5474, "step": 98 }, { "epoch": 0.45879797248370746, "grad_norm": 1.0739349126815796, "learning_rate": 8.931067465324085e-05, "loss": 0.5227, "step": 99 }, { "epoch": 0.46343229543808834, "grad_norm": 1.2055435180664062, "learning_rate": 8.802361332501978e-05, "loss": 0.6742, "step": 100 }, { "epoch": 0.46343229543808834, "eval_loss": 0.6104360818862915, "eval_runtime": 135.03, "eval_samples_per_second": 5.384, "eval_steps_per_second": 2.696, "step": 100 }, { "epoch": 0.4680666183924692, "grad_norm": 1.332160472869873, "learning_rate": 8.673258487801731e-05, "loss": 0.5866, "step": 101 }, { "epoch": 0.4727009413468501, "grad_norm": 1.0405492782592773, "learning_rate": 8.54379825720049e-05, "loss": 0.5656, "step": 102 }, { "epoch": 0.47733526430123097, "grad_norm": 1.373766303062439, "learning_rate": 8.414020075538605e-05, "loss": 0.5363, "step": 103 }, { "epoch": 0.4819695872556119, "grad_norm": 1.1279383897781372, "learning_rate": 8.2839634745074e-05, "loss": 0.474, "step": 104 }, { "epoch": 0.4866039102099928, "grad_norm": 1.3224053382873535, "learning_rate": 8.153668070607437e-05, "loss": 0.7299, "step": 105 }, { "epoch": 0.49123823316437365, "grad_norm": 1.1764270067214966, "learning_rate": 8.023173553080938e-05, "loss": 0.5553, "step": 106 }, { "epoch": 0.4958725561187545, "grad_norm": 1.3858931064605713, "learning_rate": 7.89251967182208e-05, "loss": 0.6029, "step": 107 }, { "epoch": 0.5005068790731354, "grad_norm": 1.2218059301376343, "learning_rate": 7.761746225268758e-05, "loss": 0.5179, "step": 108 }, { "epoch": 0.5051412020275163, "grad_norm": 1.186583399772644, "learning_rate": 7.630893048279627e-05, "loss": 0.592, "step": 109 }, { "epoch": 0.5097755249818972, "grad_norm": 1.1901055574417114, "learning_rate": 7.5e-05, "loss": 0.4786, "step": 110 }, { "epoch": 0.5144098479362781, "grad_norm": 1.2109345197677612, "learning_rate": 7.369106951720373e-05, "loss": 0.4874, "step": 111 }, { "epoch": 0.5190441708906589, "grad_norm": 1.2095303535461426, "learning_rate": 7.238253774731244e-05, "loss": 0.5826, "step": 112 }, { "epoch": 0.5236784938450398, "grad_norm": 1.4487143754959106, "learning_rate": 7.10748032817792e-05, "loss": 0.6947, "step": 113 }, { "epoch": 0.5283128167994207, "grad_norm": 1.4283783435821533, "learning_rate": 6.976826446919059e-05, "loss": 0.6258, "step": 114 }, { "epoch": 0.5329471397538016, "grad_norm": 1.387428879737854, "learning_rate": 6.846331929392562e-05, "loss": 0.5088, "step": 115 }, { "epoch": 0.5375814627081825, "grad_norm": 1.3152918815612793, "learning_rate": 6.7160365254926e-05, "loss": 0.4329, "step": 116 }, { "epoch": 0.5422157856625633, "grad_norm": 1.493445873260498, "learning_rate": 6.585979924461394e-05, "loss": 0.5035, "step": 117 }, { "epoch": 0.5468501086169443, "grad_norm": 1.2388755083084106, "learning_rate": 6.45620174279951e-05, "loss": 0.4064, "step": 118 }, { "epoch": 0.5514844315713251, "grad_norm": 1.163062572479248, "learning_rate": 6.326741512198266e-05, "loss": 0.3774, "step": 119 }, { "epoch": 0.556118754525706, "grad_norm": 1.305509328842163, "learning_rate": 6.197638667498022e-05, "loss": 0.4626, "step": 120 }, { "epoch": 0.5607530774800868, "grad_norm": 1.2311328649520874, "learning_rate": 6.068932534675913e-05, "loss": 0.4069, "step": 121 }, { "epoch": 0.5653874004344678, "grad_norm": 1.3704758882522583, "learning_rate": 5.9406623188668055e-05, "loss": 0.5308, "step": 122 }, { "epoch": 0.5700217233888487, "grad_norm": 1.2310731410980225, "learning_rate": 5.812867092421013e-05, "loss": 0.4324, "step": 123 }, { "epoch": 0.5746560463432295, "grad_norm": 1.2407798767089844, "learning_rate": 5.685585783002493e-05, "loss": 0.4992, "step": 124 }, { "epoch": 0.5792903692976105, "grad_norm": 1.2309188842773438, "learning_rate": 5.558857161731093e-05, "loss": 0.3967, "step": 125 }, { "epoch": 0.5839246922519913, "grad_norm": 1.2751799821853638, "learning_rate": 5.4327198313725064e-05, "loss": 0.4109, "step": 126 }, { "epoch": 0.5885590152063722, "grad_norm": 1.3883857727050781, "learning_rate": 5.307212214579474e-05, "loss": 0.3867, "step": 127 }, { "epoch": 0.593193338160753, "grad_norm": 1.410216212272644, "learning_rate": 5.182372542187895e-05, "loss": 0.3541, "step": 128 }, { "epoch": 0.597827661115134, "grad_norm": 1.3435072898864746, "learning_rate": 5.058238841571326e-05, "loss": 0.4067, "step": 129 }, { "epoch": 0.6024619840695149, "grad_norm": 1.3180471658706665, "learning_rate": 4.934848925057484e-05, "loss": 0.3048, "step": 130 }, { "epoch": 0.6070963070238957, "grad_norm": 1.1778309345245361, "learning_rate": 4.812240378410248e-05, "loss": 0.3317, "step": 131 }, { "epoch": 0.6117306299782767, "grad_norm": 1.2751339673995972, "learning_rate": 4.690450549380659e-05, "loss": 0.3, "step": 132 }, { "epoch": 0.6163649529326575, "grad_norm": 1.1635668277740479, "learning_rate": 4.569516536330447e-05, "loss": 0.2464, "step": 133 }, { "epoch": 0.6209992758870384, "grad_norm": 1.4394093751907349, "learning_rate": 4.449475176931499e-05, "loss": 0.3776, "step": 134 }, { "epoch": 0.6256335988414192, "grad_norm": 1.3860478401184082, "learning_rate": 4.3303630369447554e-05, "loss": 0.3414, "step": 135 }, { "epoch": 0.6302679217958002, "grad_norm": 0.9536806344985962, "learning_rate": 4.212216399081918e-05, "loss": 0.2116, "step": 136 }, { "epoch": 0.634902244750181, "grad_norm": 1.092250943183899, "learning_rate": 4.095071251953399e-05, "loss": 0.2411, "step": 137 }, { "epoch": 0.6395365677045619, "grad_norm": 1.3056756258010864, "learning_rate": 3.978963279105821e-05, "loss": 0.3435, "step": 138 }, { "epoch": 0.6441708906589428, "grad_norm": 1.1005077362060547, "learning_rate": 3.863927848152472e-05, "loss": 0.2612, "step": 139 }, { "epoch": 0.6488052136133237, "grad_norm": 1.237870454788208, "learning_rate": 3.750000000000001e-05, "loss": 0.3284, "step": 140 }, { "epoch": 0.6534395365677046, "grad_norm": 1.3307782411575317, "learning_rate": 3.637214438174593e-05, "loss": 0.2536, "step": 141 }, { "epoch": 0.6580738595220854, "grad_norm": 1.4346413612365723, "learning_rate": 3.525605518250964e-05, "loss": 0.2911, "step": 142 }, { "epoch": 0.6627081824764663, "grad_norm": 1.2083615064620972, "learning_rate": 3.415207237387297e-05, "loss": 0.233, "step": 143 }, { "epoch": 0.6673425054308472, "grad_norm": 1.4748581647872925, "learning_rate": 3.3060532239693994e-05, "loss": 0.319, "step": 144 }, { "epoch": 0.6719768283852281, "grad_norm": 1.2144207954406738, "learning_rate": 3.198176727367156e-05, "loss": 0.1959, "step": 145 }, { "epoch": 0.676611151339609, "grad_norm": 1.5745162963867188, "learning_rate": 3.091610607806452e-05, "loss": 0.3077, "step": 146 }, { "epoch": 0.6812454742939898, "grad_norm": 1.1295483112335205, "learning_rate": 2.986387326359637e-05, "loss": 0.2328, "step": 147 }, { "epoch": 0.6858797972483708, "grad_norm": 1.218430757522583, "learning_rate": 2.8825389350575624e-05, "loss": 0.2504, "step": 148 }, { "epoch": 0.6905141202027516, "grad_norm": 1.1782724857330322, "learning_rate": 2.78009706712622e-05, "loss": 0.2519, "step": 149 }, { "epoch": 0.6951484431571325, "grad_norm": 1.3294053077697754, "learning_rate": 2.6790929273509545e-05, "loss": 0.249, "step": 150 }, { "epoch": 0.6951484431571325, "eval_loss": 0.25855064392089844, "eval_runtime": 135.0303, "eval_samples_per_second": 5.384, "eval_steps_per_second": 2.696, "step": 150 }, { "epoch": 0.6997827661115134, "grad_norm": 0.8559562563896179, "learning_rate": 2.579557282571196e-05, "loss": 0.1331, "step": 151 }, { "epoch": 0.7044170890658943, "grad_norm": 1.5178470611572266, "learning_rate": 2.4815204523085654e-05, "loss": 0.3093, "step": 152 }, { "epoch": 0.7090514120202752, "grad_norm": 1.3319201469421387, "learning_rate": 2.385012299531262e-05, "loss": 0.3123, "step": 153 }, { "epoch": 0.713685734974656, "grad_norm": 1.3699779510498047, "learning_rate": 2.2900622215575197e-05, "loss": 0.2744, "step": 154 }, { "epoch": 0.718320057929037, "grad_norm": 1.4394081830978394, "learning_rate": 2.1966991411008938e-05, "loss": 0.2854, "step": 155 }, { "epoch": 0.7229543808834178, "grad_norm": 1.1516926288604736, "learning_rate": 2.1049514974601175e-05, "loss": 0.232, "step": 156 }, { "epoch": 0.7275887038377987, "grad_norm": 1.226991891860962, "learning_rate": 2.0148472378562215e-05, "loss": 0.2174, "step": 157 }, { "epoch": 0.7322230267921795, "grad_norm": 1.3236987590789795, "learning_rate": 1.926413808919542e-05, "loss": 0.2859, "step": 158 }, { "epoch": 0.7368573497465605, "grad_norm": 1.3909859657287598, "learning_rate": 1.8396781483292098e-05, "loss": 0.1741, "step": 159 }, { "epoch": 0.7414916727009413, "grad_norm": 1.3062829971313477, "learning_rate": 1.7546666766076655e-05, "loss": 0.2003, "step": 160 }, { "epoch": 0.7461259956553222, "grad_norm": 1.4208601713180542, "learning_rate": 1.671405289072718e-05, "loss": 0.2306, "step": 161 }, { "epoch": 0.7507603186097032, "grad_norm": 1.130900502204895, "learning_rate": 1.5899193479495857e-05, "loss": 0.1712, "step": 162 }, { "epoch": 0.755394641564084, "grad_norm": 1.3274939060211182, "learning_rate": 1.5102336746453053e-05, "loss": 0.2057, "step": 163 }, { "epoch": 0.7600289645184649, "grad_norm": 1.5153155326843262, "learning_rate": 1.4323725421878949e-05, "loss": 0.2792, "step": 164 }, { "epoch": 0.7646632874728457, "grad_norm": 1.2106071710586548, "learning_rate": 1.3563596678325606e-05, "loss": 0.213, "step": 165 }, { "epoch": 0.7692976104272267, "grad_norm": 1.3996082544326782, "learning_rate": 1.2822182058371878e-05, "loss": 0.2113, "step": 166 }, { "epoch": 0.7739319333816075, "grad_norm": 1.1402256488800049, "learning_rate": 1.2099707404093203e-05, "loss": 0.1599, "step": 167 }, { "epoch": 0.7785662563359884, "grad_norm": 1.6671884059906006, "learning_rate": 1.1396392788268052e-05, "loss": 0.301, "step": 168 }, { "epoch": 0.7832005792903693, "grad_norm": 1.1182959079742432, "learning_rate": 1.0712452447341582e-05, "loss": 0.1367, "step": 169 }, { "epoch": 0.7878349022447502, "grad_norm": 1.0370293855667114, "learning_rate": 1.0048094716167095e-05, "loss": 0.1441, "step": 170 }, { "epoch": 0.7924692251991311, "grad_norm": 1.155311942100525, "learning_rate": 9.40352196454532e-06, "loss": 0.1666, "step": 171 }, { "epoch": 0.7971035481535119, "grad_norm": 1.0911647081375122, "learning_rate": 8.778930535580474e-06, "loss": 0.185, "step": 172 }, { "epoch": 0.8017378711078929, "grad_norm": 0.9174829721450806, "learning_rate": 8.174510685872415e-06, "loss": 0.1147, "step": 173 }, { "epoch": 0.8063721940622737, "grad_norm": 1.362585425376892, "learning_rate": 7.5904465275624884e-06, "loss": 0.2452, "step": 174 }, { "epoch": 0.8110065170166546, "grad_norm": 1.197021722793579, "learning_rate": 7.026915972251254e-06, "loss": 0.2385, "step": 175 }, { "epoch": 0.8156408399710355, "grad_norm": 1.0614386796951294, "learning_rate": 6.484090676804926e-06, "loss": 0.1824, "step": 176 }, { "epoch": 0.8202751629254164, "grad_norm": 1.1456388235092163, "learning_rate": 5.962135991066971e-06, "loss": 0.1921, "step": 177 }, { "epoch": 0.8249094858797973, "grad_norm": 1.2095783948898315, "learning_rate": 5.461210907490951e-06, "loss": 0.1803, "step": 178 }, { "epoch": 0.8295438088341781, "grad_norm": 1.2167255878448486, "learning_rate": 4.981468012709877e-06, "loss": 0.1777, "step": 179 }, { "epoch": 0.834178131788559, "grad_norm": 1.2779210805892944, "learning_rate": 4.523053441056876e-06, "loss": 0.2648, "step": 180 }, { "epoch": 0.8388124547429399, "grad_norm": 1.1896816492080688, "learning_rate": 4.086106830051236e-06, "loss": 0.1531, "step": 181 }, { "epoch": 0.8434467776973208, "grad_norm": 1.0845763683319092, "learning_rate": 3.670761277863485e-06, "loss": 0.1953, "step": 182 }, { "epoch": 0.8480811006517016, "grad_norm": 1.338794231414795, "learning_rate": 3.277143302772342e-06, "loss": 0.1878, "step": 183 }, { "epoch": 0.8527154236060825, "grad_norm": 1.036252737045288, "learning_rate": 2.9053728046260825e-06, "loss": 0.1352, "step": 184 }, { "epoch": 0.8573497465604635, "grad_norm": 1.3027642965316772, "learning_rate": 2.555563028319885e-06, "loss": 0.2213, "step": 185 }, { "epoch": 0.8619840695148443, "grad_norm": 1.0671361684799194, "learning_rate": 2.227820529300264e-06, "loss": 0.2172, "step": 186 }, { "epoch": 0.8666183924692252, "grad_norm": 1.0457696914672852, "learning_rate": 1.9222451411073645e-06, "loss": 0.1704, "step": 187 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 17, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1160316364382536e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }