{ "best_metric": 2.7491917610168457, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.06791171477079797, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00033955857385398983, "grad_norm": 21.100221633911133, "learning_rate": 1.0100000000000002e-05, "loss": 5.7028, "step": 1 }, { "epoch": 0.00033955857385398983, "eval_loss": 3.8351783752441406, "eval_runtime": 134.0344, "eval_samples_per_second": 9.251, "eval_steps_per_second": 2.313, "step": 1 }, { "epoch": 0.0006791171477079797, "grad_norm": 32.934295654296875, "learning_rate": 2.0200000000000003e-05, "loss": 6.2743, "step": 2 }, { "epoch": 0.0010186757215619694, "grad_norm": 11.293846130371094, "learning_rate": 3.0299999999999998e-05, "loss": 5.5001, "step": 3 }, { "epoch": 0.0013582342954159593, "grad_norm": 12.974593162536621, "learning_rate": 4.0400000000000006e-05, "loss": 5.5615, "step": 4 }, { "epoch": 0.001697792869269949, "grad_norm": 11.986504554748535, "learning_rate": 5.05e-05, "loss": 6.0771, "step": 5 }, { "epoch": 0.0020373514431239388, "grad_norm": 11.836446762084961, "learning_rate": 6.0599999999999996e-05, "loss": 5.833, "step": 6 }, { "epoch": 0.0023769100169779285, "grad_norm": 12.108397483825684, "learning_rate": 7.07e-05, "loss": 6.3409, "step": 7 }, { "epoch": 0.0027164685908319186, "grad_norm": 13.400388717651367, "learning_rate": 8.080000000000001e-05, "loss": 6.262, "step": 8 }, { "epoch": 0.0030560271646859084, "grad_norm": 12.49435806274414, "learning_rate": 9.09e-05, "loss": 5.619, "step": 9 }, { "epoch": 0.003395585738539898, "grad_norm": 12.52596378326416, "learning_rate": 0.000101, "loss": 5.1372, "step": 10 }, { "epoch": 0.003735144312393888, "grad_norm": 12.6868314743042, "learning_rate": 0.00010046842105263158, "loss": 5.2604, "step": 11 }, { "epoch": 0.0040747028862478775, "grad_norm": 14.099748611450195, "learning_rate": 9.993684210526315e-05, "loss": 5.58, "step": 12 }, { "epoch": 0.004414261460101867, "grad_norm": 12.148128509521484, "learning_rate": 9.940526315789473e-05, "loss": 5.5032, "step": 13 }, { "epoch": 0.004753820033955857, "grad_norm": 14.858428001403809, "learning_rate": 9.887368421052632e-05, "loss": 6.4015, "step": 14 }, { "epoch": 0.0050933786078098476, "grad_norm": 14.074804306030273, "learning_rate": 9.83421052631579e-05, "loss": 5.938, "step": 15 }, { "epoch": 0.005432937181663837, "grad_norm": 13.298508644104004, "learning_rate": 9.781052631578948e-05, "loss": 5.9138, "step": 16 }, { "epoch": 0.005772495755517827, "grad_norm": 13.888445854187012, "learning_rate": 9.727894736842106e-05, "loss": 5.5768, "step": 17 }, { "epoch": 0.006112054329371817, "grad_norm": 12.229842185974121, "learning_rate": 9.674736842105263e-05, "loss": 5.4015, "step": 18 }, { "epoch": 0.0064516129032258064, "grad_norm": 17.1883487701416, "learning_rate": 9.621578947368421e-05, "loss": 6.27, "step": 19 }, { "epoch": 0.006791171477079796, "grad_norm": 13.889158248901367, "learning_rate": 9.568421052631578e-05, "loss": 5.3761, "step": 20 }, { "epoch": 0.007130730050933786, "grad_norm": 18.03790283203125, "learning_rate": 9.515263157894737e-05, "loss": 5.8293, "step": 21 }, { "epoch": 0.007470288624787776, "grad_norm": 15.712692260742188, "learning_rate": 9.462105263157895e-05, "loss": 6.1728, "step": 22 }, { "epoch": 0.007809847198641765, "grad_norm": 15.400139808654785, "learning_rate": 9.408947368421054e-05, "loss": 6.7802, "step": 23 }, { "epoch": 0.008149405772495755, "grad_norm": 14.763883590698242, "learning_rate": 9.355789473684211e-05, "loss": 5.7079, "step": 24 }, { "epoch": 0.008488964346349746, "grad_norm": 12.111189842224121, "learning_rate": 9.302631578947369e-05, "loss": 5.5747, "step": 25 }, { "epoch": 0.008828522920203734, "grad_norm": 14.711851119995117, "learning_rate": 9.249473684210526e-05, "loss": 6.0876, "step": 26 }, { "epoch": 0.009168081494057725, "grad_norm": 15.358607292175293, "learning_rate": 9.196315789473685e-05, "loss": 5.6319, "step": 27 }, { "epoch": 0.009507640067911714, "grad_norm": 13.506231307983398, "learning_rate": 9.143157894736843e-05, "loss": 5.8136, "step": 28 }, { "epoch": 0.009847198641765705, "grad_norm": 14.769192695617676, "learning_rate": 9.09e-05, "loss": 5.8328, "step": 29 }, { "epoch": 0.010186757215619695, "grad_norm": 14.589241027832031, "learning_rate": 9.036842105263158e-05, "loss": 5.5436, "step": 30 }, { "epoch": 0.010526315789473684, "grad_norm": 16.822694778442383, "learning_rate": 8.983684210526316e-05, "loss": 6.7274, "step": 31 }, { "epoch": 0.010865874363327675, "grad_norm": 17.69041633605957, "learning_rate": 8.930526315789474e-05, "loss": 6.3115, "step": 32 }, { "epoch": 0.011205432937181663, "grad_norm": 16.131786346435547, "learning_rate": 8.877368421052632e-05, "loss": 5.8964, "step": 33 }, { "epoch": 0.011544991511035654, "grad_norm": 17.694215774536133, "learning_rate": 8.82421052631579e-05, "loss": 6.6347, "step": 34 }, { "epoch": 0.011884550084889643, "grad_norm": 16.214025497436523, "learning_rate": 8.771052631578948e-05, "loss": 5.3229, "step": 35 }, { "epoch": 0.012224108658743633, "grad_norm": 17.1286678314209, "learning_rate": 8.717894736842105e-05, "loss": 6.4127, "step": 36 }, { "epoch": 0.012563667232597622, "grad_norm": 17.29891014099121, "learning_rate": 8.664736842105263e-05, "loss": 5.6328, "step": 37 }, { "epoch": 0.012903225806451613, "grad_norm": 22.239364624023438, "learning_rate": 8.61157894736842e-05, "loss": 6.787, "step": 38 }, { "epoch": 0.013242784380305603, "grad_norm": 17.34881591796875, "learning_rate": 8.55842105263158e-05, "loss": 5.9971, "step": 39 }, { "epoch": 0.013582342954159592, "grad_norm": 31.443096160888672, "learning_rate": 8.505263157894737e-05, "loss": 7.1717, "step": 40 }, { "epoch": 0.013921901528013583, "grad_norm": 20.705217361450195, "learning_rate": 8.452105263157896e-05, "loss": 6.672, "step": 41 }, { "epoch": 0.014261460101867572, "grad_norm": 22.882652282714844, "learning_rate": 8.398947368421053e-05, "loss": 6.7516, "step": 42 }, { "epoch": 0.014601018675721562, "grad_norm": 25.97607421875, "learning_rate": 8.345789473684211e-05, "loss": 7.0149, "step": 43 }, { "epoch": 0.014940577249575551, "grad_norm": 29.19485855102539, "learning_rate": 8.292631578947368e-05, "loss": 7.1149, "step": 44 }, { "epoch": 0.015280135823429542, "grad_norm": 26.053762435913086, "learning_rate": 8.239473684210526e-05, "loss": 6.5975, "step": 45 }, { "epoch": 0.01561969439728353, "grad_norm": 28.610328674316406, "learning_rate": 8.186315789473683e-05, "loss": 6.2714, "step": 46 }, { "epoch": 0.01595925297113752, "grad_norm": 35.02290344238281, "learning_rate": 8.133157894736842e-05, "loss": 7.4225, "step": 47 }, { "epoch": 0.01629881154499151, "grad_norm": 51.69056701660156, "learning_rate": 8.080000000000001e-05, "loss": 6.8981, "step": 48 }, { "epoch": 0.016638370118845502, "grad_norm": 50.08887481689453, "learning_rate": 8.026842105263159e-05, "loss": 7.0015, "step": 49 }, { "epoch": 0.01697792869269949, "grad_norm": 40.95683288574219, "learning_rate": 7.973684210526316e-05, "loss": 6.7858, "step": 50 }, { "epoch": 0.01697792869269949, "eval_loss": 3.2727584838867188, "eval_runtime": 136.3007, "eval_samples_per_second": 9.098, "eval_steps_per_second": 2.274, "step": 50 }, { "epoch": 0.01731748726655348, "grad_norm": 8.816231727600098, "learning_rate": 7.920526315789474e-05, "loss": 6.4331, "step": 51 }, { "epoch": 0.01765704584040747, "grad_norm": 7.692835807800293, "learning_rate": 7.867368421052631e-05, "loss": 6.1655, "step": 52 }, { "epoch": 0.01799660441426146, "grad_norm": 6.719763278961182, "learning_rate": 7.814210526315789e-05, "loss": 5.9821, "step": 53 }, { "epoch": 0.01833616298811545, "grad_norm": 6.741839408874512, "learning_rate": 7.761052631578946e-05, "loss": 5.5779, "step": 54 }, { "epoch": 0.01867572156196944, "grad_norm": 7.938393592834473, "learning_rate": 7.707894736842105e-05, "loss": 6.1207, "step": 55 }, { "epoch": 0.019015280135823428, "grad_norm": 7.241247177124023, "learning_rate": 7.654736842105264e-05, "loss": 5.5408, "step": 56 }, { "epoch": 0.01935483870967742, "grad_norm": 7.55157470703125, "learning_rate": 7.601578947368422e-05, "loss": 5.5802, "step": 57 }, { "epoch": 0.01969439728353141, "grad_norm": 7.256725311279297, "learning_rate": 7.548421052631579e-05, "loss": 5.7427, "step": 58 }, { "epoch": 0.020033955857385398, "grad_norm": 8.38663101196289, "learning_rate": 7.495263157894737e-05, "loss": 5.8768, "step": 59 }, { "epoch": 0.02037351443123939, "grad_norm": 7.507662296295166, "learning_rate": 7.442105263157894e-05, "loss": 5.8298, "step": 60 }, { "epoch": 0.02071307300509338, "grad_norm": 8.123747825622559, "learning_rate": 7.388947368421053e-05, "loss": 5.6901, "step": 61 }, { "epoch": 0.021052631578947368, "grad_norm": 7.707481384277344, "learning_rate": 7.335789473684211e-05, "loss": 5.4775, "step": 62 }, { "epoch": 0.021392190152801357, "grad_norm": 8.684199333190918, "learning_rate": 7.282631578947368e-05, "loss": 4.7097, "step": 63 }, { "epoch": 0.02173174872665535, "grad_norm": 9.479657173156738, "learning_rate": 7.229473684210527e-05, "loss": 6.1553, "step": 64 }, { "epoch": 0.022071307300509338, "grad_norm": 7.97694206237793, "learning_rate": 7.176315789473685e-05, "loss": 5.1793, "step": 65 }, { "epoch": 0.022410865874363327, "grad_norm": 8.905004501342773, "learning_rate": 7.123157894736842e-05, "loss": 5.1063, "step": 66 }, { "epoch": 0.02275042444821732, "grad_norm": 9.290450096130371, "learning_rate": 7.07e-05, "loss": 5.6181, "step": 67 }, { "epoch": 0.023089983022071308, "grad_norm": 9.563346862792969, "learning_rate": 7.016842105263159e-05, "loss": 5.3551, "step": 68 }, { "epoch": 0.023429541595925297, "grad_norm": 9.26034927368164, "learning_rate": 6.963684210526316e-05, "loss": 5.5433, "step": 69 }, { "epoch": 0.023769100169779286, "grad_norm": 9.813597679138184, "learning_rate": 6.910526315789474e-05, "loss": 5.3692, "step": 70 }, { "epoch": 0.024108658743633278, "grad_norm": 9.946206092834473, "learning_rate": 6.857368421052631e-05, "loss": 5.4733, "step": 71 }, { "epoch": 0.024448217317487267, "grad_norm": 11.333394050598145, "learning_rate": 6.80421052631579e-05, "loss": 6.3697, "step": 72 }, { "epoch": 0.024787775891341256, "grad_norm": 10.807990074157715, "learning_rate": 6.751052631578948e-05, "loss": 5.7684, "step": 73 }, { "epoch": 0.025127334465195245, "grad_norm": 10.34673023223877, "learning_rate": 6.697894736842105e-05, "loss": 5.5579, "step": 74 }, { "epoch": 0.025466893039049237, "grad_norm": 10.379812240600586, "learning_rate": 6.644736842105264e-05, "loss": 6.0875, "step": 75 }, { "epoch": 0.025806451612903226, "grad_norm": 10.365467071533203, "learning_rate": 6.591578947368422e-05, "loss": 5.4606, "step": 76 }, { "epoch": 0.026146010186757215, "grad_norm": 12.18764877319336, "learning_rate": 6.538421052631579e-05, "loss": 5.7223, "step": 77 }, { "epoch": 0.026485568760611207, "grad_norm": 10.989704132080078, "learning_rate": 6.485263157894737e-05, "loss": 5.3804, "step": 78 }, { "epoch": 0.026825127334465196, "grad_norm": 11.694632530212402, "learning_rate": 6.432105263157894e-05, "loss": 5.822, "step": 79 }, { "epoch": 0.027164685908319185, "grad_norm": 12.42897891998291, "learning_rate": 6.378947368421053e-05, "loss": 5.6253, "step": 80 }, { "epoch": 0.027504244482173174, "grad_norm": 12.49673080444336, "learning_rate": 6.32578947368421e-05, "loss": 5.885, "step": 81 }, { "epoch": 0.027843803056027166, "grad_norm": 12.554586410522461, "learning_rate": 6.27263157894737e-05, "loss": 5.4057, "step": 82 }, { "epoch": 0.028183361629881155, "grad_norm": 12.602128028869629, "learning_rate": 6.219473684210527e-05, "loss": 5.8701, "step": 83 }, { "epoch": 0.028522920203735144, "grad_norm": 14.500311851501465, "learning_rate": 6.166315789473685e-05, "loss": 5.7579, "step": 84 }, { "epoch": 0.028862478777589132, "grad_norm": 12.415670394897461, "learning_rate": 6.113157894736842e-05, "loss": 5.589, "step": 85 }, { "epoch": 0.029202037351443125, "grad_norm": 12.579917907714844, "learning_rate": 6.0599999999999996e-05, "loss": 5.7132, "step": 86 }, { "epoch": 0.029541595925297114, "grad_norm": 14.4943208694458, "learning_rate": 6.006842105263158e-05, "loss": 6.1524, "step": 87 }, { "epoch": 0.029881154499151102, "grad_norm": 13.979001998901367, "learning_rate": 5.953684210526315e-05, "loss": 5.4524, "step": 88 }, { "epoch": 0.030220713073005095, "grad_norm": 12.837852478027344, "learning_rate": 5.900526315789474e-05, "loss": 5.0063, "step": 89 }, { "epoch": 0.030560271646859084, "grad_norm": 15.69062614440918, "learning_rate": 5.847368421052632e-05, "loss": 5.7173, "step": 90 }, { "epoch": 0.030899830220713072, "grad_norm": 18.907155990600586, "learning_rate": 5.79421052631579e-05, "loss": 5.5478, "step": 91 }, { "epoch": 0.03123938879456706, "grad_norm": 13.907947540283203, "learning_rate": 5.7410526315789475e-05, "loss": 5.1368, "step": 92 }, { "epoch": 0.031578947368421054, "grad_norm": 21.56955337524414, "learning_rate": 5.687894736842105e-05, "loss": 5.4158, "step": 93 }, { "epoch": 0.03191850594227504, "grad_norm": 19.460166931152344, "learning_rate": 5.6347368421052625e-05, "loss": 6.2592, "step": 94 }, { "epoch": 0.03225806451612903, "grad_norm": 22.000574111938477, "learning_rate": 5.5815789473684214e-05, "loss": 5.9925, "step": 95 }, { "epoch": 0.03259762308998302, "grad_norm": 21.29176139831543, "learning_rate": 5.5284210526315796e-05, "loss": 6.2012, "step": 96 }, { "epoch": 0.03293718166383701, "grad_norm": 24.593799591064453, "learning_rate": 5.475263157894737e-05, "loss": 6.3382, "step": 97 }, { "epoch": 0.033276740237691005, "grad_norm": 25.268535614013672, "learning_rate": 5.422105263157895e-05, "loss": 6.17, "step": 98 }, { "epoch": 0.033616298811544994, "grad_norm": 37.28253173828125, "learning_rate": 5.368947368421053e-05, "loss": 7.309, "step": 99 }, { "epoch": 0.03395585738539898, "grad_norm": 61.91118240356445, "learning_rate": 5.3157894736842104e-05, "loss": 9.4064, "step": 100 }, { "epoch": 0.03395585738539898, "eval_loss": 3.108811855316162, "eval_runtime": 133.8842, "eval_samples_per_second": 9.262, "eval_steps_per_second": 2.315, "step": 100 }, { "epoch": 0.03429541595925297, "grad_norm": 8.512744903564453, "learning_rate": 5.262631578947368e-05, "loss": 6.0228, "step": 101 }, { "epoch": 0.03463497453310696, "grad_norm": 7.949807643890381, "learning_rate": 5.209473684210527e-05, "loss": 5.8142, "step": 102 }, { "epoch": 0.03497453310696095, "grad_norm": 6.752256870269775, "learning_rate": 5.1563157894736844e-05, "loss": 5.552, "step": 103 }, { "epoch": 0.03531409168081494, "grad_norm": 6.490177631378174, "learning_rate": 5.1031578947368426e-05, "loss": 5.8519, "step": 104 }, { "epoch": 0.035653650254668934, "grad_norm": 5.849376678466797, "learning_rate": 5.05e-05, "loss": 5.6065, "step": 105 }, { "epoch": 0.03599320882852292, "grad_norm": 6.028791427612305, "learning_rate": 4.9968421052631576e-05, "loss": 5.6156, "step": 106 }, { "epoch": 0.03633276740237691, "grad_norm": 5.619626522064209, "learning_rate": 4.943684210526316e-05, "loss": 5.2623, "step": 107 }, { "epoch": 0.0366723259762309, "grad_norm": 5.889388084411621, "learning_rate": 4.890526315789474e-05, "loss": 5.3881, "step": 108 }, { "epoch": 0.03701188455008489, "grad_norm": 6.1949615478515625, "learning_rate": 4.8373684210526316e-05, "loss": 5.1539, "step": 109 }, { "epoch": 0.03735144312393888, "grad_norm": 6.2401442527771, "learning_rate": 4.784210526315789e-05, "loss": 5.1559, "step": 110 }, { "epoch": 0.03769100169779287, "grad_norm": 6.51352071762085, "learning_rate": 4.731052631578947e-05, "loss": 4.9546, "step": 111 }, { "epoch": 0.038030560271646856, "grad_norm": 7.465339660644531, "learning_rate": 4.6778947368421055e-05, "loss": 5.389, "step": 112 }, { "epoch": 0.03837011884550085, "grad_norm": 8.441889762878418, "learning_rate": 4.624736842105263e-05, "loss": 5.6321, "step": 113 }, { "epoch": 0.03870967741935484, "grad_norm": 8.055974006652832, "learning_rate": 4.571578947368421e-05, "loss": 5.5059, "step": 114 }, { "epoch": 0.03904923599320883, "grad_norm": 7.581737041473389, "learning_rate": 4.518421052631579e-05, "loss": 5.1159, "step": 115 }, { "epoch": 0.03938879456706282, "grad_norm": 8.991089820861816, "learning_rate": 4.465263157894737e-05, "loss": 5.9822, "step": 116 }, { "epoch": 0.03972835314091681, "grad_norm": 8.726984024047852, "learning_rate": 4.412105263157895e-05, "loss": 5.4402, "step": 117 }, { "epoch": 0.040067911714770796, "grad_norm": 8.529667854309082, "learning_rate": 4.358947368421053e-05, "loss": 5.2337, "step": 118 }, { "epoch": 0.040407470288624785, "grad_norm": 8.97454833984375, "learning_rate": 4.30578947368421e-05, "loss": 5.6533, "step": 119 }, { "epoch": 0.04074702886247878, "grad_norm": 8.54892349243164, "learning_rate": 4.2526315789473685e-05, "loss": 5.4748, "step": 120 }, { "epoch": 0.04108658743633277, "grad_norm": 8.859085083007812, "learning_rate": 4.199473684210527e-05, "loss": 5.733, "step": 121 }, { "epoch": 0.04142614601018676, "grad_norm": 8.932308197021484, "learning_rate": 4.146315789473684e-05, "loss": 5.3355, "step": 122 }, { "epoch": 0.04176570458404075, "grad_norm": 9.009238243103027, "learning_rate": 4.093157894736842e-05, "loss": 5.2937, "step": 123 }, { "epoch": 0.042105263157894736, "grad_norm": 8.809886932373047, "learning_rate": 4.0400000000000006e-05, "loss": 5.6176, "step": 124 }, { "epoch": 0.042444821731748725, "grad_norm": 10.109439849853516, "learning_rate": 3.986842105263158e-05, "loss": 5.5681, "step": 125 }, { "epoch": 0.042784380305602714, "grad_norm": 9.18508243560791, "learning_rate": 3.933684210526316e-05, "loss": 5.263, "step": 126 }, { "epoch": 0.04312393887945671, "grad_norm": 10.614432334899902, "learning_rate": 3.880526315789473e-05, "loss": 5.6346, "step": 127 }, { "epoch": 0.0434634974533107, "grad_norm": 11.10940933227539, "learning_rate": 3.827368421052632e-05, "loss": 5.6721, "step": 128 }, { "epoch": 0.04380305602716469, "grad_norm": 9.783493041992188, "learning_rate": 3.7742105263157896e-05, "loss": 5.2759, "step": 129 }, { "epoch": 0.044142614601018676, "grad_norm": 10.094010353088379, "learning_rate": 3.721052631578947e-05, "loss": 5.0748, "step": 130 }, { "epoch": 0.044482173174872665, "grad_norm": 11.673230171203613, "learning_rate": 3.6678947368421054e-05, "loss": 6.0693, "step": 131 }, { "epoch": 0.044821731748726654, "grad_norm": 13.237796783447266, "learning_rate": 3.6147368421052636e-05, "loss": 5.8695, "step": 132 }, { "epoch": 0.04516129032258064, "grad_norm": 11.816963195800781, "learning_rate": 3.561578947368421e-05, "loss": 4.9874, "step": 133 }, { "epoch": 0.04550084889643464, "grad_norm": 11.55286979675293, "learning_rate": 3.508421052631579e-05, "loss": 5.6631, "step": 134 }, { "epoch": 0.04584040747028863, "grad_norm": 14.232548713684082, "learning_rate": 3.455263157894737e-05, "loss": 5.5924, "step": 135 }, { "epoch": 0.046179966044142616, "grad_norm": 14.204998970031738, "learning_rate": 3.402105263157895e-05, "loss": 6.4456, "step": 136 }, { "epoch": 0.046519524617996605, "grad_norm": 16.168073654174805, "learning_rate": 3.3489473684210526e-05, "loss": 6.0166, "step": 137 }, { "epoch": 0.046859083191850594, "grad_norm": 13.623854637145996, "learning_rate": 3.295789473684211e-05, "loss": 4.8727, "step": 138 }, { "epoch": 0.04719864176570458, "grad_norm": 14.182967185974121, "learning_rate": 3.242631578947368e-05, "loss": 5.2488, "step": 139 }, { "epoch": 0.04753820033955857, "grad_norm": 15.692301750183105, "learning_rate": 3.1894736842105265e-05, "loss": 5.7354, "step": 140 }, { "epoch": 0.04787775891341256, "grad_norm": 13.835912704467773, "learning_rate": 3.136315789473685e-05, "loss": 4.7608, "step": 141 }, { "epoch": 0.048217317487266556, "grad_norm": 23.79447364807129, "learning_rate": 3.083157894736842e-05, "loss": 5.9428, "step": 142 }, { "epoch": 0.048556876061120545, "grad_norm": 20.368270874023438, "learning_rate": 3.0299999999999998e-05, "loss": 5.8896, "step": 143 }, { "epoch": 0.048896434634974534, "grad_norm": 26.974061965942383, "learning_rate": 2.9768421052631577e-05, "loss": 6.8198, "step": 144 }, { "epoch": 0.04923599320882852, "grad_norm": 21.44305419921875, "learning_rate": 2.923684210526316e-05, "loss": 6.8624, "step": 145 }, { "epoch": 0.04957555178268251, "grad_norm": 22.52785301208496, "learning_rate": 2.8705263157894737e-05, "loss": 5.6798, "step": 146 }, { "epoch": 0.0499151103565365, "grad_norm": 27.52121353149414, "learning_rate": 2.8173684210526313e-05, "loss": 6.4979, "step": 147 }, { "epoch": 0.05025466893039049, "grad_norm": 34.942691802978516, "learning_rate": 2.7642105263157898e-05, "loss": 7.2382, "step": 148 }, { "epoch": 0.050594227504244485, "grad_norm": 44.1684684753418, "learning_rate": 2.7110526315789473e-05, "loss": 7.3026, "step": 149 }, { "epoch": 0.050933786078098474, "grad_norm": 29.29156494140625, "learning_rate": 2.6578947368421052e-05, "loss": 5.9047, "step": 150 }, { "epoch": 0.050933786078098474, "eval_loss": 2.8410627841949463, "eval_runtime": 134.1501, "eval_samples_per_second": 9.243, "eval_steps_per_second": 2.311, "step": 150 }, { "epoch": 0.05127334465195246, "grad_norm": 4.4673285484313965, "learning_rate": 2.6047368421052634e-05, "loss": 5.1138, "step": 151 }, { "epoch": 0.05161290322580645, "grad_norm": 5.355312347412109, "learning_rate": 2.5515789473684213e-05, "loss": 5.4617, "step": 152 }, { "epoch": 0.05195246179966044, "grad_norm": 5.351836681365967, "learning_rate": 2.4984210526315788e-05, "loss": 5.3663, "step": 153 }, { "epoch": 0.05229202037351443, "grad_norm": 6.066408634185791, "learning_rate": 2.445263157894737e-05, "loss": 5.3583, "step": 154 }, { "epoch": 0.05263157894736842, "grad_norm": 5.661759376525879, "learning_rate": 2.3921052631578946e-05, "loss": 5.3968, "step": 155 }, { "epoch": 0.052971137521222414, "grad_norm": 5.76517391204834, "learning_rate": 2.3389473684210528e-05, "loss": 5.0868, "step": 156 }, { "epoch": 0.0533106960950764, "grad_norm": 6.425754070281982, "learning_rate": 2.2857894736842106e-05, "loss": 5.7214, "step": 157 }, { "epoch": 0.05365025466893039, "grad_norm": 6.3002753257751465, "learning_rate": 2.2326315789473685e-05, "loss": 4.8879, "step": 158 }, { "epoch": 0.05398981324278438, "grad_norm": 5.956075191497803, "learning_rate": 2.1794736842105264e-05, "loss": 5.3174, "step": 159 }, { "epoch": 0.05432937181663837, "grad_norm": 6.668689727783203, "learning_rate": 2.1263157894736842e-05, "loss": 5.1679, "step": 160 }, { "epoch": 0.05466893039049236, "grad_norm": 6.492646217346191, "learning_rate": 2.073157894736842e-05, "loss": 5.1203, "step": 161 }, { "epoch": 0.05500848896434635, "grad_norm": 6.742474555969238, "learning_rate": 2.0200000000000003e-05, "loss": 5.3391, "step": 162 }, { "epoch": 0.05534804753820034, "grad_norm": 7.066228866577148, "learning_rate": 1.966842105263158e-05, "loss": 5.2756, "step": 163 }, { "epoch": 0.05568760611205433, "grad_norm": 7.766740798950195, "learning_rate": 1.913684210526316e-05, "loss": 5.0874, "step": 164 }, { "epoch": 0.05602716468590832, "grad_norm": 7.978595733642578, "learning_rate": 1.8605263157894736e-05, "loss": 5.2179, "step": 165 }, { "epoch": 0.05636672325976231, "grad_norm": 7.731940269470215, "learning_rate": 1.8073684210526318e-05, "loss": 5.0202, "step": 166 }, { "epoch": 0.0567062818336163, "grad_norm": 7.676203727722168, "learning_rate": 1.7542105263157897e-05, "loss": 5.3613, "step": 167 }, { "epoch": 0.05704584040747029, "grad_norm": 8.548066139221191, "learning_rate": 1.7010526315789475e-05, "loss": 5.5421, "step": 168 }, { "epoch": 0.057385398981324276, "grad_norm": 7.955386638641357, "learning_rate": 1.6478947368421054e-05, "loss": 5.2485, "step": 169 }, { "epoch": 0.057724957555178265, "grad_norm": 8.47363567352295, "learning_rate": 1.5947368421052633e-05, "loss": 5.5402, "step": 170 }, { "epoch": 0.05806451612903226, "grad_norm": 8.166704177856445, "learning_rate": 1.541578947368421e-05, "loss": 4.6554, "step": 171 }, { "epoch": 0.05840407470288625, "grad_norm": 9.387300491333008, "learning_rate": 1.4884210526315788e-05, "loss": 5.4705, "step": 172 }, { "epoch": 0.05874363327674024, "grad_norm": 11.64377212524414, "learning_rate": 1.4352631578947369e-05, "loss": 5.0644, "step": 173 }, { "epoch": 0.05908319185059423, "grad_norm": 9.732513427734375, "learning_rate": 1.3821052631578949e-05, "loss": 5.5656, "step": 174 }, { "epoch": 0.059422750424448216, "grad_norm": 9.858539581298828, "learning_rate": 1.3289473684210526e-05, "loss": 5.3905, "step": 175 }, { "epoch": 0.059762308998302205, "grad_norm": 11.403061866760254, "learning_rate": 1.2757894736842106e-05, "loss": 5.5977, "step": 176 }, { "epoch": 0.060101867572156194, "grad_norm": 9.382144927978516, "learning_rate": 1.2226315789473685e-05, "loss": 5.2198, "step": 177 }, { "epoch": 0.06044142614601019, "grad_norm": 12.708952903747559, "learning_rate": 1.1694736842105264e-05, "loss": 5.279, "step": 178 }, { "epoch": 0.06078098471986418, "grad_norm": 11.602399826049805, "learning_rate": 1.1163157894736842e-05, "loss": 5.6348, "step": 179 }, { "epoch": 0.06112054329371817, "grad_norm": 11.256779670715332, "learning_rate": 1.0631578947368421e-05, "loss": 5.3102, "step": 180 }, { "epoch": 0.061460101867572156, "grad_norm": 13.216877937316895, "learning_rate": 1.0100000000000002e-05, "loss": 5.7058, "step": 181 }, { "epoch": 0.061799660441426145, "grad_norm": 11.540813446044922, "learning_rate": 9.56842105263158e-06, "loss": 5.7871, "step": 182 }, { "epoch": 0.062139219015280134, "grad_norm": 11.003501892089844, "learning_rate": 9.036842105263159e-06, "loss": 4.9942, "step": 183 }, { "epoch": 0.06247877758913412, "grad_norm": 12.439997673034668, "learning_rate": 8.505263157894738e-06, "loss": 5.6214, "step": 184 }, { "epoch": 0.06281833616298811, "grad_norm": 13.413476943969727, "learning_rate": 7.973684210526316e-06, "loss": 5.2558, "step": 185 }, { "epoch": 0.06315789473684211, "grad_norm": 12.21358585357666, "learning_rate": 7.442105263157894e-06, "loss": 5.3534, "step": 186 }, { "epoch": 0.06349745331069609, "grad_norm": 17.38026237487793, "learning_rate": 6.9105263157894745e-06, "loss": 5.9743, "step": 187 }, { "epoch": 0.06383701188455009, "grad_norm": 14.174423217773438, "learning_rate": 6.378947368421053e-06, "loss": 6.3298, "step": 188 }, { "epoch": 0.06417657045840408, "grad_norm": 15.029065132141113, "learning_rate": 5.847368421052632e-06, "loss": 6.079, "step": 189 }, { "epoch": 0.06451612903225806, "grad_norm": 18.191574096679688, "learning_rate": 5.315789473684211e-06, "loss": 6.2973, "step": 190 }, { "epoch": 0.06485568760611206, "grad_norm": 20.552350997924805, "learning_rate": 4.78421052631579e-06, "loss": 5.2437, "step": 191 }, { "epoch": 0.06519524617996604, "grad_norm": 15.131223678588867, "learning_rate": 4.252631578947369e-06, "loss": 4.9799, "step": 192 }, { "epoch": 0.06553480475382004, "grad_norm": 19.161144256591797, "learning_rate": 3.721052631578947e-06, "loss": 5.4162, "step": 193 }, { "epoch": 0.06587436332767402, "grad_norm": 17.99496841430664, "learning_rate": 3.1894736842105266e-06, "loss": 6.2559, "step": 194 }, { "epoch": 0.06621392190152801, "grad_norm": 21.827606201171875, "learning_rate": 2.6578947368421053e-06, "loss": 6.9087, "step": 195 }, { "epoch": 0.06655348047538201, "grad_norm": 22.512189865112305, "learning_rate": 2.1263157894736844e-06, "loss": 5.9861, "step": 196 }, { "epoch": 0.06689303904923599, "grad_norm": 20.787433624267578, "learning_rate": 1.5947368421052633e-06, "loss": 6.6073, "step": 197 }, { "epoch": 0.06723259762308999, "grad_norm": 34.3043098449707, "learning_rate": 1.0631578947368422e-06, "loss": 6.9233, "step": 198 }, { "epoch": 0.06757215619694397, "grad_norm": 39.098968505859375, "learning_rate": 5.315789473684211e-07, "loss": 7.7742, "step": 199 }, { "epoch": 0.06791171477079797, "grad_norm": 41.64736557006836, "learning_rate": 0.0, "loss": 7.8041, "step": 200 }, { "epoch": 0.06791171477079797, "eval_loss": 2.7491917610168457, "eval_runtime": 134.4966, "eval_samples_per_second": 9.22, "eval_steps_per_second": 2.305, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.229663245605274e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }