{ "best_metric": 1.2692981958389282, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.018789928598271326, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00018789928598271326, "grad_norm": 8.089397430419922, "learning_rate": 3.3333333333333333e-06, "loss": 4.6183, "step": 1 }, { "epoch": 0.00018789928598271326, "eval_loss": 2.3376758098602295, "eval_runtime": 822.7797, "eval_samples_per_second": 10.895, "eval_steps_per_second": 5.447, "step": 1 }, { "epoch": 0.0003757985719654265, "grad_norm": 9.32201862335205, "learning_rate": 6.666666666666667e-06, "loss": 4.6686, "step": 2 }, { "epoch": 0.0005636978579481398, "grad_norm": 8.962303161621094, "learning_rate": 1e-05, "loss": 4.6015, "step": 3 }, { "epoch": 0.000751597143930853, "grad_norm": 6.618922710418701, "learning_rate": 1.3333333333333333e-05, "loss": 4.42, "step": 4 }, { "epoch": 0.0009394964299135663, "grad_norm": 5.780028343200684, "learning_rate": 1.6666666666666667e-05, "loss": 3.8984, "step": 5 }, { "epoch": 0.0011273957158962795, "grad_norm": 5.112532138824463, "learning_rate": 2e-05, "loss": 3.9961, "step": 6 }, { "epoch": 0.0013152950018789928, "grad_norm": 4.7758402824401855, "learning_rate": 2.3333333333333336e-05, "loss": 4.0382, "step": 7 }, { "epoch": 0.001503194287861706, "grad_norm": 4.002161026000977, "learning_rate": 2.6666666666666667e-05, "loss": 6.447, "step": 8 }, { "epoch": 0.0016910935738444193, "grad_norm": 3.693472146987915, "learning_rate": 3e-05, "loss": 6.4715, "step": 9 }, { "epoch": 0.0018789928598271326, "grad_norm": 4.906447410583496, "learning_rate": 3.3333333333333335e-05, "loss": 4.8352, "step": 10 }, { "epoch": 0.002066892145809846, "grad_norm": 4.642434597015381, "learning_rate": 3.6666666666666666e-05, "loss": 5.1161, "step": 11 }, { "epoch": 0.002254791431792559, "grad_norm": 4.053666591644287, "learning_rate": 4e-05, "loss": 5.2271, "step": 12 }, { "epoch": 0.0024426907177752723, "grad_norm": 4.311800479888916, "learning_rate": 4.3333333333333334e-05, "loss": 5.7124, "step": 13 }, { "epoch": 0.0026305900037579856, "grad_norm": 3.9106714725494385, "learning_rate": 4.666666666666667e-05, "loss": 4.7916, "step": 14 }, { "epoch": 0.002818489289740699, "grad_norm": 3.9047529697418213, "learning_rate": 5e-05, "loss": 4.9605, "step": 15 }, { "epoch": 0.003006388575723412, "grad_norm": 6.001646041870117, "learning_rate": 5.333333333333333e-05, "loss": 5.2707, "step": 16 }, { "epoch": 0.0031942878617061254, "grad_norm": 4.10309362411499, "learning_rate": 5.666666666666667e-05, "loss": 5.8945, "step": 17 }, { "epoch": 0.0033821871476888386, "grad_norm": 3.8678953647613525, "learning_rate": 6e-05, "loss": 5.1987, "step": 18 }, { "epoch": 0.003570086433671552, "grad_norm": 3.8875012397766113, "learning_rate": 6.333333333333333e-05, "loss": 4.8459, "step": 19 }, { "epoch": 0.003757985719654265, "grad_norm": 3.875514507293701, "learning_rate": 6.666666666666667e-05, "loss": 4.5654, "step": 20 }, { "epoch": 0.003945885005636978, "grad_norm": 4.3774943351745605, "learning_rate": 7e-05, "loss": 5.0304, "step": 21 }, { "epoch": 0.004133784291619692, "grad_norm": 9.754307746887207, "learning_rate": 7.333333333333333e-05, "loss": 5.6285, "step": 22 }, { "epoch": 0.004321683577602405, "grad_norm": 5.715425968170166, "learning_rate": 7.666666666666667e-05, "loss": 6.7236, "step": 23 }, { "epoch": 0.004509582863585118, "grad_norm": 6.149569034576416, "learning_rate": 8e-05, "loss": 6.1906, "step": 24 }, { "epoch": 0.004697482149567831, "grad_norm": 5.921987056732178, "learning_rate": 8.333333333333334e-05, "loss": 6.3798, "step": 25 }, { "epoch": 0.004885381435550545, "grad_norm": 6.240573883056641, "learning_rate": 8.666666666666667e-05, "loss": 6.1837, "step": 26 }, { "epoch": 0.005073280721533258, "grad_norm": 5.769016265869141, "learning_rate": 9e-05, "loss": 6.4518, "step": 27 }, { "epoch": 0.005261180007515971, "grad_norm": 5.863321781158447, "learning_rate": 9.333333333333334e-05, "loss": 6.2235, "step": 28 }, { "epoch": 0.0054490792934986845, "grad_norm": 6.841501712799072, "learning_rate": 9.666666666666667e-05, "loss": 5.9651, "step": 29 }, { "epoch": 0.005636978579481398, "grad_norm": 5.785431385040283, "learning_rate": 0.0001, "loss": 5.9554, "step": 30 }, { "epoch": 0.005824877865464111, "grad_norm": 5.980314254760742, "learning_rate": 9.994965332706573e-05, "loss": 5.6978, "step": 31 }, { "epoch": 0.006012777151446824, "grad_norm": 6.056541442871094, "learning_rate": 9.979871469976196e-05, "loss": 5.2931, "step": 32 }, { "epoch": 0.0062006764374295375, "grad_norm": 5.904792308807373, "learning_rate": 9.954748808839674e-05, "loss": 5.5411, "step": 33 }, { "epoch": 0.006388575723412251, "grad_norm": 5.62144136428833, "learning_rate": 9.919647942993148e-05, "loss": 4.1701, "step": 34 }, { "epoch": 0.006576475009394964, "grad_norm": 6.057515621185303, "learning_rate": 9.874639560909117e-05, "loss": 4.739, "step": 35 }, { "epoch": 0.006764374295377677, "grad_norm": 7.6517558097839355, "learning_rate": 9.819814303479267e-05, "loss": 5.8531, "step": 36 }, { "epoch": 0.0069522735813603905, "grad_norm": 5.870713710784912, "learning_rate": 9.755282581475769e-05, "loss": 4.2783, "step": 37 }, { "epoch": 0.007140172867343104, "grad_norm": 6.46204137802124, "learning_rate": 9.681174353198687e-05, "loss": 4.375, "step": 38 }, { "epoch": 0.007328072153325817, "grad_norm": 7.084475994110107, "learning_rate": 9.597638862757255e-05, "loss": 4.1746, "step": 39 }, { "epoch": 0.00751597143930853, "grad_norm": 6.561005115509033, "learning_rate": 9.504844339512095e-05, "loss": 5.4644, "step": 40 }, { "epoch": 0.0077038707252912435, "grad_norm": 7.431431770324707, "learning_rate": 9.40297765928369e-05, "loss": 4.7945, "step": 41 }, { "epoch": 0.007891770011273957, "grad_norm": 8.801129341125488, "learning_rate": 9.292243968009331e-05, "loss": 6.055, "step": 42 }, { "epoch": 0.00807966929725667, "grad_norm": 9.020700454711914, "learning_rate": 9.172866268606513e-05, "loss": 7.4741, "step": 43 }, { "epoch": 0.008267568583239383, "grad_norm": 8.685187339782715, "learning_rate": 9.045084971874738e-05, "loss": 6.0838, "step": 44 }, { "epoch": 0.008455467869222097, "grad_norm": 8.159883499145508, "learning_rate": 8.90915741234015e-05, "loss": 5.7845, "step": 45 }, { "epoch": 0.00864336715520481, "grad_norm": 8.369349479675293, "learning_rate": 8.765357330018056e-05, "loss": 5.8667, "step": 46 }, { "epoch": 0.008831266441187523, "grad_norm": 8.60290813446045, "learning_rate": 8.613974319136958e-05, "loss": 5.4422, "step": 47 }, { "epoch": 0.009019165727170236, "grad_norm": 9.08957290649414, "learning_rate": 8.455313244934324e-05, "loss": 5.5388, "step": 48 }, { "epoch": 0.00920706501315295, "grad_norm": 8.95803165435791, "learning_rate": 8.289693629698564e-05, "loss": 5.2326, "step": 49 }, { "epoch": 0.009394964299135663, "grad_norm": 7.402853965759277, "learning_rate": 8.117449009293668e-05, "loss": 4.5367, "step": 50 }, { "epoch": 0.009394964299135663, "eval_loss": 1.5429913997650146, "eval_runtime": 828.4457, "eval_samples_per_second": 10.82, "eval_steps_per_second": 5.41, "step": 50 }, { "epoch": 0.009582863585118376, "grad_norm": 12.12710189819336, "learning_rate": 7.938926261462366e-05, "loss": 4.1207, "step": 51 }, { "epoch": 0.00977076287110109, "grad_norm": 6.877650260925293, "learning_rate": 7.754484907260513e-05, "loss": 3.854, "step": 52 }, { "epoch": 0.009958662157083803, "grad_norm": 3.922449827194214, "learning_rate": 7.564496387029532e-05, "loss": 3.875, "step": 53 }, { "epoch": 0.010146561443066516, "grad_norm": 2.6158945560455322, "learning_rate": 7.369343312364993e-05, "loss": 3.8383, "step": 54 }, { "epoch": 0.01033446072904923, "grad_norm": 2.6309781074523926, "learning_rate": 7.169418695587791e-05, "loss": 3.3104, "step": 55 }, { "epoch": 0.010522360015031942, "grad_norm": 2.4806551933288574, "learning_rate": 6.965125158269619e-05, "loss": 3.6959, "step": 56 }, { "epoch": 0.010710259301014656, "grad_norm": 2.6647136211395264, "learning_rate": 6.756874120406714e-05, "loss": 4.4607, "step": 57 }, { "epoch": 0.010898158586997369, "grad_norm": 2.6696109771728516, "learning_rate": 6.545084971874738e-05, "loss": 5.5749, "step": 58 }, { "epoch": 0.011086057872980082, "grad_norm": 2.808939218521118, "learning_rate": 6.330184227833376e-05, "loss": 4.7221, "step": 59 }, { "epoch": 0.011273957158962795, "grad_norm": 2.9471914768218994, "learning_rate": 6.112604669781572e-05, "loss": 4.6405, "step": 60 }, { "epoch": 0.011461856444945509, "grad_norm": 2.7732815742492676, "learning_rate": 5.8927844739931834e-05, "loss": 4.311, "step": 61 }, { "epoch": 0.011649755730928222, "grad_norm": 2.873124599456787, "learning_rate": 5.6711663290882776e-05, "loss": 4.8382, "step": 62 }, { "epoch": 0.011837655016910935, "grad_norm": 2.967327117919922, "learning_rate": 5.448196544517168e-05, "loss": 4.4859, "step": 63 }, { "epoch": 0.012025554302893648, "grad_norm": 3.0220985412597656, "learning_rate": 5.2243241517525754e-05, "loss": 4.8879, "step": 64 }, { "epoch": 0.012213453588876362, "grad_norm": 5.608536720275879, "learning_rate": 5e-05, "loss": 4.8269, "step": 65 }, { "epoch": 0.012401352874859075, "grad_norm": 2.988602876663208, "learning_rate": 4.775675848247427e-05, "loss": 5.6113, "step": 66 }, { "epoch": 0.012589252160841788, "grad_norm": 2.927661180496216, "learning_rate": 4.551803455482833e-05, "loss": 4.4028, "step": 67 }, { "epoch": 0.012777151446824501, "grad_norm": 2.921163320541382, "learning_rate": 4.328833670911724e-05, "loss": 4.3565, "step": 68 }, { "epoch": 0.012965050732807215, "grad_norm": 3.0370848178863525, "learning_rate": 4.107215526006817e-05, "loss": 4.7969, "step": 69 }, { "epoch": 0.013152950018789928, "grad_norm": 3.3245177268981934, "learning_rate": 3.887395330218429e-05, "loss": 5.602, "step": 70 }, { "epoch": 0.013340849304772641, "grad_norm": 3.5475261211395264, "learning_rate": 3.6698157721666246e-05, "loss": 4.6884, "step": 71 }, { "epoch": 0.013528748590755355, "grad_norm": 3.5381057262420654, "learning_rate": 3.4549150281252636e-05, "loss": 5.7957, "step": 72 }, { "epoch": 0.013716647876738068, "grad_norm": 4.085561752319336, "learning_rate": 3.243125879593286e-05, "loss": 5.4665, "step": 73 }, { "epoch": 0.013904547162720781, "grad_norm": 4.82763671875, "learning_rate": 3.0348748417303823e-05, "loss": 5.9452, "step": 74 }, { "epoch": 0.014092446448703494, "grad_norm": 4.237555980682373, "learning_rate": 2.8305813044122097e-05, "loss": 5.4789, "step": 75 }, { "epoch": 0.014280345734686208, "grad_norm": 4.894902229309082, "learning_rate": 2.630656687635007e-05, "loss": 6.3016, "step": 76 }, { "epoch": 0.01446824502066892, "grad_norm": 4.798618316650391, "learning_rate": 2.43550361297047e-05, "loss": 5.9612, "step": 77 }, { "epoch": 0.014656144306651634, "grad_norm": 7.736368179321289, "learning_rate": 2.245515092739488e-05, "loss": 6.1193, "step": 78 }, { "epoch": 0.014844043592634347, "grad_norm": 5.555391788482666, "learning_rate": 2.061073738537635e-05, "loss": 6.2544, "step": 79 }, { "epoch": 0.01503194287861706, "grad_norm": 6.320089817047119, "learning_rate": 1.8825509907063327e-05, "loss": 5.6853, "step": 80 }, { "epoch": 0.015219842164599774, "grad_norm": 5.559424877166748, "learning_rate": 1.7103063703014372e-05, "loss": 4.8867, "step": 81 }, { "epoch": 0.015407741450582487, "grad_norm": 5.159763813018799, "learning_rate": 1.544686755065677e-05, "loss": 5.3778, "step": 82 }, { "epoch": 0.0155956407365652, "grad_norm": 5.190471649169922, "learning_rate": 1.3860256808630428e-05, "loss": 4.3495, "step": 83 }, { "epoch": 0.015783540022547914, "grad_norm": 5.162275314331055, "learning_rate": 1.2346426699819458e-05, "loss": 4.7606, "step": 84 }, { "epoch": 0.01597143930853063, "grad_norm": 5.0200347900390625, "learning_rate": 1.090842587659851e-05, "loss": 3.9641, "step": 85 }, { "epoch": 0.01615933859451334, "grad_norm": 4.721124172210693, "learning_rate": 9.549150281252633e-06, "loss": 3.9436, "step": 86 }, { "epoch": 0.016347237880496055, "grad_norm": 5.321893215179443, "learning_rate": 8.271337313934869e-06, "loss": 4.0494, "step": 87 }, { "epoch": 0.016535137166478767, "grad_norm": 5.561007022857666, "learning_rate": 7.077560319906695e-06, "loss": 4.481, "step": 88 }, { "epoch": 0.01672303645246148, "grad_norm": 6.137837886810303, "learning_rate": 5.9702234071631e-06, "loss": 4.0547, "step": 89 }, { "epoch": 0.016910935738444193, "grad_norm": 6.851164817810059, "learning_rate": 4.951556604879048e-06, "loss": 5.9195, "step": 90 }, { "epoch": 0.017098835024426908, "grad_norm": 6.387176036834717, "learning_rate": 4.023611372427471e-06, "loss": 6.3249, "step": 91 }, { "epoch": 0.01728673431040962, "grad_norm": 8.208516120910645, "learning_rate": 3.18825646801314e-06, "loss": 7.7682, "step": 92 }, { "epoch": 0.017474633596392335, "grad_norm": 8.880036354064941, "learning_rate": 2.4471741852423237e-06, "loss": 7.4679, "step": 93 }, { "epoch": 0.017662532882375046, "grad_norm": 9.490116119384766, "learning_rate": 1.8018569652073381e-06, "loss": 6.5789, "step": 94 }, { "epoch": 0.01785043216835776, "grad_norm": 10.559647560119629, "learning_rate": 1.2536043909088191e-06, "loss": 7.196, "step": 95 }, { "epoch": 0.018038331454340473, "grad_norm": 10.684134483337402, "learning_rate": 8.035205700685167e-07, "loss": 5.8043, "step": 96 }, { "epoch": 0.018226230740323188, "grad_norm": 9.896649360656738, "learning_rate": 4.52511911603265e-07, "loss": 5.7818, "step": 97 }, { "epoch": 0.0184141300263059, "grad_norm": 10.521429061889648, "learning_rate": 2.012853002380466e-07, "loss": 5.4138, "step": 98 }, { "epoch": 0.018602029312288614, "grad_norm": 12.07073974609375, "learning_rate": 5.0346672934270534e-08, "loss": 5.933, "step": 99 }, { "epoch": 0.018789928598271326, "grad_norm": 15.48339557647705, "learning_rate": 0.0, "loss": 5.1382, "step": 100 }, { "epoch": 0.018789928598271326, "eval_loss": 1.2692981958389282, "eval_runtime": 828.1962, "eval_samples_per_second": 10.824, "eval_steps_per_second": 5.412, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6778451655380173e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }