{ "best_metric": 0.22296833992004395, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.5586592178770949, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002793296089385475, "eval_loss": 3.2835192680358887, "eval_runtime": 17.0347, "eval_samples_per_second": 35.398, "eval_steps_per_second": 8.864, "step": 1 }, { "epoch": 0.008379888268156424, "grad_norm": 0.6404779553413391, "learning_rate": 3e-05, "loss": 2.4727, "step": 3 }, { "epoch": 0.01675977653631285, "grad_norm": 0.7394810914993286, "learning_rate": 6e-05, "loss": 2.6608, "step": 6 }, { "epoch": 0.025139664804469275, "grad_norm": 0.7213653326034546, "learning_rate": 9e-05, "loss": 2.8923, "step": 9 }, { "epoch": 0.0335195530726257, "grad_norm": 0.9916292428970337, "learning_rate": 9.997266286704631e-05, "loss": 2.4576, "step": 12 }, { "epoch": 0.04189944134078212, "grad_norm": 0.8784328699111938, "learning_rate": 9.98292246503335e-05, "loss": 2.013, "step": 15 }, { "epoch": 0.05027932960893855, "grad_norm": 0.6947599649429321, "learning_rate": 9.956320346634876e-05, "loss": 1.6163, "step": 18 }, { "epoch": 0.05865921787709497, "grad_norm": 0.9758700728416443, "learning_rate": 9.917525374361912e-05, "loss": 1.3896, "step": 21 }, { "epoch": 0.0670391061452514, "grad_norm": 0.7843855023384094, "learning_rate": 9.86663298624003e-05, "loss": 1.0163, "step": 24 }, { "epoch": 0.07541899441340782, "grad_norm": 0.9721394777297974, "learning_rate": 9.803768380684242e-05, "loss": 1.104, "step": 27 }, { "epoch": 0.08379888268156424, "grad_norm": 0.7433673143386841, "learning_rate": 9.729086208503174e-05, "loss": 1.1567, "step": 30 }, { "epoch": 0.09217877094972067, "grad_norm": 0.730903148651123, "learning_rate": 9.642770192448536e-05, "loss": 1.1533, "step": 33 }, { "epoch": 0.1005586592178771, "grad_norm": 0.6961589455604553, "learning_rate": 9.545032675245813e-05, "loss": 1.212, "step": 36 }, { "epoch": 0.10893854748603352, "grad_norm": 0.9013598561286926, "learning_rate": 9.43611409721806e-05, "loss": 1.1623, "step": 39 }, { "epoch": 0.11731843575418995, "grad_norm": 1.0598002672195435, "learning_rate": 9.316282404787871e-05, "loss": 1.2984, "step": 42 }, { "epoch": 0.12569832402234637, "grad_norm": 1.1929737329483032, "learning_rate": 9.185832391312644e-05, "loss": 1.3138, "step": 45 }, { "epoch": 0.1340782122905028, "grad_norm": 1.1458548307418823, "learning_rate": 9.045084971874738e-05, "loss": 1.0941, "step": 48 }, { "epoch": 0.13966480446927373, "eval_loss": 1.0100281238555908, "eval_runtime": 17.101, "eval_samples_per_second": 35.261, "eval_steps_per_second": 8.83, "step": 50 }, { "epoch": 0.1424581005586592, "grad_norm": 4.544776916503906, "learning_rate": 8.894386393810563e-05, "loss": 1.6615, "step": 51 }, { "epoch": 0.15083798882681565, "grad_norm": 2.975677967071533, "learning_rate": 8.73410738492077e-05, "loss": 1.7424, "step": 54 }, { "epoch": 0.15921787709497207, "grad_norm": 1.2031599283218384, "learning_rate": 8.564642241456986e-05, "loss": 1.0062, "step": 57 }, { "epoch": 0.16759776536312848, "grad_norm": 0.7234984636306763, "learning_rate": 8.386407858128706e-05, "loss": 0.6319, "step": 60 }, { "epoch": 0.17597765363128492, "grad_norm": 0.6598789691925049, "learning_rate": 8.199842702516583e-05, "loss": 0.5383, "step": 63 }, { "epoch": 0.18435754189944134, "grad_norm": 0.5707410573959351, "learning_rate": 8.005405736415126e-05, "loss": 0.51, "step": 66 }, { "epoch": 0.19273743016759776, "grad_norm": 0.44198137521743774, "learning_rate": 7.803575286758364e-05, "loss": 0.447, "step": 69 }, { "epoch": 0.2011173184357542, "grad_norm": 0.5998854637145996, "learning_rate": 7.594847868906076e-05, "loss": 0.412, "step": 72 }, { "epoch": 0.20949720670391062, "grad_norm": 0.5717155337333679, "learning_rate": 7.379736965185368e-05, "loss": 0.4041, "step": 75 }, { "epoch": 0.21787709497206703, "grad_norm": 0.6459977626800537, "learning_rate": 7.158771761692464e-05, "loss": 0.5011, "step": 78 }, { "epoch": 0.22625698324022347, "grad_norm": 0.6296533346176147, "learning_rate": 6.932495846462261e-05, "loss": 0.4557, "step": 81 }, { "epoch": 0.2346368715083799, "grad_norm": 0.914438009262085, "learning_rate": 6.701465872208216e-05, "loss": 0.4752, "step": 84 }, { "epoch": 0.2430167597765363, "grad_norm": 1.0927045345306396, "learning_rate": 6.466250186922325e-05, "loss": 0.4553, "step": 87 }, { "epoch": 0.25139664804469275, "grad_norm": 1.3984386920928955, "learning_rate": 6.227427435703997e-05, "loss": 0.4644, "step": 90 }, { "epoch": 0.25977653631284914, "grad_norm": 1.0319483280181885, "learning_rate": 5.985585137257401e-05, "loss": 0.4658, "step": 93 }, { "epoch": 0.2681564245810056, "grad_norm": 0.9646169543266296, "learning_rate": 5.74131823855921e-05, "loss": 0.3963, "step": 96 }, { "epoch": 0.276536312849162, "grad_norm": 1.8801255226135254, "learning_rate": 5.495227651252315e-05, "loss": 0.4449, "step": 99 }, { "epoch": 0.27932960893854747, "eval_loss": 0.5376428961753845, "eval_runtime": 17.1283, "eval_samples_per_second": 35.205, "eval_steps_per_second": 8.816, "step": 100 }, { "epoch": 0.2849162011173184, "grad_norm": 2.7768867015838623, "learning_rate": 5.247918773366112e-05, "loss": 1.171, "step": 102 }, { "epoch": 0.29329608938547486, "grad_norm": 1.3445883989334106, "learning_rate": 5e-05, "loss": 0.5237, "step": 105 }, { "epoch": 0.3016759776536313, "grad_norm": 1.087411880493164, "learning_rate": 4.7520812266338885e-05, "loss": 0.4251, "step": 108 }, { "epoch": 0.3100558659217877, "grad_norm": 0.613654613494873, "learning_rate": 4.504772348747687e-05, "loss": 0.2683, "step": 111 }, { "epoch": 0.31843575418994413, "grad_norm": 0.8458730578422546, "learning_rate": 4.2586817614407895e-05, "loss": 0.279, "step": 114 }, { "epoch": 0.3268156424581006, "grad_norm": 0.5617647171020508, "learning_rate": 4.0144148627425993e-05, "loss": 0.253, "step": 117 }, { "epoch": 0.33519553072625696, "grad_norm": 0.6647018194198608, "learning_rate": 3.772572564296005e-05, "loss": 0.2877, "step": 120 }, { "epoch": 0.3435754189944134, "grad_norm": 0.4727783501148224, "learning_rate": 3.533749813077677e-05, "loss": 0.2442, "step": 123 }, { "epoch": 0.35195530726256985, "grad_norm": 0.5167345404624939, "learning_rate": 3.298534127791785e-05, "loss": 0.2438, "step": 126 }, { "epoch": 0.36033519553072624, "grad_norm": 0.6400461792945862, "learning_rate": 3.0675041535377405e-05, "loss": 0.2373, "step": 129 }, { "epoch": 0.3687150837988827, "grad_norm": 0.7134339809417725, "learning_rate": 2.8412282383075363e-05, "loss": 0.2713, "step": 132 }, { "epoch": 0.3770949720670391, "grad_norm": 0.6741611361503601, "learning_rate": 2.6202630348146324e-05, "loss": 0.2519, "step": 135 }, { "epoch": 0.3854748603351955, "grad_norm": 0.6766725182533264, "learning_rate": 2.405152131093926e-05, "loss": 0.3096, "step": 138 }, { "epoch": 0.39385474860335196, "grad_norm": 0.7892684936523438, "learning_rate": 2.196424713241637e-05, "loss": 0.3055, "step": 141 }, { "epoch": 0.4022346368715084, "grad_norm": 0.6781839728355408, "learning_rate": 1.9945942635848748e-05, "loss": 0.3243, "step": 144 }, { "epoch": 0.4106145251396648, "grad_norm": 0.8571739196777344, "learning_rate": 1.800157297483417e-05, "loss": 0.3238, "step": 147 }, { "epoch": 0.41899441340782123, "grad_norm": 1.0887181758880615, "learning_rate": 1.6135921418712956e-05, "loss": 0.3541, "step": 150 }, { "epoch": 0.41899441340782123, "eval_loss": 0.2837798595428467, "eval_runtime": 17.1062, "eval_samples_per_second": 35.25, "eval_steps_per_second": 8.827, "step": 150 }, { "epoch": 0.4273743016759777, "grad_norm": 1.305745244026184, "learning_rate": 1.435357758543015e-05, "loss": 0.5962, "step": 153 }, { "epoch": 0.43575418994413406, "grad_norm": 0.8109059929847717, "learning_rate": 1.2658926150792322e-05, "loss": 0.2629, "step": 156 }, { "epoch": 0.4441340782122905, "grad_norm": 1.5648621320724487, "learning_rate": 1.1056136061894384e-05, "loss": 0.2267, "step": 159 }, { "epoch": 0.45251396648044695, "grad_norm": 1.0514497756958008, "learning_rate": 9.549150281252633e-06, "loss": 0.2514, "step": 162 }, { "epoch": 0.46089385474860334, "grad_norm": 0.6054957509040833, "learning_rate": 8.141676086873572e-06, "loss": 0.1733, "step": 165 }, { "epoch": 0.4692737430167598, "grad_norm": 0.910860538482666, "learning_rate": 6.837175952121306e-06, "loss": 0.2035, "step": 168 }, { "epoch": 0.4776536312849162, "grad_norm": 1.2614514827728271, "learning_rate": 5.6388590278194096e-06, "loss": 0.2135, "step": 171 }, { "epoch": 0.4860335195530726, "grad_norm": 0.7164639234542847, "learning_rate": 4.549673247541875e-06, "loss": 0.1844, "step": 174 }, { "epoch": 0.49441340782122906, "grad_norm": 0.6105827689170837, "learning_rate": 3.5722980755146517e-06, "loss": 0.2386, "step": 177 }, { "epoch": 0.5027932960893855, "grad_norm": 0.95228511095047, "learning_rate": 2.7091379149682685e-06, "loss": 0.2286, "step": 180 }, { "epoch": 0.5111731843575419, "grad_norm": 0.7105367183685303, "learning_rate": 1.962316193157593e-06, "loss": 0.1992, "step": 183 }, { "epoch": 0.5195530726256983, "grad_norm": 0.7583916187286377, "learning_rate": 1.333670137599713e-06, "loss": 0.2144, "step": 186 }, { "epoch": 0.5279329608938548, "grad_norm": 0.7920324206352234, "learning_rate": 8.247462563808817e-07, "loss": 0.2418, "step": 189 }, { "epoch": 0.5363128491620112, "grad_norm": 0.847964346408844, "learning_rate": 4.367965336512403e-07, "loss": 0.259, "step": 192 }, { "epoch": 0.5446927374301676, "grad_norm": 0.6220384836196899, "learning_rate": 1.7077534966650766e-07, "loss": 0.282, "step": 195 }, { "epoch": 0.553072625698324, "grad_norm": 0.7298959493637085, "learning_rate": 2.7337132953697554e-08, "loss": 0.2359, "step": 198 }, { "epoch": 0.5586592178770949, "eval_loss": 0.22296833992004395, "eval_runtime": 17.0705, "eval_samples_per_second": 35.324, "eval_steps_per_second": 8.846, "step": 200 } ], "logging_steps": 3, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.61808020979712e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }