{ "best_metric": 0.07799232006072998, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.09390826903961755, "eval_steps": 100, "global_step": 124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007573247503194964, "grad_norm": 0.9691388607025146, "learning_rate": 2e-05, "loss": 1.3196, "step": 1 }, { "epoch": 0.0007573247503194964, "eval_loss": 1.60433828830719, "eval_runtime": 661.4874, "eval_samples_per_second": 6.724, "eval_steps_per_second": 1.681, "step": 1 }, { "epoch": 0.0015146495006389928, "grad_norm": 1.2349046468734741, "learning_rate": 4e-05, "loss": 1.5521, "step": 2 }, { "epoch": 0.002271974250958489, "grad_norm": 1.330872654914856, "learning_rate": 6e-05, "loss": 1.4937, "step": 3 }, { "epoch": 0.0030292990012779856, "grad_norm": 1.3608745336532593, "learning_rate": 8e-05, "loss": 1.5829, "step": 4 }, { "epoch": 0.003786623751597482, "grad_norm": 1.3821661472320557, "learning_rate": 0.0001, "loss": 1.4121, "step": 5 }, { "epoch": 0.004543948501916978, "grad_norm": 1.752526879310608, "learning_rate": 0.00012, "loss": 1.353, "step": 6 }, { "epoch": 0.005301273252236474, "grad_norm": 1.3683583736419678, "learning_rate": 0.00014, "loss": 1.2337, "step": 7 }, { "epoch": 0.006058598002555971, "grad_norm": 1.4449743032455444, "learning_rate": 0.00016, "loss": 0.9753, "step": 8 }, { "epoch": 0.006815922752875467, "grad_norm": 1.432096242904663, "learning_rate": 0.00018, "loss": 0.8742, "step": 9 }, { "epoch": 0.007573247503194964, "grad_norm": 1.2310497760772705, "learning_rate": 0.0002, "loss": 0.7185, "step": 10 }, { "epoch": 0.00833057225351446, "grad_norm": 1.158161997795105, "learning_rate": 0.00019996203070249516, "loss": 0.633, "step": 11 }, { "epoch": 0.009087897003833957, "grad_norm": 1.1543669700622559, "learning_rate": 0.00019984815164333163, "loss": 0.5801, "step": 12 }, { "epoch": 0.009845221754153453, "grad_norm": 0.9982590079307556, "learning_rate": 0.000199658449300667, "loss": 0.5354, "step": 13 }, { "epoch": 0.010602546504472949, "grad_norm": 0.8578788638114929, "learning_rate": 0.00019939306773179497, "loss": 0.5006, "step": 14 }, { "epoch": 0.011359871254792446, "grad_norm": 0.5724056363105774, "learning_rate": 0.00019905220846375032, "loss": 0.4145, "step": 15 }, { "epoch": 0.012117196005111942, "grad_norm": 0.5412246584892273, "learning_rate": 0.00019863613034027224, "loss": 0.4199, "step": 16 }, { "epoch": 0.012874520755431438, "grad_norm": 0.4610505700111389, "learning_rate": 0.0001981451493252418, "loss": 0.4001, "step": 17 }, { "epoch": 0.013631845505750934, "grad_norm": 0.47394484281539917, "learning_rate": 0.00019757963826274357, "loss": 0.3721, "step": 18 }, { "epoch": 0.014389170256070432, "grad_norm": 0.43060505390167236, "learning_rate": 0.00019694002659393305, "loss": 0.3345, "step": 19 }, { "epoch": 0.015146495006389928, "grad_norm": 0.46104079484939575, "learning_rate": 0.00019622680003092503, "loss": 0.4648, "step": 20 }, { "epoch": 0.015903819756709424, "grad_norm": 0.38638371229171753, "learning_rate": 0.00019544050018795075, "loss": 0.2985, "step": 21 }, { "epoch": 0.01666114450702892, "grad_norm": 0.4296252727508545, "learning_rate": 0.00019458172417006347, "loss": 0.2837, "step": 22 }, { "epoch": 0.017418469257348416, "grad_norm": 0.4201738238334656, "learning_rate": 0.0001936511241197055, "loss": 0.2629, "step": 23 }, { "epoch": 0.018175794007667914, "grad_norm": 0.46103158593177795, "learning_rate": 0.00019264940672148018, "loss": 0.2894, "step": 24 }, { "epoch": 0.01893311875798741, "grad_norm": 0.36362382769584656, "learning_rate": 0.00019157733266550575, "loss": 0.2094, "step": 25 }, { "epoch": 0.019690443508306905, "grad_norm": 0.42857611179351807, "learning_rate": 0.00019043571606975777, "loss": 0.2518, "step": 26 }, { "epoch": 0.020447768258626403, "grad_norm": 0.7166701555252075, "learning_rate": 0.0001892254238618394, "loss": 0.2777, "step": 27 }, { "epoch": 0.021205093008945897, "grad_norm": 0.3727664053440094, "learning_rate": 0.0001879473751206489, "loss": 0.2025, "step": 28 }, { "epoch": 0.021962417759265395, "grad_norm": 0.3549087941646576, "learning_rate": 0.00018660254037844388, "loss": 0.1908, "step": 29 }, { "epoch": 0.022719742509584893, "grad_norm": 0.33737561106681824, "learning_rate": 0.00018519194088383273, "loss": 0.1957, "step": 30 }, { "epoch": 0.023477067259904387, "grad_norm": 0.34675121307373047, "learning_rate": 0.00018371664782625287, "loss": 0.1625, "step": 31 }, { "epoch": 0.024234392010223885, "grad_norm": 0.3653319478034973, "learning_rate": 0.0001821777815225245, "loss": 0.1832, "step": 32 }, { "epoch": 0.02499171676054338, "grad_norm": 0.31412753462791443, "learning_rate": 0.00018057651056609784, "loss": 0.1717, "step": 33 }, { "epoch": 0.025749041510862877, "grad_norm": 0.3502964675426483, "learning_rate": 0.00017891405093963938, "loss": 0.159, "step": 34 }, { "epoch": 0.026506366261182374, "grad_norm": 0.3261137008666992, "learning_rate": 0.0001771916650916321, "loss": 0.1488, "step": 35 }, { "epoch": 0.02726369101150187, "grad_norm": 0.31498923897743225, "learning_rate": 0.00017541066097768963, "loss": 0.156, "step": 36 }, { "epoch": 0.028021015761821366, "grad_norm": 0.3408859670162201, "learning_rate": 0.00017357239106731317, "loss": 0.1598, "step": 37 }, { "epoch": 0.028778340512140864, "grad_norm": 0.324367880821228, "learning_rate": 0.00017167825131684513, "loss": 0.145, "step": 38 }, { "epoch": 0.029535665262460358, "grad_norm": 0.31265532970428467, "learning_rate": 0.00016972968010939954, "loss": 0.1514, "step": 39 }, { "epoch": 0.030292990012779856, "grad_norm": 0.34943684935569763, "learning_rate": 0.00016772815716257412, "loss": 0.1219, "step": 40 }, { "epoch": 0.03105031476309935, "grad_norm": 0.3439270853996277, "learning_rate": 0.00016567520240477344, "loss": 0.135, "step": 41 }, { "epoch": 0.03180763951341885, "grad_norm": 0.29284727573394775, "learning_rate": 0.00016357237482099684, "loss": 0.1091, "step": 42 }, { "epoch": 0.032564964263738345, "grad_norm": 0.3866511881351471, "learning_rate": 0.0001614212712689668, "loss": 0.1463, "step": 43 }, { "epoch": 0.03332228901405784, "grad_norm": 0.35192635655403137, "learning_rate": 0.00015922352526649803, "loss": 0.1351, "step": 44 }, { "epoch": 0.034079613764377334, "grad_norm": 0.3105197846889496, "learning_rate": 0.00015698080575102661, "loss": 0.125, "step": 45 }, { "epoch": 0.03483693851469683, "grad_norm": 0.4554622173309326, "learning_rate": 0.00015469481581224272, "loss": 0.1594, "step": 46 }, { "epoch": 0.03559426326501633, "grad_norm": 0.34930619597435, "learning_rate": 0.00015236729139878782, "loss": 0.1222, "step": 47 }, { "epoch": 0.03635158801533583, "grad_norm": 0.35271599888801575, "learning_rate": 0.00015000000000000001, "loss": 0.116, "step": 48 }, { "epoch": 0.037108912765655325, "grad_norm": 0.29121342301368713, "learning_rate": 0.00014759473930370736, "loss": 0.1188, "step": 49 }, { "epoch": 0.03786623751597482, "grad_norm": 0.320047527551651, "learning_rate": 0.00014515333583108896, "loss": 0.1343, "step": 50 }, { "epoch": 0.03862356226629431, "grad_norm": 0.38673272728919983, "learning_rate": 0.00014267764354964038, "loss": 0.1142, "step": 51 }, { "epoch": 0.03938088701661381, "grad_norm": 0.5377465486526489, "learning_rate": 0.00014016954246529696, "loss": 0.3442, "step": 52 }, { "epoch": 0.04013821176693331, "grad_norm": 0.33489689230918884, "learning_rate": 0.00013763093719478358, "loss": 0.113, "step": 53 }, { "epoch": 0.040895536517252806, "grad_norm": 0.29426538944244385, "learning_rate": 0.00013506375551927547, "loss": 0.092, "step": 54 }, { "epoch": 0.041652861267572304, "grad_norm": 0.2807617485523224, "learning_rate": 0.00013246994692046836, "loss": 0.1682, "step": 55 }, { "epoch": 0.042410186017891795, "grad_norm": 0.32574039697647095, "learning_rate": 0.00012985148110016947, "loss": 0.1055, "step": 56 }, { "epoch": 0.04316751076821129, "grad_norm": 0.37014421820640564, "learning_rate": 0.00012721034648453353, "loss": 0.1322, "step": 57 }, { "epoch": 0.04392483551853079, "grad_norm": 0.2937864065170288, "learning_rate": 0.00012454854871407994, "loss": 0.0958, "step": 58 }, { "epoch": 0.04468216026885029, "grad_norm": 0.2736242711544037, "learning_rate": 0.0001218681091206376, "loss": 0.0937, "step": 59 }, { "epoch": 0.045439485019169785, "grad_norm": 0.2477613240480423, "learning_rate": 0.00011917106319237386, "loss": 0.0855, "step": 60 }, { "epoch": 0.046196809769489276, "grad_norm": 0.34489548206329346, "learning_rate": 0.00011645945902807341, "loss": 0.0924, "step": 61 }, { "epoch": 0.046954134519808774, "grad_norm": 0.28233641386032104, "learning_rate": 0.00011373535578184082, "loss": 0.0963, "step": 62 }, { "epoch": 0.04771145927012827, "grad_norm": 0.259147584438324, "learning_rate": 0.00011100082209940795, "loss": 0.0803, "step": 63 }, { "epoch": 0.04846878402044777, "grad_norm": 0.2993817627429962, "learning_rate": 0.00010825793454723325, "loss": 0.1052, "step": 64 }, { "epoch": 0.04922610877076727, "grad_norm": 0.30414098501205444, "learning_rate": 0.00010550877603558655, "loss": 0.0841, "step": 65 }, { "epoch": 0.04998343352108676, "grad_norm": 0.39788779616355896, "learning_rate": 0.00010275543423681621, "loss": 0.0723, "step": 66 }, { "epoch": 0.050740758271406255, "grad_norm": 0.28072524070739746, "learning_rate": 0.0001, "loss": 0.0866, "step": 67 }, { "epoch": 0.05149808302172575, "grad_norm": 0.2475721836090088, "learning_rate": 9.724456576318381e-05, "loss": 0.089, "step": 68 }, { "epoch": 0.05225540777204525, "grad_norm": 0.4116728901863098, "learning_rate": 9.449122396441345e-05, "loss": 0.3701, "step": 69 }, { "epoch": 0.05301273252236475, "grad_norm": 0.3344607353210449, "learning_rate": 9.174206545276677e-05, "loss": 0.1114, "step": 70 }, { "epoch": 0.053770057272684246, "grad_norm": 0.27849143743515015, "learning_rate": 8.899917790059208e-05, "loss": 0.0977, "step": 71 }, { "epoch": 0.05452738202300374, "grad_norm": 0.3085162341594696, "learning_rate": 8.626464421815919e-05, "loss": 0.0901, "step": 72 }, { "epoch": 0.055284706773323235, "grad_norm": 0.28304585814476013, "learning_rate": 8.35405409719266e-05, "loss": 0.0881, "step": 73 }, { "epoch": 0.05604203152364273, "grad_norm": 0.31556403636932373, "learning_rate": 8.082893680762619e-05, "loss": 0.0758, "step": 74 }, { "epoch": 0.05679935627396223, "grad_norm": 0.25185614824295044, "learning_rate": 7.813189087936243e-05, "loss": 0.0932, "step": 75 }, { "epoch": 0.05755668102428173, "grad_norm": 0.31156933307647705, "learning_rate": 7.54514512859201e-05, "loss": 0.0927, "step": 76 }, { "epoch": 0.05831400577460122, "grad_norm": 0.29737532138824463, "learning_rate": 7.278965351546648e-05, "loss": 0.0872, "step": 77 }, { "epoch": 0.059071330524920716, "grad_norm": 0.3499886393547058, "learning_rate": 7.014851889983057e-05, "loss": 0.0822, "step": 78 }, { "epoch": 0.059828655275240214, "grad_norm": 0.31357136368751526, "learning_rate": 6.753005307953167e-05, "loss": 0.0902, "step": 79 }, { "epoch": 0.06058598002555971, "grad_norm": 0.2909204065799713, "learning_rate": 6.493624448072457e-05, "loss": 0.0752, "step": 80 }, { "epoch": 0.06134330477587921, "grad_norm": 0.2916364371776581, "learning_rate": 6.236906280521646e-05, "loss": 0.0905, "step": 81 }, { "epoch": 0.0621006295261987, "grad_norm": 0.3733891546726227, "learning_rate": 5.983045753470308e-05, "loss": 0.0775, "step": 82 }, { "epoch": 0.0628579542765182, "grad_norm": 0.23621852695941925, "learning_rate": 5.732235645035964e-05, "loss": 0.075, "step": 83 }, { "epoch": 0.0636152790268377, "grad_norm": 0.2937829792499542, "learning_rate": 5.484666416891109e-05, "loss": 0.1098, "step": 84 }, { "epoch": 0.06437260377715719, "grad_norm": 0.24242699146270752, "learning_rate": 5.240526069629265e-05, "loss": 0.0769, "step": 85 }, { "epoch": 0.06512992852747669, "grad_norm": 0.47885003685951233, "learning_rate": 5.000000000000002e-05, "loss": 0.225, "step": 86 }, { "epoch": 0.06588725327779618, "grad_norm": 0.2480865865945816, "learning_rate": 4.763270860121222e-05, "loss": 0.0739, "step": 87 }, { "epoch": 0.06664457802811569, "grad_norm": 0.33025848865509033, "learning_rate": 4.530518418775733e-05, "loss": 0.0982, "step": 88 }, { "epoch": 0.06740190277843518, "grad_norm": 0.26735973358154297, "learning_rate": 4.301919424897338e-05, "loss": 0.0774, "step": 89 }, { "epoch": 0.06815922752875467, "grad_norm": 0.3508649170398712, "learning_rate": 4.077647473350201e-05, "loss": 0.0848, "step": 90 }, { "epoch": 0.06891655227907417, "grad_norm": 0.2726826071739197, "learning_rate": 3.857872873103322e-05, "loss": 0.0847, "step": 91 }, { "epoch": 0.06967387702939366, "grad_norm": 0.2948499023914337, "learning_rate": 3.642762517900322e-05, "loss": 0.076, "step": 92 }, { "epoch": 0.07043120177971317, "grad_norm": 0.28020283579826355, "learning_rate": 3.4324797595226565e-05, "loss": 0.0618, "step": 93 }, { "epoch": 0.07118852653003266, "grad_norm": 0.293235182762146, "learning_rate": 3.227184283742591e-05, "loss": 0.0931, "step": 94 }, { "epoch": 0.07194585128035215, "grad_norm": 0.22709383070468903, "learning_rate": 3.0270319890600462e-05, "loss": 0.0661, "step": 95 }, { "epoch": 0.07270317603067165, "grad_norm": 0.2769714593887329, "learning_rate": 2.8321748683154893e-05, "loss": 0.075, "step": 96 }, { "epoch": 0.07346050078099114, "grad_norm": 0.2676548957824707, "learning_rate": 2.6427608932686843e-05, "loss": 0.0885, "step": 97 }, { "epoch": 0.07421782553131065, "grad_norm": 0.2752656936645508, "learning_rate": 2.4589339022310386e-05, "loss": 0.0751, "step": 98 }, { "epoch": 0.07497515028163014, "grad_norm": 0.32387983798980713, "learning_rate": 2.2808334908367914e-05, "loss": 0.0886, "step": 99 }, { "epoch": 0.07573247503194964, "grad_norm": 0.3434562683105469, "learning_rate": 2.1085949060360654e-05, "loss": 0.1105, "step": 100 }, { "epoch": 0.07573247503194964, "eval_loss": 0.07799232006072998, "eval_runtime": 665.6799, "eval_samples_per_second": 6.682, "eval_steps_per_second": 1.67, "step": 100 }, { "epoch": 0.07648979978226914, "grad_norm": 0.2453308403491974, "learning_rate": 1.9423489433902186e-05, "loss": 0.0635, "step": 101 }, { "epoch": 0.07724712453258863, "grad_norm": 0.286064088344574, "learning_rate": 1.7822218477475494e-05, "loss": 0.0783, "step": 102 }, { "epoch": 0.07800444928290813, "grad_norm": 0.29971766471862793, "learning_rate": 1.6283352173747145e-05, "loss": 0.0698, "step": 103 }, { "epoch": 0.07876177403322762, "grad_norm": 0.3119301497936249, "learning_rate": 1.4808059116167305e-05, "loss": 0.0773, "step": 104 }, { "epoch": 0.07951909878354713, "grad_norm": 0.20733344554901123, "learning_rate": 1.339745962155613e-05, "loss": 0.0604, "step": 105 }, { "epoch": 0.08027642353386662, "grad_norm": 0.25688570737838745, "learning_rate": 1.2052624879351104e-05, "loss": 0.0638, "step": 106 }, { "epoch": 0.08103374828418611, "grad_norm": 0.28935614228248596, "learning_rate": 1.0774576138160597e-05, "loss": 0.0713, "step": 107 }, { "epoch": 0.08179107303450561, "grad_norm": 0.24278295040130615, "learning_rate": 9.564283930242257e-06, "loss": 0.1038, "step": 108 }, { "epoch": 0.0825483977848251, "grad_norm": 0.30280330777168274, "learning_rate": 8.422667334494249e-06, "loss": 0.0981, "step": 109 }, { "epoch": 0.08330572253514461, "grad_norm": 0.2627258598804474, "learning_rate": 7.350593278519824e-06, "loss": 0.0663, "step": 110 }, { "epoch": 0.0840630472854641, "grad_norm": 0.2400035709142685, "learning_rate": 6.3488758802945354e-06, "loss": 0.071, "step": 111 }, { "epoch": 0.08482037203578359, "grad_norm": 0.31241780519485474, "learning_rate": 5.418275829936537e-06, "loss": 0.0658, "step": 112 }, { "epoch": 0.0855776967861031, "grad_norm": 0.3294007480144501, "learning_rate": 4.559499812049251e-06, "loss": 0.0978, "step": 113 }, { "epoch": 0.08633502153642258, "grad_norm": 0.22428545355796814, "learning_rate": 3.7731999690749585e-06, "loss": 0.0613, "step": 114 }, { "epoch": 0.08709234628674209, "grad_norm": 0.28080588579177856, "learning_rate": 3.059973406066963e-06, "loss": 0.0716, "step": 115 }, { "epoch": 0.08784967103706158, "grad_norm": 0.26730701327323914, "learning_rate": 2.420361737256438e-06, "loss": 0.0673, "step": 116 }, { "epoch": 0.08860699578738107, "grad_norm": 0.34970346093177795, "learning_rate": 1.8548506747582129e-06, "loss": 0.0773, "step": 117 }, { "epoch": 0.08936432053770058, "grad_norm": 0.2649919390678406, "learning_rate": 1.3638696597277679e-06, "loss": 0.0659, "step": 118 }, { "epoch": 0.09012164528802007, "grad_norm": 0.2567705512046814, "learning_rate": 9.477915362496758e-07, "loss": 0.0663, "step": 119 }, { "epoch": 0.09087897003833957, "grad_norm": 0.2665155827999115, "learning_rate": 6.069322682050516e-07, "loss": 0.0572, "step": 120 }, { "epoch": 0.09163629478865906, "grad_norm": 0.25261205434799194, "learning_rate": 3.415506993330153e-07, "loss": 0.0683, "step": 121 }, { "epoch": 0.09239361953897855, "grad_norm": 0.26622474193573, "learning_rate": 1.518483566683826e-07, "loss": 0.0637, "step": 122 }, { "epoch": 0.09315094428929806, "grad_norm": 0.2766965925693512, "learning_rate": 3.796929750485845e-08, "loss": 0.0843, "step": 123 }, { "epoch": 0.09390826903961755, "grad_norm": 0.2631254494190216, "learning_rate": 0.0, "loss": 0.08, "step": 124 } ], "logging_steps": 1, "max_steps": 124, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.4796378195021e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }