{ "best_metric": 0.7434989809989929, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 1.0049566294919454, "eval_steps": 50, "global_step": 101, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009913258983890954, "grad_norm": 0.48874759674072266, "learning_rate": 2e-05, "loss": 1.1772, "step": 1 }, { "epoch": 0.009913258983890954, "eval_loss": 1.1726865768432617, "eval_runtime": 45.2328, "eval_samples_per_second": 7.517, "eval_steps_per_second": 1.879, "step": 1 }, { "epoch": 0.01982651796778191, "grad_norm": 0.474133163690567, "learning_rate": 4e-05, "loss": 1.1973, "step": 2 }, { "epoch": 0.02973977695167286, "grad_norm": 0.5018295645713806, "learning_rate": 6e-05, "loss": 1.2222, "step": 3 }, { "epoch": 0.03965303593556382, "grad_norm": 0.45751234889030457, "learning_rate": 8e-05, "loss": 1.2127, "step": 4 }, { "epoch": 0.04956629491945477, "grad_norm": 0.5277777314186096, "learning_rate": 0.0001, "loss": 1.1466, "step": 5 }, { "epoch": 0.05947955390334572, "grad_norm": 0.4355424642562866, "learning_rate": 0.00012, "loss": 1.0944, "step": 6 }, { "epoch": 0.06939281288723669, "grad_norm": 0.4266590476036072, "learning_rate": 0.00014, "loss": 1.0243, "step": 7 }, { "epoch": 0.07930607187112763, "grad_norm": 0.8596389293670654, "learning_rate": 0.00016, "loss": 0.9531, "step": 8 }, { "epoch": 0.08921933085501858, "grad_norm": 0.39760446548461914, "learning_rate": 0.00018, "loss": 0.915, "step": 9 }, { "epoch": 0.09913258983890955, "grad_norm": 0.3741200268268585, "learning_rate": 0.0002, "loss": 0.9735, "step": 10 }, { "epoch": 0.1090458488228005, "grad_norm": 0.3944251835346222, "learning_rate": 0.00019994041405510705, "loss": 0.9357, "step": 11 }, { "epoch": 0.11895910780669144, "grad_norm": 0.3373732268810272, "learning_rate": 0.0001997617272301248, "loss": 0.9593, "step": 12 }, { "epoch": 0.1288723667905824, "grad_norm": 0.3242512047290802, "learning_rate": 0.0001994641524695193, "loss": 0.9296, "step": 13 }, { "epoch": 0.13878562577447337, "grad_norm": 0.3163102865219116, "learning_rate": 0.00019904804439875633, "loss": 0.9026, "step": 14 }, { "epoch": 0.14869888475836432, "grad_norm": 0.3128542900085449, "learning_rate": 0.0001985138989016874, "loss": 0.9143, "step": 15 }, { "epoch": 0.15861214374225527, "grad_norm": 0.29705294966697693, "learning_rate": 0.00019786235252959553, "loss": 0.9004, "step": 16 }, { "epoch": 0.16852540272614622, "grad_norm": 0.3244360089302063, "learning_rate": 0.0001970941817426052, "loss": 0.7955, "step": 17 }, { "epoch": 0.17843866171003717, "grad_norm": 0.2974328398704529, "learning_rate": 0.00019621030198436006, "loss": 0.9283, "step": 18 }, { "epoch": 0.18835192069392812, "grad_norm": 0.2881193459033966, "learning_rate": 0.00019521176659107142, "loss": 0.8537, "step": 19 }, { "epoch": 0.1982651796778191, "grad_norm": 0.2800627052783966, "learning_rate": 0.00019409976553623766, "loss": 0.8994, "step": 20 }, { "epoch": 0.20817843866171004, "grad_norm": 0.2811860740184784, "learning_rate": 0.00019287562401253022, "loss": 0.8763, "step": 21 }, { "epoch": 0.218091697645601, "grad_norm": 0.25567537546157837, "learning_rate": 0.00019154080085253666, "loss": 0.8475, "step": 22 }, { "epoch": 0.22800495662949194, "grad_norm": 0.2695324718952179, "learning_rate": 0.0001900968867902419, "loss": 0.87, "step": 23 }, { "epoch": 0.2379182156133829, "grad_norm": 0.2898944914340973, "learning_rate": 0.000188545602565321, "loss": 0.8119, "step": 24 }, { "epoch": 
0.24783147459727387, "grad_norm": 0.2776627838611603, "learning_rate": 0.00018688879687250067, "loss": 0.8292, "step": 25 }, { "epoch": 0.2577447335811648, "grad_norm": 0.28858432173728943, "learning_rate": 0.00018512844415843514, "loss": 0.8165, "step": 26 }, { "epoch": 0.26765799256505574, "grad_norm": 0.31470319628715515, "learning_rate": 0.00018326664226872065, "loss": 0.8366, "step": 27 }, { "epoch": 0.27757125154894674, "grad_norm": 0.3070776164531708, "learning_rate": 0.00018130560994785325, "loss": 0.8505, "step": 28 }, { "epoch": 0.2874845105328377, "grad_norm": 0.3148271441459656, "learning_rate": 0.00017924768419510904, "loss": 0.8784, "step": 29 }, { "epoch": 0.29739776951672864, "grad_norm": 0.281838983297348, "learning_rate": 0.00017709531747949796, "loss": 0.8435, "step": 30 }, { "epoch": 0.3073110285006196, "grad_norm": 0.2795185446739197, "learning_rate": 0.00017485107481711012, "loss": 0.8421, "step": 31 }, { "epoch": 0.31722428748451054, "grad_norm": 0.2741798758506775, "learning_rate": 0.00017251763071433765, "loss": 0.7954, "step": 32 }, { "epoch": 0.3271375464684015, "grad_norm": 0.28247004747390747, "learning_rate": 0.00017009776598061495, "loss": 0.7691, "step": 33 }, { "epoch": 0.33705080545229243, "grad_norm": 0.3221910893917084, "learning_rate": 0.00016759436441447545, "loss": 0.8613, "step": 34 }, { "epoch": 0.3469640644361834, "grad_norm": 0.300059974193573, "learning_rate": 0.00016501040936687443, "loss": 0.814, "step": 35 }, { "epoch": 0.35687732342007433, "grad_norm": 0.3095373213291168, "learning_rate": 0.00016234898018587337, "loss": 0.8116, "step": 36 }, { "epoch": 0.3667905824039653, "grad_norm": 0.28610774874687195, "learning_rate": 0.00015961324854692254, "loss": 0.8363, "step": 37 }, { "epoch": 0.37670384138785623, "grad_norm": 0.3051545023918152, "learning_rate": 0.00015680647467311557, "loss": 0.8035, "step": 38 }, { "epoch": 0.38661710037174724, "grad_norm": 0.28369075059890747, "learning_rate": 0.00015393200344991995, "loss": 0.8328, "step": 39 }, { "epoch": 0.3965303593556382, "grad_norm": 0.2789386510848999, "learning_rate": 0.0001509932604390136, "loss": 0.8211, "step": 40 }, { "epoch": 0.40644361833952913, "grad_norm": 0.3046523928642273, "learning_rate": 0.00014799374779597867, "loss": 0.804, "step": 41 }, { "epoch": 0.4163568773234201, "grad_norm": 0.29437047243118286, "learning_rate": 0.00014493704009671613, "loss": 0.799, "step": 42 }, { "epoch": 0.42627013630731103, "grad_norm": 0.2865321934223175, "learning_rate": 0.0001418267800775565, "loss": 0.7877, "step": 43 }, { "epoch": 0.436183395291202, "grad_norm": 0.2856364846229553, "learning_rate": 0.0001386666742941419, "loss": 0.8098, "step": 44 }, { "epoch": 0.44609665427509293, "grad_norm": 0.2996920049190521, "learning_rate": 0.00013546048870425356, "loss": 0.8478, "step": 45 }, { "epoch": 0.4560099132589839, "grad_norm": 0.30788877606391907, "learning_rate": 0.00013221204417984908, "loss": 0.8112, "step": 46 }, { "epoch": 0.46592317224287483, "grad_norm": 0.29653915762901306, "learning_rate": 0.00012892521195365678, "loss": 0.7905, "step": 47 }, { "epoch": 0.4758364312267658, "grad_norm": 0.3146958351135254, "learning_rate": 0.0001256039090057547, "loss": 0.8241, "step": 48 }, { "epoch": 0.4857496902106567, "grad_norm": 0.3192234933376312, "learning_rate": 0.00012225209339563145, "loss": 0.7645, "step": 49 }, { "epoch": 0.49566294919454773, "grad_norm": 0.32355308532714844, "learning_rate": 0.00011887375954529168, "loss": 0.7881, "step": 50 }, { "epoch": 0.49566294919454773, 
"eval_loss": 0.7790313959121704, "eval_runtime": 45.7186, "eval_samples_per_second": 7.437, "eval_steps_per_second": 1.859, "step": 50 }, { "epoch": 0.5055762081784386, "grad_norm": 0.3305985927581787, "learning_rate": 0.00011547293347902812, "loss": 0.7902, "step": 51 }, { "epoch": 0.5154894671623296, "grad_norm": 0.331398069858551, "learning_rate": 0.0001120536680255323, "loss": 0.7915, "step": 52 }, { "epoch": 0.5254027261462205, "grad_norm": 0.34784436225891113, "learning_rate": 0.00010862003798806196, "loss": 0.7742, "step": 53 }, { "epoch": 0.5353159851301115, "grad_norm": 0.3431916832923889, "learning_rate": 0.00010517613528842097, "loss": 0.8222, "step": 54 }, { "epoch": 0.5452292441140025, "grad_norm": 0.3108989894390106, "learning_rate": 0.00010172606409053886, "loss": 0.753, "step": 55 }, { "epoch": 0.5551425030978935, "grad_norm": 0.3834403157234192, "learning_rate": 9.827393590946116e-05, "loss": 0.7495, "step": 56 }, { "epoch": 0.5650557620817844, "grad_norm": 0.3325962722301483, "learning_rate": 9.482386471157904e-05, "loss": 0.7572, "step": 57 }, { "epoch": 0.5749690210656754, "grad_norm": 0.34333735704421997, "learning_rate": 9.137996201193805e-05, "loss": 0.7767, "step": 58 }, { "epoch": 0.5848822800495663, "grad_norm": 0.32998785376548767, "learning_rate": 8.79463319744677e-05, "loss": 0.7841, "step": 59 }, { "epoch": 0.5947955390334573, "grad_norm": 0.33316344022750854, "learning_rate": 8.452706652097186e-05, "loss": 0.7639, "step": 60 }, { "epoch": 0.6047087980173482, "grad_norm": 0.3633474111557007, "learning_rate": 8.112624045470835e-05, "loss": 0.8559, "step": 61 }, { "epoch": 0.6146220570012392, "grad_norm": 0.3570536971092224, "learning_rate": 7.774790660436858e-05, "loss": 0.7631, "step": 62 }, { "epoch": 0.6245353159851301, "grad_norm": 0.3394259214401245, "learning_rate": 7.43960909942453e-05, "loss": 0.7786, "step": 63 }, { "epoch": 0.6344485749690211, "grad_norm": 0.3467733860015869, "learning_rate": 7.107478804634325e-05, "loss": 0.7614, "step": 64 }, { "epoch": 0.644361833952912, "grad_norm": 0.3295450210571289, "learning_rate": 6.778795582015097e-05, "loss": 0.8523, "step": 65 }, { "epoch": 0.654275092936803, "grad_norm": 0.3386925756931305, "learning_rate": 6.453951129574644e-05, "loss": 0.8138, "step": 66 }, { "epoch": 0.6641883519206939, "grad_norm": 0.32922279834747314, "learning_rate": 6.133332570585812e-05, "loss": 0.7738, "step": 67 }, { "epoch": 0.6741016109045849, "grad_norm": 0.3545457720756531, "learning_rate": 5.817321992244351e-05, "loss": 0.7795, "step": 68 }, { "epoch": 0.6840148698884758, "grad_norm": 0.34623822569847107, "learning_rate": 5.506295990328385e-05, "loss": 0.7551, "step": 69 }, { "epoch": 0.6939281288723668, "grad_norm": 0.3696240186691284, "learning_rate": 5.200625220402139e-05, "loss": 0.7543, "step": 70 }, { "epoch": 0.7038413878562577, "grad_norm": 0.34937718510627747, "learning_rate": 4.900673956098644e-05, "loss": 0.7364, "step": 71 }, { "epoch": 0.7137546468401487, "grad_norm": 0.36205634474754333, "learning_rate": 4.606799655008009e-05, "loss": 0.7579, "step": 72 }, { "epoch": 0.7236679058240396, "grad_norm": 0.3667196035385132, "learning_rate": 4.3193525326884435e-05, "loss": 0.7343, "step": 73 }, { "epoch": 0.7335811648079306, "grad_norm": 0.36573389172554016, "learning_rate": 4.038675145307747e-05, "loss": 0.7218, "step": 74 }, { "epoch": 0.7434944237918215, "grad_norm": 0.367421954870224, "learning_rate": 3.7651019814126654e-05, "loss": 0.8087, "step": 75 }, { "epoch": 0.7534076827757125, "grad_norm": 
0.3466510474681854, "learning_rate": 3.498959063312558e-05, "loss": 0.797, "step": 76 }, { "epoch": 0.7633209417596035, "grad_norm": 0.3628087341785431, "learning_rate": 3.2405635585524565e-05, "loss": 0.7412, "step": 77 }, { "epoch": 0.7732342007434945, "grad_norm": 0.3664226830005646, "learning_rate": 2.9902234019385057e-05, "loss": 0.7456, "step": 78 }, { "epoch": 0.7831474597273854, "grad_norm": 0.346771240234375, "learning_rate": 2.7482369285662378e-05, "loss": 0.769, "step": 79 }, { "epoch": 0.7930607187112764, "grad_norm": 0.38797229528427124, "learning_rate": 2.514892518288988e-05, "loss": 0.7946, "step": 80 }, { "epoch": 0.8029739776951673, "grad_norm": 0.33963850140571594, "learning_rate": 2.290468252050204e-05, "loss": 0.7541, "step": 81 }, { "epoch": 0.8128872366790583, "grad_norm": 0.3752409517765045, "learning_rate": 2.0752315804890977e-05, "loss": 0.8104, "step": 82 }, { "epoch": 0.8228004956629492, "grad_norm": 0.33254724740982056, "learning_rate": 1.8694390052146737e-05, "loss": 0.7511, "step": 83 }, { "epoch": 0.8327137546468402, "grad_norm": 0.3548758029937744, "learning_rate": 1.6733357731279377e-05, "loss": 0.7255, "step": 84 }, { "epoch": 0.8426270136307311, "grad_norm": 0.34011703729629517, "learning_rate": 1.4871555841564887e-05, "loss": 0.7506, "step": 85 }, { "epoch": 0.8525402726146221, "grad_norm": 0.35038140416145325, "learning_rate": 1.311120312749935e-05, "loss": 0.7778, "step": 86 }, { "epoch": 0.862453531598513, "grad_norm": 0.35488229990005493, "learning_rate": 1.1454397434679021e-05, "loss": 0.7953, "step": 87 }, { "epoch": 0.872366790582404, "grad_norm": 0.3349529206752777, "learning_rate": 9.903113209758096e-06, "loss": 0.7046, "step": 88 }, { "epoch": 0.8822800495662949, "grad_norm": 0.3759743869304657, "learning_rate": 8.45919914746337e-06, "loss": 0.763, "step": 89 }, { "epoch": 0.8921933085501859, "grad_norm": 0.3336428999900818, "learning_rate": 7.124375987469767e-06, "loss": 0.7372, "step": 90 }, { "epoch": 0.9021065675340768, "grad_norm": 0.3244943916797638, "learning_rate": 5.900234463762366e-06, "loss": 0.7363, "step": 91 }, { "epoch": 0.9120198265179678, "grad_norm": 0.3384454846382141, "learning_rate": 4.788233408928589e-06, "loss": 0.7456, "step": 92 }, { "epoch": 0.9219330855018587, "grad_norm": 0.3504377603530884, "learning_rate": 3.789698015639953e-06, "loss": 0.826, "step": 93 }, { "epoch": 0.9318463444857497, "grad_norm": 0.36054572463035583, "learning_rate": 2.905818257394799e-06, "loss": 0.8068, "step": 94 }, { "epoch": 0.9417596034696406, "grad_norm": 0.3620031476020813, "learning_rate": 2.137647470404469e-06, "loss": 0.7452, "step": 95 }, { "epoch": 0.9516728624535316, "grad_norm": 0.345386266708374, "learning_rate": 1.48610109831262e-06, "loss": 0.7458, "step": 96 }, { "epoch": 0.9615861214374225, "grad_norm": 0.3529147505760193, "learning_rate": 9.519556012436815e-07, "loss": 0.7703, "step": 97 }, { "epoch": 0.9714993804213135, "grad_norm": 0.3554355204105377, "learning_rate": 5.358475304807375e-07, "loss": 0.7556, "step": 98 }, { "epoch": 0.9814126394052045, "grad_norm": 0.35240188241004944, "learning_rate": 2.382727698752474e-07, "loss": 0.7362, "step": 99 }, { "epoch": 0.9913258983890955, "grad_norm": 0.3343020975589752, "learning_rate": 5.958594489295921e-08, "loss": 0.7531, "step": 100 }, { "epoch": 0.9913258983890955, "eval_loss": 0.7434989809989929, "eval_runtime": 45.6881, "eval_samples_per_second": 7.442, "eval_steps_per_second": 1.86, "step": 100 }, { "epoch": 1.0049566294919454, "grad_norm": 0.5763421654701233, 
"learning_rate": 0.0, "loss": 1.0404, "step": 101 } ], "logging_steps": 1, "max_steps": 101, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.27994227137708e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }