{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10772578890097932, "eval_steps": 50, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001088139281828074, "grad_norm": 0.2615509629249573, "learning_rate": 5e-05, "loss": 0.838, "step": 1 }, { "epoch": 0.001088139281828074, "eval_loss": 0.23379258811473846, "eval_runtime": 24.7758, "eval_samples_per_second": 15.62, "eval_steps_per_second": 7.83, "step": 1 }, { "epoch": 0.002176278563656148, "grad_norm": 0.27500101923942566, "learning_rate": 0.0001, "loss": 1.0311, "step": 2 }, { "epoch": 0.003264417845484222, "grad_norm": 0.26265749335289, "learning_rate": 0.00015, "loss": 0.7598, "step": 3 }, { "epoch": 0.004352557127312296, "grad_norm": 0.34702393412590027, "learning_rate": 0.0002, "loss": 0.7643, "step": 4 }, { "epoch": 0.00544069640914037, "grad_norm": 0.26868095993995667, "learning_rate": 0.00025, "loss": 0.8676, "step": 5 }, { "epoch": 0.006528835690968444, "grad_norm": 0.6326900124549866, "learning_rate": 0.0003, "loss": 0.9927, "step": 6 }, { "epoch": 0.007616974972796518, "grad_norm": 0.6055320501327515, "learning_rate": 0.00035, "loss": 0.7319, "step": 7 }, { "epoch": 0.008705114254624592, "grad_norm": 0.43092861771583557, "learning_rate": 0.0004, "loss": 0.7695, "step": 8 }, { "epoch": 0.009793253536452665, "grad_norm": 0.40037959814071655, "learning_rate": 0.00045000000000000004, "loss": 0.7098, "step": 9 }, { "epoch": 0.01088139281828074, "grad_norm": 0.39341455698013306, "learning_rate": 0.0005, "loss": 0.4365, "step": 10 }, { "epoch": 0.011969532100108813, "grad_norm": 0.8135687112808228, "learning_rate": 0.0004998442655654946, "loss": 0.7446, "step": 11 }, { "epoch": 0.013057671381936888, "grad_norm": 0.8516698479652405, "learning_rate": 0.0004993772562876909, "loss": 0.3987, "step": 12 }, { "epoch": 0.014145810663764961, "grad_norm": 0.3541325628757477, "learning_rate": 0.0004985995540019955, "loss": 0.4453, "step": 13 }, { "epoch": 0.015233949945593036, "grad_norm": 0.5297847986221313, "learning_rate": 0.0004975121276286136, "loss": 0.5155, "step": 14 }, { "epoch": 0.01632208922742111, "grad_norm": 0.442644327878952, "learning_rate": 0.0004961163319653958, "loss": 0.4814, "step": 15 }, { "epoch": 0.017410228509249184, "grad_norm": 0.4812023341655731, "learning_rate": 0.0004944139059999286, "loss": 0.5217, "step": 16 }, { "epoch": 0.018498367791077257, "grad_norm": 0.4284003674983978, "learning_rate": 0.000492406970742972, "loss": 0.2965, "step": 17 }, { "epoch": 0.01958650707290533, "grad_norm": 0.5950977206230164, "learning_rate": 0.0004900980265859448, "loss": 0.5757, "step": 18 }, { "epoch": 0.020674646354733407, "grad_norm": 0.35087430477142334, "learning_rate": 0.0004874899501857477, "loss": 0.314, "step": 19 }, { "epoch": 0.02176278563656148, "grad_norm": 0.2939762771129608, "learning_rate": 0.00048458599088080736, "loss": 0.3879, "step": 20 }, { "epoch": 0.022850924918389554, "grad_norm": 0.3336440324783325, "learning_rate": 0.0004813897666428053, "loss": 0.3425, "step": 21 }, { "epoch": 0.023939064200217627, "grad_norm": 0.5165703296661377, "learning_rate": 0.00047790525956913543, "loss": 0.4142, "step": 22 }, { "epoch": 0.025027203482045703, "grad_norm": 0.41237401962280273, "learning_rate": 0.0004741368109217071, "loss": 0.4147, "step": 23 }, { "epoch": 0.026115342763873776, "grad_norm": 0.5671696066856384, "learning_rate": 0.00047008911571827283, "loss": 0.48, "step": 24 }, { "epoch": 0.02720348204570185, "grad_norm": 0.42801016569137573, "learning_rate": 0.00046576721688302105, "loss": 0.3996, "step": 25 }, { "epoch": 0.028291621327529923, "grad_norm": 0.3613467514514923, "learning_rate": 0.0004611764989637205, "loss": 0.3176, "step": 26 }, { "epoch": 0.029379760609358, "grad_norm": 1.3145064115524292, "learning_rate": 0.0004563226814232444, "loss": 0.4326, "step": 27 }, { "epoch": 0.030467899891186073, "grad_norm": 0.47409093379974365, "learning_rate": 0.0004512118115138315, "loss": 0.3246, "step": 28 }, { "epoch": 0.031556039173014146, "grad_norm": 0.5220752954483032, "learning_rate": 0.0004458502567429631, "loss": 0.5013, "step": 29 }, { "epoch": 0.03264417845484222, "grad_norm": 1.3407182693481445, "learning_rate": 0.00044024469694024196, "loss": 0.4893, "step": 30 }, { "epoch": 0.03373231773667029, "grad_norm": 0.7757295370101929, "learning_rate": 0.00043440211593515554, "loss": 0.4163, "step": 31 }, { "epoch": 0.03482045701849837, "grad_norm": 0.27271905541419983, "learning_rate": 0.0004283297928560951, "loss": 0.2256, "step": 32 }, { "epoch": 0.035908596300326445, "grad_norm": 0.4762435257434845, "learning_rate": 0.0004220352930614672, "loss": 0.4584, "step": 33 }, { "epoch": 0.036996735582154515, "grad_norm": 0.5020000338554382, "learning_rate": 0.00041552645871420013, "loss": 0.4403, "step": 34 }, { "epoch": 0.03808487486398259, "grad_norm": 0.3345811069011688, "learning_rate": 0.00040881139901138467, "loss": 0.4192, "step": 35 }, { "epoch": 0.03917301414581066, "grad_norm": 0.2985716462135315, "learning_rate": 0.00040189848008122475, "loss": 0.2805, "step": 36 }, { "epoch": 0.04026115342763874, "grad_norm": 0.8610369563102722, "learning_rate": 0.00039479631455988334, "loss": 0.718, "step": 37 }, { "epoch": 0.041349292709466814, "grad_norm": 0.4035598635673523, "learning_rate": 0.0003875137508612103, "loss": 0.3767, "step": 38 }, { "epoch": 0.042437431991294884, "grad_norm": 0.3371526002883911, "learning_rate": 0.00038005986215272055, "loss": 0.3665, "step": 39 }, { "epoch": 0.04352557127312296, "grad_norm": 0.9126002788543701, "learning_rate": 0.0003724439350515571, "loss": 0.6029, "step": 40 }, { "epoch": 0.04461371055495103, "grad_norm": 0.34127795696258545, "learning_rate": 0.0003646754580545226, "loss": 0.4245, "step": 41 }, { "epoch": 0.04570184983677911, "grad_norm": 0.4555828869342804, "learning_rate": 0.000356764109716594, "loss": 0.6118, "step": 42 }, { "epoch": 0.046789989118607184, "grad_norm": 0.30817294120788574, "learning_rate": 0.00034871974659264783, "loss": 0.3774, "step": 43 }, { "epoch": 0.04787812840043525, "grad_norm": 0.6372089982032776, "learning_rate": 0.0003405523909574206, "loss": 0.4466, "step": 44 }, { "epoch": 0.04896626768226333, "grad_norm": 0.2218654602766037, "learning_rate": 0.0003322722183190025, "loss": 0.2911, "step": 45 }, { "epoch": 0.05005440696409141, "grad_norm": 0.7268356084823608, "learning_rate": 0.0003238895447414211, "loss": 0.5186, "step": 46 }, { "epoch": 0.051142546245919476, "grad_norm": 0.3474952280521393, "learning_rate": 0.0003154148139921102, "loss": 0.3702, "step": 47 }, { "epoch": 0.05223068552774755, "grad_norm": 0.47476518154144287, "learning_rate": 0.00030685858453027663, "loss": 0.3098, "step": 48 }, { "epoch": 0.05331882480957562, "grad_norm": 0.33052054047584534, "learning_rate": 0.0002982315163523742, "loss": 0.3452, "step": 49 }, { "epoch": 0.0544069640914037, "grad_norm": 0.3936608135700226, "learning_rate": 0.000289544357711076, "loss": 0.3504, "step": 50 }, { "epoch": 0.0544069640914037, "eval_loss": 0.10187384486198425, "eval_runtime": 24.5923, "eval_samples_per_second": 15.737, "eval_steps_per_second": 7.889, "step": 50 }, { "epoch": 0.055495103373231776, "grad_norm": 0.3658672273159027, "learning_rate": 0.0002808079317242896, "loss": 0.3851, "step": 51 }, { "epoch": 0.056583242655059846, "grad_norm": 0.2899942994117737, "learning_rate": 0.0002720331228909005, "loss": 0.2768, "step": 52 }, { "epoch": 0.05767138193688792, "grad_norm": 0.6674902439117432, "learning_rate": 0.00026323086353004075, "loss": 0.6506, "step": 53 }, { "epoch": 0.058759521218716, "grad_norm": 0.8436214923858643, "learning_rate": 0.0002544121201607822, "loss": 0.8227, "step": 54 }, { "epoch": 0.05984766050054407, "grad_norm": 0.2817295789718628, "learning_rate": 0.00024558787983921783, "loss": 0.3449, "step": 55 }, { "epoch": 0.060935799782372145, "grad_norm": 0.25585779547691345, "learning_rate": 0.0002367691364699592, "loss": 0.2408, "step": 56 }, { "epoch": 0.062023939064200215, "grad_norm": 0.250396192073822, "learning_rate": 0.00022796687710909964, "loss": 0.2588, "step": 57 }, { "epoch": 0.06311207834602829, "grad_norm": 0.6818592548370361, "learning_rate": 0.00021919206827571036, "loss": 0.5172, "step": 58 }, { "epoch": 0.06420021762785637, "grad_norm": 0.8470868468284607, "learning_rate": 0.00021045564228892402, "loss": 0.2823, "step": 59 }, { "epoch": 0.06528835690968444, "grad_norm": 0.48449358344078064, "learning_rate": 0.00020176848364762578, "loss": 0.3356, "step": 60 }, { "epoch": 0.06637649619151251, "grad_norm": 0.25606584548950195, "learning_rate": 0.00019314141546972343, "loss": 0.3552, "step": 61 }, { "epoch": 0.06746463547334058, "grad_norm": 0.21550142765045166, "learning_rate": 0.00018458518600788986, "loss": 0.2025, "step": 62 }, { "epoch": 0.06855277475516866, "grad_norm": 0.33708736300468445, "learning_rate": 0.00017611045525857898, "loss": 0.299, "step": 63 }, { "epoch": 0.06964091403699674, "grad_norm": 0.41520461440086365, "learning_rate": 0.0001677277816809975, "loss": 0.3242, "step": 64 }, { "epoch": 0.07072905331882481, "grad_norm": 0.9485870003700256, "learning_rate": 0.00015944760904257942, "loss": 0.4488, "step": 65 }, { "epoch": 0.07181719260065289, "grad_norm": 0.36165550351142883, "learning_rate": 0.0001512802534073522, "loss": 0.5672, "step": 66 }, { "epoch": 0.07290533188248095, "grad_norm": 0.6473806500434875, "learning_rate": 0.00014323589028340596, "loss": 0.6128, "step": 67 }, { "epoch": 0.07399347116430903, "grad_norm": 0.2864266633987427, "learning_rate": 0.00013532454194547733, "loss": 0.3151, "step": 68 }, { "epoch": 0.0750816104461371, "grad_norm": 0.2809907793998718, "learning_rate": 0.00012755606494844294, "loss": 0.1535, "step": 69 }, { "epoch": 0.07616974972796518, "grad_norm": 0.32490411400794983, "learning_rate": 0.00011994013784727947, "loss": 0.3442, "step": 70 }, { "epoch": 0.07725788900979326, "grad_norm": 0.6548874974250793, "learning_rate": 0.00011248624913878966, "loss": 0.5008, "step": 71 }, { "epoch": 0.07834602829162132, "grad_norm": 0.7361955046653748, "learning_rate": 0.0001052036854401166, "loss": 0.6513, "step": 72 }, { "epoch": 0.0794341675734494, "grad_norm": 0.5869486927986145, "learning_rate": 9.810151991877531e-05, "loss": 0.4508, "step": 73 }, { "epoch": 0.08052230685527748, "grad_norm": 0.3208440840244293, "learning_rate": 9.118860098861537e-05, "loss": 0.325, "step": 74 }, { "epoch": 0.08161044613710555, "grad_norm": 0.35303372144699097, "learning_rate": 8.44735412857999e-05, "loss": 0.327, "step": 75 }, { "epoch": 0.08269858541893363, "grad_norm": 0.22797748446464539, "learning_rate": 7.79647069385328e-05, "loss": 0.3183, "step": 76 }, { "epoch": 0.08378672470076169, "grad_norm": 0.328909307718277, "learning_rate": 7.167020714390501e-05, "loss": 0.3248, "step": 77 }, { "epoch": 0.08487486398258977, "grad_norm": 0.34297263622283936, "learning_rate": 6.559788406484446e-05, "loss": 0.2397, "step": 78 }, { "epoch": 0.08596300326441784, "grad_norm": 0.6872113943099976, "learning_rate": 5.975530305975807e-05, "loss": 0.5042, "step": 79 }, { "epoch": 0.08705114254624592, "grad_norm": 0.42344143986701965, "learning_rate": 5.414974325703686e-05, "loss": 0.3577, "step": 80 }, { "epoch": 0.088139281828074, "grad_norm": 0.31334200501441956, "learning_rate": 4.8788188486168616e-05, "loss": 0.2931, "step": 81 }, { "epoch": 0.08922742110990206, "grad_norm": 0.394218385219574, "learning_rate": 4.367731857675569e-05, "loss": 0.3908, "step": 82 }, { "epoch": 0.09031556039173014, "grad_norm": 0.3325517475605011, "learning_rate": 3.882350103627952e-05, "loss": 0.351, "step": 83 }, { "epoch": 0.09140369967355821, "grad_norm": 0.72450190782547, "learning_rate": 3.423278311697897e-05, "loss": 0.5302, "step": 84 }, { "epoch": 0.09249183895538629, "grad_norm": 0.9300814867019653, "learning_rate": 2.9910884281727225e-05, "loss": 0.6356, "step": 85 }, { "epoch": 0.09357997823721437, "grad_norm": 0.36167338490486145, "learning_rate": 2.586318907829291e-05, "loss": 0.4763, "step": 86 }, { "epoch": 0.09466811751904244, "grad_norm": 0.3068345785140991, "learning_rate": 2.209474043086457e-05, "loss": 0.3455, "step": 87 }, { "epoch": 0.0957562568008705, "grad_norm": 0.4473964273929596, "learning_rate": 1.861023335719475e-05, "loss": 0.5717, "step": 88 }, { "epoch": 0.09684439608269858, "grad_norm": 0.31159886717796326, "learning_rate": 1.5414009119192633e-05, "loss": 0.298, "step": 89 }, { "epoch": 0.09793253536452666, "grad_norm": 0.2886298894882202, "learning_rate": 1.25100498142523e-05, "loss": 0.3209, "step": 90 }, { "epoch": 0.09902067464635474, "grad_norm": 0.27010682225227356, "learning_rate": 9.901973414055187e-06, "loss": 0.2501, "step": 91 }, { "epoch": 0.10010881392818281, "grad_norm": 0.3159748315811157, "learning_rate": 7.593029257027956e-06, "loss": 0.2336, "step": 92 }, { "epoch": 0.10119695321001088, "grad_norm": 0.5132943391799927, "learning_rate": 5.5860940000714015e-06, "loss": 0.4026, "step": 93 }, { "epoch": 0.10228509249183895, "grad_norm": 0.3360811769962311, "learning_rate": 3.8836680346041594e-06, "loss": 0.3444, "step": 94 }, { "epoch": 0.10337323177366703, "grad_norm": 0.9271918535232544, "learning_rate": 2.487872371386424e-06, "loss": 0.7088, "step": 95 }, { "epoch": 0.1044613710554951, "grad_norm": 0.7888476252555847, "learning_rate": 1.4004459980045125e-06, "loss": 0.5126, "step": 96 }, { "epoch": 0.10554951033732318, "grad_norm": 0.25628480315208435, "learning_rate": 6.22743712309054e-07, "loss": 0.2571, "step": 97 }, { "epoch": 0.10663764961915125, "grad_norm": 0.4536930322647095, "learning_rate": 1.557344345054501e-07, "loss": 0.5685, "step": 98 }, { "epoch": 0.10772578890097932, "grad_norm": 0.7662109732627869, "learning_rate": 0.0, "loss": 0.5405, "step": 99 } ], "logging_steps": 1, "max_steps": 99, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.988754859143987e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }