{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.10772578890097932,
"eval_steps": 50,
"global_step": 99,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001088139281828074,
"grad_norm": 0.2615509629249573,
"learning_rate": 5e-05,
"loss": 0.838,
"step": 1
},
{
"epoch": 0.001088139281828074,
"eval_loss": 0.23379258811473846,
"eval_runtime": 24.7758,
"eval_samples_per_second": 15.62,
"eval_steps_per_second": 7.83,
"step": 1
},
{
"epoch": 0.002176278563656148,
"grad_norm": 0.27500101923942566,
"learning_rate": 0.0001,
"loss": 1.0311,
"step": 2
},
{
"epoch": 0.003264417845484222,
"grad_norm": 0.26265749335289,
"learning_rate": 0.00015,
"loss": 0.7598,
"step": 3
},
{
"epoch": 0.004352557127312296,
"grad_norm": 0.34702393412590027,
"learning_rate": 0.0002,
"loss": 0.7643,
"step": 4
},
{
"epoch": 0.00544069640914037,
"grad_norm": 0.26868095993995667,
"learning_rate": 0.00025,
"loss": 0.8676,
"step": 5
},
{
"epoch": 0.006528835690968444,
"grad_norm": 0.6326900124549866,
"learning_rate": 0.0003,
"loss": 0.9927,
"step": 6
},
{
"epoch": 0.007616974972796518,
"grad_norm": 0.6055320501327515,
"learning_rate": 0.00035,
"loss": 0.7319,
"step": 7
},
{
"epoch": 0.008705114254624592,
"grad_norm": 0.43092861771583557,
"learning_rate": 0.0004,
"loss": 0.7695,
"step": 8
},
{
"epoch": 0.009793253536452665,
"grad_norm": 0.40037959814071655,
"learning_rate": 0.00045000000000000004,
"loss": 0.7098,
"step": 9
},
{
"epoch": 0.01088139281828074,
"grad_norm": 0.39341455698013306,
"learning_rate": 0.0005,
"loss": 0.4365,
"step": 10
},
{
"epoch": 0.011969532100108813,
"grad_norm": 0.8135687112808228,
"learning_rate": 0.0004998442655654946,
"loss": 0.7446,
"step": 11
},
{
"epoch": 0.013057671381936888,
"grad_norm": 0.8516698479652405,
"learning_rate": 0.0004993772562876909,
"loss": 0.3987,
"step": 12
},
{
"epoch": 0.014145810663764961,
"grad_norm": 0.3541325628757477,
"learning_rate": 0.0004985995540019955,
"loss": 0.4453,
"step": 13
},
{
"epoch": 0.015233949945593036,
"grad_norm": 0.5297847986221313,
"learning_rate": 0.0004975121276286136,
"loss": 0.5155,
"step": 14
},
{
"epoch": 0.01632208922742111,
"grad_norm": 0.442644327878952,
"learning_rate": 0.0004961163319653958,
"loss": 0.4814,
"step": 15
},
{
"epoch": 0.017410228509249184,
"grad_norm": 0.4812023341655731,
"learning_rate": 0.0004944139059999286,
"loss": 0.5217,
"step": 16
},
{
"epoch": 0.018498367791077257,
"grad_norm": 0.4284003674983978,
"learning_rate": 0.000492406970742972,
"loss": 0.2965,
"step": 17
},
{
"epoch": 0.01958650707290533,
"grad_norm": 0.5950977206230164,
"learning_rate": 0.0004900980265859448,
"loss": 0.5757,
"step": 18
},
{
"epoch": 0.020674646354733407,
"grad_norm": 0.35087430477142334,
"learning_rate": 0.0004874899501857477,
"loss": 0.314,
"step": 19
},
{
"epoch": 0.02176278563656148,
"grad_norm": 0.2939762771129608,
"learning_rate": 0.00048458599088080736,
"loss": 0.3879,
"step": 20
},
{
"epoch": 0.022850924918389554,
"grad_norm": 0.3336440324783325,
"learning_rate": 0.0004813897666428053,
"loss": 0.3425,
"step": 21
},
{
"epoch": 0.023939064200217627,
"grad_norm": 0.5165703296661377,
"learning_rate": 0.00047790525956913543,
"loss": 0.4142,
"step": 22
},
{
"epoch": 0.025027203482045703,
"grad_norm": 0.41237401962280273,
"learning_rate": 0.0004741368109217071,
"loss": 0.4147,
"step": 23
},
{
"epoch": 0.026115342763873776,
"grad_norm": 0.5671696066856384,
"learning_rate": 0.00047008911571827283,
"loss": 0.48,
"step": 24
},
{
"epoch": 0.02720348204570185,
"grad_norm": 0.42801016569137573,
"learning_rate": 0.00046576721688302105,
"loss": 0.3996,
"step": 25
},
{
"epoch": 0.028291621327529923,
"grad_norm": 0.3613467514514923,
"learning_rate": 0.0004611764989637205,
"loss": 0.3176,
"step": 26
},
{
"epoch": 0.029379760609358,
"grad_norm": 1.3145064115524292,
"learning_rate": 0.0004563226814232444,
"loss": 0.4326,
"step": 27
},
{
"epoch": 0.030467899891186073,
"grad_norm": 0.47409093379974365,
"learning_rate": 0.0004512118115138315,
"loss": 0.3246,
"step": 28
},
{
"epoch": 0.031556039173014146,
"grad_norm": 0.5220752954483032,
"learning_rate": 0.0004458502567429631,
"loss": 0.5013,
"step": 29
},
{
"epoch": 0.03264417845484222,
"grad_norm": 1.3407182693481445,
"learning_rate": 0.00044024469694024196,
"loss": 0.4893,
"step": 30
},
{
"epoch": 0.03373231773667029,
"grad_norm": 0.7757295370101929,
"learning_rate": 0.00043440211593515554,
"loss": 0.4163,
"step": 31
},
{
"epoch": 0.03482045701849837,
"grad_norm": 0.27271905541419983,
"learning_rate": 0.0004283297928560951,
"loss": 0.2256,
"step": 32
},
{
"epoch": 0.035908596300326445,
"grad_norm": 0.4762435257434845,
"learning_rate": 0.0004220352930614672,
"loss": 0.4584,
"step": 33
},
{
"epoch": 0.036996735582154515,
"grad_norm": 0.5020000338554382,
"learning_rate": 0.00041552645871420013,
"loss": 0.4403,
"step": 34
},
{
"epoch": 0.03808487486398259,
"grad_norm": 0.3345811069011688,
"learning_rate": 0.00040881139901138467,
"loss": 0.4192,
"step": 35
},
{
"epoch": 0.03917301414581066,
"grad_norm": 0.2985716462135315,
"learning_rate": 0.00040189848008122475,
"loss": 0.2805,
"step": 36
},
{
"epoch": 0.04026115342763874,
"grad_norm": 0.8610369563102722,
"learning_rate": 0.00039479631455988334,
"loss": 0.718,
"step": 37
},
{
"epoch": 0.041349292709466814,
"grad_norm": 0.4035598635673523,
"learning_rate": 0.0003875137508612103,
"loss": 0.3767,
"step": 38
},
{
"epoch": 0.042437431991294884,
"grad_norm": 0.3371526002883911,
"learning_rate": 0.00038005986215272055,
"loss": 0.3665,
"step": 39
},
{
"epoch": 0.04352557127312296,
"grad_norm": 0.9126002788543701,
"learning_rate": 0.0003724439350515571,
"loss": 0.6029,
"step": 40
},
{
"epoch": 0.04461371055495103,
"grad_norm": 0.34127795696258545,
"learning_rate": 0.0003646754580545226,
"loss": 0.4245,
"step": 41
},
{
"epoch": 0.04570184983677911,
"grad_norm": 0.4555828869342804,
"learning_rate": 0.000356764109716594,
"loss": 0.6118,
"step": 42
},
{
"epoch": 0.046789989118607184,
"grad_norm": 0.30817294120788574,
"learning_rate": 0.00034871974659264783,
"loss": 0.3774,
"step": 43
},
{
"epoch": 0.04787812840043525,
"grad_norm": 0.6372089982032776,
"learning_rate": 0.0003405523909574206,
"loss": 0.4466,
"step": 44
},
{
"epoch": 0.04896626768226333,
"grad_norm": 0.2218654602766037,
"learning_rate": 0.0003322722183190025,
"loss": 0.2911,
"step": 45
},
{
"epoch": 0.05005440696409141,
"grad_norm": 0.7268356084823608,
"learning_rate": 0.0003238895447414211,
"loss": 0.5186,
"step": 46
},
{
"epoch": 0.051142546245919476,
"grad_norm": 0.3474952280521393,
"learning_rate": 0.0003154148139921102,
"loss": 0.3702,
"step": 47
},
{
"epoch": 0.05223068552774755,
"grad_norm": 0.47476518154144287,
"learning_rate": 0.00030685858453027663,
"loss": 0.3098,
"step": 48
},
{
"epoch": 0.05331882480957562,
"grad_norm": 0.33052054047584534,
"learning_rate": 0.0002982315163523742,
"loss": 0.3452,
"step": 49
},
{
"epoch": 0.0544069640914037,
"grad_norm": 0.3936608135700226,
"learning_rate": 0.000289544357711076,
"loss": 0.3504,
"step": 50
},
{
"epoch": 0.0544069640914037,
"eval_loss": 0.10187384486198425,
"eval_runtime": 24.5923,
"eval_samples_per_second": 15.737,
"eval_steps_per_second": 7.889,
"step": 50
},
{
"epoch": 0.055495103373231776,
"grad_norm": 0.3658672273159027,
"learning_rate": 0.0002808079317242896,
"loss": 0.3851,
"step": 51
},
{
"epoch": 0.056583242655059846,
"grad_norm": 0.2899942994117737,
"learning_rate": 0.0002720331228909005,
"loss": 0.2768,
"step": 52
},
{
"epoch": 0.05767138193688792,
"grad_norm": 0.6674902439117432,
"learning_rate": 0.00026323086353004075,
"loss": 0.6506,
"step": 53
},
{
"epoch": 0.058759521218716,
"grad_norm": 0.8436214923858643,
"learning_rate": 0.0002544121201607822,
"loss": 0.8227,
"step": 54
},
{
"epoch": 0.05984766050054407,
"grad_norm": 0.2817295789718628,
"learning_rate": 0.00024558787983921783,
"loss": 0.3449,
"step": 55
},
{
"epoch": 0.060935799782372145,
"grad_norm": 0.25585779547691345,
"learning_rate": 0.0002367691364699592,
"loss": 0.2408,
"step": 56
},
{
"epoch": 0.062023939064200215,
"grad_norm": 0.250396192073822,
"learning_rate": 0.00022796687710909964,
"loss": 0.2588,
"step": 57
},
{
"epoch": 0.06311207834602829,
"grad_norm": 0.6818592548370361,
"learning_rate": 0.00021919206827571036,
"loss": 0.5172,
"step": 58
},
{
"epoch": 0.06420021762785637,
"grad_norm": 0.8470868468284607,
"learning_rate": 0.00021045564228892402,
"loss": 0.2823,
"step": 59
},
{
"epoch": 0.06528835690968444,
"grad_norm": 0.48449358344078064,
"learning_rate": 0.00020176848364762578,
"loss": 0.3356,
"step": 60
},
{
"epoch": 0.06637649619151251,
"grad_norm": 0.25606584548950195,
"learning_rate": 0.00019314141546972343,
"loss": 0.3552,
"step": 61
},
{
"epoch": 0.06746463547334058,
"grad_norm": 0.21550142765045166,
"learning_rate": 0.00018458518600788986,
"loss": 0.2025,
"step": 62
},
{
"epoch": 0.06855277475516866,
"grad_norm": 0.33708736300468445,
"learning_rate": 0.00017611045525857898,
"loss": 0.299,
"step": 63
},
{
"epoch": 0.06964091403699674,
"grad_norm": 0.41520461440086365,
"learning_rate": 0.0001677277816809975,
"loss": 0.3242,
"step": 64
},
{
"epoch": 0.07072905331882481,
"grad_norm": 0.9485870003700256,
"learning_rate": 0.00015944760904257942,
"loss": 0.4488,
"step": 65
},
{
"epoch": 0.07181719260065289,
"grad_norm": 0.36165550351142883,
"learning_rate": 0.0001512802534073522,
"loss": 0.5672,
"step": 66
},
{
"epoch": 0.07290533188248095,
"grad_norm": 0.6473806500434875,
"learning_rate": 0.00014323589028340596,
"loss": 0.6128,
"step": 67
},
{
"epoch": 0.07399347116430903,
"grad_norm": 0.2864266633987427,
"learning_rate": 0.00013532454194547733,
"loss": 0.3151,
"step": 68
},
{
"epoch": 0.0750816104461371,
"grad_norm": 0.2809907793998718,
"learning_rate": 0.00012755606494844294,
"loss": 0.1535,
"step": 69
},
{
"epoch": 0.07616974972796518,
"grad_norm": 0.32490411400794983,
"learning_rate": 0.00011994013784727947,
"loss": 0.3442,
"step": 70
},
{
"epoch": 0.07725788900979326,
"grad_norm": 0.6548874974250793,
"learning_rate": 0.00011248624913878966,
"loss": 0.5008,
"step": 71
},
{
"epoch": 0.07834602829162132,
"grad_norm": 0.7361955046653748,
"learning_rate": 0.0001052036854401166,
"loss": 0.6513,
"step": 72
},
{
"epoch": 0.0794341675734494,
"grad_norm": 0.5869486927986145,
"learning_rate": 9.810151991877531e-05,
"loss": 0.4508,
"step": 73
},
{
"epoch": 0.08052230685527748,
"grad_norm": 0.3208440840244293,
"learning_rate": 9.118860098861537e-05,
"loss": 0.325,
"step": 74
},
{
"epoch": 0.08161044613710555,
"grad_norm": 0.35303372144699097,
"learning_rate": 8.44735412857999e-05,
"loss": 0.327,
"step": 75
},
{
"epoch": 0.08269858541893363,
"grad_norm": 0.22797748446464539,
"learning_rate": 7.79647069385328e-05,
"loss": 0.3183,
"step": 76
},
{
"epoch": 0.08378672470076169,
"grad_norm": 0.328909307718277,
"learning_rate": 7.167020714390501e-05,
"loss": 0.3248,
"step": 77
},
{
"epoch": 0.08487486398258977,
"grad_norm": 0.34297263622283936,
"learning_rate": 6.559788406484446e-05,
"loss": 0.2397,
"step": 78
},
{
"epoch": 0.08596300326441784,
"grad_norm": 0.6872113943099976,
"learning_rate": 5.975530305975807e-05,
"loss": 0.5042,
"step": 79
},
{
"epoch": 0.08705114254624592,
"grad_norm": 0.42344143986701965,
"learning_rate": 5.414974325703686e-05,
"loss": 0.3577,
"step": 80
},
{
"epoch": 0.088139281828074,
"grad_norm": 0.31334200501441956,
"learning_rate": 4.8788188486168616e-05,
"loss": 0.2931,
"step": 81
},
{
"epoch": 0.08922742110990206,
"grad_norm": 0.394218385219574,
"learning_rate": 4.367731857675569e-05,
"loss": 0.3908,
"step": 82
},
{
"epoch": 0.09031556039173014,
"grad_norm": 0.3325517475605011,
"learning_rate": 3.882350103627952e-05,
"loss": 0.351,
"step": 83
},
{
"epoch": 0.09140369967355821,
"grad_norm": 0.72450190782547,
"learning_rate": 3.423278311697897e-05,
"loss": 0.5302,
"step": 84
},
{
"epoch": 0.09249183895538629,
"grad_norm": 0.9300814867019653,
"learning_rate": 2.9910884281727225e-05,
"loss": 0.6356,
"step": 85
},
{
"epoch": 0.09357997823721437,
"grad_norm": 0.36167338490486145,
"learning_rate": 2.586318907829291e-05,
"loss": 0.4763,
"step": 86
},
{
"epoch": 0.09466811751904244,
"grad_norm": 0.3068345785140991,
"learning_rate": 2.209474043086457e-05,
"loss": 0.3455,
"step": 87
},
{
"epoch": 0.0957562568008705,
"grad_norm": 0.4473964273929596,
"learning_rate": 1.861023335719475e-05,
"loss": 0.5717,
"step": 88
},
{
"epoch": 0.09684439608269858,
"grad_norm": 0.31159886717796326,
"learning_rate": 1.5414009119192633e-05,
"loss": 0.298,
"step": 89
},
{
"epoch": 0.09793253536452666,
"grad_norm": 0.2886298894882202,
"learning_rate": 1.25100498142523e-05,
"loss": 0.3209,
"step": 90
},
{
"epoch": 0.09902067464635474,
"grad_norm": 0.27010682225227356,
"learning_rate": 9.901973414055187e-06,
"loss": 0.2501,
"step": 91
},
{
"epoch": 0.10010881392818281,
"grad_norm": 0.3159748315811157,
"learning_rate": 7.593029257027956e-06,
"loss": 0.2336,
"step": 92
},
{
"epoch": 0.10119695321001088,
"grad_norm": 0.5132943391799927,
"learning_rate": 5.5860940000714015e-06,
"loss": 0.4026,
"step": 93
},
{
"epoch": 0.10228509249183895,
"grad_norm": 0.3360811769962311,
"learning_rate": 3.8836680346041594e-06,
"loss": 0.3444,
"step": 94
},
{
"epoch": 0.10337323177366703,
"grad_norm": 0.9271918535232544,
"learning_rate": 2.487872371386424e-06,
"loss": 0.7088,
"step": 95
},
{
"epoch": 0.1044613710554951,
"grad_norm": 0.7888476252555847,
"learning_rate": 1.4004459980045125e-06,
"loss": 0.5126,
"step": 96
},
{
"epoch": 0.10554951033732318,
"grad_norm": 0.25628480315208435,
"learning_rate": 6.22743712309054e-07,
"loss": 0.2571,
"step": 97
},
{
"epoch": 0.10663764961915125,
"grad_norm": 0.4536930322647095,
"learning_rate": 1.557344345054501e-07,
"loss": 0.5685,
"step": 98
},
{
"epoch": 0.10772578890097932,
"grad_norm": 0.7662109732627869,
"learning_rate": 0.0,
"loss": 0.5405,
"step": 99
}
],
"logging_steps": 1,
"max_steps": 99,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.988754859143987e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}