{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.015539771612447514, "eval_steps": 50, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015696739002472236, "grad_norm": 0.4209878444671631, "learning_rate": 5e-05, "loss": 0.96, "step": 1 }, { "epoch": 0.00015696739002472236, "eval_loss": 1.059880018234253, "eval_runtime": 58.7176, "eval_samples_per_second": 45.693, "eval_steps_per_second": 22.855, "step": 1 }, { "epoch": 0.0003139347800494447, "grad_norm": 0.37363916635513306, "learning_rate": 0.0001, "loss": 0.7969, "step": 2 }, { "epoch": 0.0004709021700741671, "grad_norm": 0.31887054443359375, "learning_rate": 0.00015, "loss": 0.9958, "step": 3 }, { "epoch": 0.0006278695600988894, "grad_norm": 0.6505757570266724, "learning_rate": 0.0002, "loss": 1.4545, "step": 4 }, { "epoch": 0.0007848369501236119, "grad_norm": 0.338165819644928, "learning_rate": 0.00025, "loss": 0.8554, "step": 5 }, { "epoch": 0.0009418043401483342, "grad_norm": 0.6994407176971436, "learning_rate": 0.0003, "loss": 0.9343, "step": 6 }, { "epoch": 0.0010987717301730565, "grad_norm": 0.31736791133880615, "learning_rate": 0.00035, "loss": 0.7684, "step": 7 }, { "epoch": 0.0012557391201977789, "grad_norm": 0.42489951848983765, "learning_rate": 0.0004, "loss": 0.9331, "step": 8 }, { "epoch": 0.0014127065102225013, "grad_norm": 0.4670903980731964, "learning_rate": 0.00045000000000000004, "loss": 0.938, "step": 9 }, { "epoch": 0.0015696739002472237, "grad_norm": 0.6221330165863037, "learning_rate": 0.0005, "loss": 1.163, "step": 10 }, { "epoch": 0.001726641290271946, "grad_norm": 0.44136929512023926, "learning_rate": 0.0004998442655654946, "loss": 0.743, "step": 11 }, { "epoch": 0.0018836086802966683, "grad_norm": 0.4158223867416382, "learning_rate": 0.0004993772562876909, "loss": 0.7409, "step": 12 }, { "epoch": 0.0020405760703213907, "grad_norm": 0.6507619023323059, "learning_rate": 0.0004985995540019955, "loss": 0.7963, "step": 13 }, { "epoch": 0.002197543460346113, "grad_norm": 0.5717980861663818, "learning_rate": 0.0004975121276286136, "loss": 0.899, "step": 14 }, { "epoch": 0.0023545108503708356, "grad_norm": 0.6496816277503967, "learning_rate": 0.0004961163319653958, "loss": 0.9617, "step": 15 }, { "epoch": 0.0025114782403955578, "grad_norm": 0.4178118109703064, "learning_rate": 0.0004944139059999286, "loss": 0.6501, "step": 16 }, { "epoch": 0.00266844563042028, "grad_norm": 0.4809323251247406, "learning_rate": 0.000492406970742972, "loss": 1.1646, "step": 17 }, { "epoch": 0.0028254130204450026, "grad_norm": 0.34386783838272095, "learning_rate": 0.0004900980265859448, "loss": 0.6848, "step": 18 }, { "epoch": 0.002982380410469725, "grad_norm": 0.39839452505111694, "learning_rate": 0.0004874899501857477, "loss": 0.9252, "step": 19 }, { "epoch": 0.0031393478004944474, "grad_norm": 0.2926274836063385, "learning_rate": 0.00048458599088080736, "loss": 0.6852, "step": 20 }, { "epoch": 0.0032963151905191696, "grad_norm": 0.309630811214447, "learning_rate": 0.0004813897666428053, "loss": 0.7408, "step": 21 }, { "epoch": 0.003453282580543892, "grad_norm": 0.44346147775650024, "learning_rate": 0.00047790525956913543, "loss": 0.7721, "step": 22 }, { "epoch": 0.0036102499705686145, "grad_norm": 0.38101327419281006, "learning_rate": 0.0004741368109217071, "loss": 0.8934, "step": 23 }, { "epoch": 0.0037672173605933367, "grad_norm": 0.6885596513748169, "learning_rate": 0.00047008911571827283, "loss": 0.7099, "step": 24 }, { "epoch": 0.003924184750618059, "grad_norm": 0.4238623082637787, "learning_rate": 0.00046576721688302105, "loss": 0.6671, "step": 25 }, { "epoch": 0.0040811521406427815, "grad_norm": 0.4145318269729614, "learning_rate": 0.0004611764989637205, "loss": 0.907, "step": 26 }, { "epoch": 0.004238119530667504, "grad_norm": 0.47778183221817017, "learning_rate": 0.0004563226814232444, "loss": 0.9015, "step": 27 }, { "epoch": 0.004395086920692226, "grad_norm": 0.40440261363983154, "learning_rate": 0.0004512118115138315, "loss": 0.7177, "step": 28 }, { "epoch": 0.0045520543107169485, "grad_norm": 0.6247038841247559, "learning_rate": 0.0004458502567429631, "loss": 0.898, "step": 29 }, { "epoch": 0.004709021700741671, "grad_norm": 0.7316962480545044, "learning_rate": 0.00044024469694024196, "loss": 1.295, "step": 30 }, { "epoch": 0.004865989090766393, "grad_norm": 0.46687981486320496, "learning_rate": 0.00043440211593515554, "loss": 1.0014, "step": 31 }, { "epoch": 0.0050229564807911155, "grad_norm": 0.4061206877231598, "learning_rate": 0.0004283297928560951, "loss": 0.6779, "step": 32 }, { "epoch": 0.005179923870815838, "grad_norm": 0.4588050842285156, "learning_rate": 0.0004220352930614672, "loss": 0.8217, "step": 33 }, { "epoch": 0.00533689126084056, "grad_norm": 0.39541956782341003, "learning_rate": 0.00041552645871420013, "loss": 1.0593, "step": 34 }, { "epoch": 0.005493858650865283, "grad_norm": 0.32590246200561523, "learning_rate": 0.00040881139901138467, "loss": 0.5995, "step": 35 }, { "epoch": 0.005650826040890005, "grad_norm": 0.34778353571891785, "learning_rate": 0.00040189848008122475, "loss": 0.8404, "step": 36 }, { "epoch": 0.005807793430914728, "grad_norm": 0.43329522013664246, "learning_rate": 0.00039479631455988334, "loss": 0.714, "step": 37 }, { "epoch": 0.00596476082093945, "grad_norm": 0.48381856083869934, "learning_rate": 0.0003875137508612103, "loss": 0.939, "step": 38 }, { "epoch": 0.006121728210964172, "grad_norm": 0.8319804668426514, "learning_rate": 0.00038005986215272055, "loss": 0.9289, "step": 39 }, { "epoch": 0.006278695600988895, "grad_norm": 0.4785021245479584, "learning_rate": 0.0003724439350515571, "loss": 1.051, "step": 40 }, { "epoch": 0.006435662991013617, "grad_norm": 0.6576308608055115, "learning_rate": 0.0003646754580545226, "loss": 0.9983, "step": 41 }, { "epoch": 0.006592630381038339, "grad_norm": 0.4170599579811096, "learning_rate": 0.000356764109716594, "loss": 0.8185, "step": 42 }, { "epoch": 0.006749597771063062, "grad_norm": 0.4000290632247925, "learning_rate": 0.00034871974659264783, "loss": 0.8255, "step": 43 }, { "epoch": 0.006906565161087784, "grad_norm": 0.32657188177108765, "learning_rate": 0.0003405523909574206, "loss": 0.6339, "step": 44 }, { "epoch": 0.007063532551112506, "grad_norm": 0.41789180040359497, "learning_rate": 0.0003322722183190025, "loss": 0.8051, "step": 45 }, { "epoch": 0.007220499941137229, "grad_norm": 0.38286373019218445, "learning_rate": 0.0003238895447414211, "loss": 0.6807, "step": 46 }, { "epoch": 0.007377467331161951, "grad_norm": 0.3656023442745209, "learning_rate": 0.0003154148139921102, "loss": 0.6798, "step": 47 }, { "epoch": 0.007534434721186673, "grad_norm": 0.4634772837162018, "learning_rate": 0.00030685858453027663, "loss": 0.8384, "step": 48 }, { "epoch": 0.007691402111211396, "grad_norm": 0.3100186884403229, "learning_rate": 0.0002982315163523742, "loss": 0.7984, "step": 49 }, { "epoch": 0.007848369501236118, "grad_norm": 0.608703076839447, "learning_rate": 0.000289544357711076, "loss": 1.2016, "step": 50 }, { "epoch": 0.007848369501236118, "eval_loss": 0.898485004901886, "eval_runtime": 58.4598, "eval_samples_per_second": 45.895, "eval_steps_per_second": 22.956, "step": 50 }, { "epoch": 0.008005336891260841, "grad_norm": 0.3083370625972748, "learning_rate": 0.0002808079317242896, "loss": 0.6572, "step": 51 }, { "epoch": 0.008162304281285563, "grad_norm": 0.3935853838920593, "learning_rate": 0.0002720331228909005, "loss": 1.0255, "step": 52 }, { "epoch": 0.008319271671310285, "grad_norm": 0.5018428564071655, "learning_rate": 0.00026323086353004075, "loss": 0.7937, "step": 53 }, { "epoch": 0.008476239061335008, "grad_norm": 0.27589741349220276, "learning_rate": 0.0002544121201607822, "loss": 0.5505, "step": 54 }, { "epoch": 0.00863320645135973, "grad_norm": 0.47793063521385193, "learning_rate": 0.00024558787983921783, "loss": 0.7793, "step": 55 }, { "epoch": 0.008790173841384452, "grad_norm": 0.6015914082527161, "learning_rate": 0.0002367691364699592, "loss": 1.0549, "step": 56 }, { "epoch": 0.008947141231409175, "grad_norm": 0.42470067739486694, "learning_rate": 0.00022796687710909964, "loss": 1.1239, "step": 57 }, { "epoch": 0.009104108621433897, "grad_norm": 0.41093453764915466, "learning_rate": 0.00021919206827571036, "loss": 1.0407, "step": 58 }, { "epoch": 0.009261076011458619, "grad_norm": 0.41134944558143616, "learning_rate": 0.00021045564228892402, "loss": 0.6677, "step": 59 }, { "epoch": 0.009418043401483342, "grad_norm": 0.38528087735176086, "learning_rate": 0.00020176848364762578, "loss": 1.0211, "step": 60 }, { "epoch": 0.009575010791508064, "grad_norm": 0.38283631205558777, "learning_rate": 0.00019314141546972343, "loss": 0.8785, "step": 61 }, { "epoch": 0.009731978181532786, "grad_norm": 0.3168281316757202, "learning_rate": 0.00018458518600788986, "loss": 0.8723, "step": 62 }, { "epoch": 0.00988894557155751, "grad_norm": 0.32666337490081787, "learning_rate": 0.00017611045525857898, "loss": 0.9419, "step": 63 }, { "epoch": 0.010045912961582231, "grad_norm": 0.40057867765426636, "learning_rate": 0.0001677277816809975, "loss": 0.9665, "step": 64 }, { "epoch": 0.010202880351606953, "grad_norm": 0.46375736594200134, "learning_rate": 0.00015944760904257942, "loss": 0.9058, "step": 65 }, { "epoch": 0.010359847741631676, "grad_norm": 0.3828064203262329, "learning_rate": 0.0001512802534073522, "loss": 0.6986, "step": 66 }, { "epoch": 0.010516815131656398, "grad_norm": 0.38636311888694763, "learning_rate": 0.00014323589028340596, "loss": 0.817, "step": 67 }, { "epoch": 0.01067378252168112, "grad_norm": 0.41518744826316833, "learning_rate": 0.00013532454194547733, "loss": 0.6401, "step": 68 }, { "epoch": 0.010830749911705843, "grad_norm": 0.5281746983528137, "learning_rate": 0.00012755606494844294, "loss": 0.8537, "step": 69 }, { "epoch": 0.010987717301730565, "grad_norm": 0.41452756524086, "learning_rate": 0.00011994013784727947, "loss": 0.8924, "step": 70 }, { "epoch": 0.011144684691755289, "grad_norm": 0.45944878458976746, "learning_rate": 0.00011248624913878966, "loss": 0.7256, "step": 71 }, { "epoch": 0.01130165208178001, "grad_norm": 0.6275045275688171, "learning_rate": 0.0001052036854401166, "loss": 0.9691, "step": 72 }, { "epoch": 0.011458619471804732, "grad_norm": 0.5686112642288208, "learning_rate": 9.810151991877531e-05, "loss": 0.8182, "step": 73 }, { "epoch": 0.011615586861829456, "grad_norm": 0.3548356592655182, "learning_rate": 9.118860098861537e-05, "loss": 0.5956, "step": 74 }, { "epoch": 0.011772554251854177, "grad_norm": 0.4017604887485504, "learning_rate": 8.44735412857999e-05, "loss": 0.6562, "step": 75 }, { "epoch": 0.0119295216418789, "grad_norm": 0.3947277367115021, "learning_rate": 7.79647069385328e-05, "loss": 1.0559, "step": 76 }, { "epoch": 0.012086489031903623, "grad_norm": 0.320968896150589, "learning_rate": 7.167020714390501e-05, "loss": 0.547, "step": 77 }, { "epoch": 0.012243456421928344, "grad_norm": 0.3922961950302124, "learning_rate": 6.559788406484446e-05, "loss": 0.9905, "step": 78 }, { "epoch": 0.012400423811953066, "grad_norm": 0.41221266984939575, "learning_rate": 5.975530305975807e-05, "loss": 0.8861, "step": 79 }, { "epoch": 0.01255739120197779, "grad_norm": 0.3318183422088623, "learning_rate": 5.414974325703686e-05, "loss": 0.7566, "step": 80 }, { "epoch": 0.012714358592002512, "grad_norm": 0.4427206814289093, "learning_rate": 4.8788188486168616e-05, "loss": 0.7644, "step": 81 }, { "epoch": 0.012871325982027233, "grad_norm": 0.4162525236606598, "learning_rate": 4.367731857675569e-05, "loss": 0.6616, "step": 82 }, { "epoch": 0.013028293372051957, "grad_norm": 0.37238290905952454, "learning_rate": 3.882350103627952e-05, "loss": 0.8344, "step": 83 }, { "epoch": 0.013185260762076679, "grad_norm": 0.43004289269447327, "learning_rate": 3.423278311697897e-05, "loss": 0.6133, "step": 84 }, { "epoch": 0.0133422281521014, "grad_norm": 0.4774795472621918, "learning_rate": 2.9910884281727225e-05, "loss": 0.9229, "step": 85 }, { "epoch": 0.013499195542126124, "grad_norm": 0.4710048735141754, "learning_rate": 2.586318907829291e-05, "loss": 1.003, "step": 86 }, { "epoch": 0.013656162932150846, "grad_norm": 0.48064345121383667, "learning_rate": 2.209474043086457e-05, "loss": 0.6504, "step": 87 }, { "epoch": 0.013813130322175567, "grad_norm": 0.3248710632324219, "learning_rate": 1.861023335719475e-05, "loss": 0.7729, "step": 88 }, { "epoch": 0.01397009771220029, "grad_norm": 0.43739113211631775, "learning_rate": 1.5414009119192633e-05, "loss": 1.0168, "step": 89 }, { "epoch": 0.014127065102225013, "grad_norm": 0.26098549365997314, "learning_rate": 1.25100498142523e-05, "loss": 0.6323, "step": 90 }, { "epoch": 0.014284032492249734, "grad_norm": 0.44334644079208374, "learning_rate": 9.901973414055187e-06, "loss": 0.7762, "step": 91 }, { "epoch": 0.014440999882274458, "grad_norm": 0.42882293462753296, "learning_rate": 7.593029257027956e-06, "loss": 0.9357, "step": 92 }, { "epoch": 0.01459796727229918, "grad_norm": 0.3838594853878021, "learning_rate": 5.5860940000714015e-06, "loss": 0.7766, "step": 93 }, { "epoch": 0.014754934662323901, "grad_norm": 0.2918799817562103, "learning_rate": 3.8836680346041594e-06, "loss": 0.6025, "step": 94 }, { "epoch": 0.014911902052348625, "grad_norm": 0.3367508053779602, "learning_rate": 2.487872371386424e-06, "loss": 0.7892, "step": 95 }, { "epoch": 0.015068869442373347, "grad_norm": 0.3433071970939636, "learning_rate": 1.4004459980045125e-06, "loss": 0.5108, "step": 96 }, { "epoch": 0.01522583683239807, "grad_norm": 0.4132903516292572, "learning_rate": 6.22743712309054e-07, "loss": 0.8608, "step": 97 }, { "epoch": 0.015382804222422792, "grad_norm": 0.36693432927131653, "learning_rate": 1.557344345054501e-07, "loss": 0.7531, "step": 98 }, { "epoch": 0.015539771612447514, "grad_norm": 0.42726150155067444, "learning_rate": 0.0, "loss": 0.8528, "step": 99 } ], "logging_steps": 1, "max_steps": 99, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4864473877708800.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }