{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03544776119402985, "eval_steps": 38, "global_step": 38, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009328358208955224, "grad_norm": 2.931856870651245, "learning_rate": 4e-05, "loss": 3.1183, "step": 1 }, { "epoch": 0.0009328358208955224, "eval_loss": 0.7989240884780884, "eval_runtime": 61.068, "eval_samples_per_second": 7.402, "eval_steps_per_second": 3.701, "step": 1 }, { "epoch": 0.0018656716417910447, "grad_norm": 3.0857083797454834, "learning_rate": 8e-05, "loss": 3.3967, "step": 2 }, { "epoch": 0.002798507462686567, "grad_norm": 2.9495201110839844, "learning_rate": 0.00012, "loss": 3.4361, "step": 3 }, { "epoch": 0.0037313432835820895, "grad_norm": 2.4459776878356934, "learning_rate": 0.00016, "loss": 3.2275, "step": 4 }, { "epoch": 0.0046641791044776115, "grad_norm": 2.410581588745117, "learning_rate": 0.0002, "loss": 3.0069, "step": 5 }, { "epoch": 0.005597014925373134, "grad_norm": 1.99094820022583, "learning_rate": 0.00024, "loss": 2.6997, "step": 6 }, { "epoch": 0.0065298507462686565, "grad_norm": 1.850408911705017, "learning_rate": 0.00028, "loss": 2.6469, "step": 7 }, { "epoch": 0.007462686567164179, "grad_norm": 2.972104072570801, "learning_rate": 0.00032, "loss": 3.351, "step": 8 }, { "epoch": 0.008395522388059701, "grad_norm": 2.8936173915863037, "learning_rate": 0.00036, "loss": 2.8041, "step": 9 }, { "epoch": 0.009328358208955223, "grad_norm": 2.354464530944824, "learning_rate": 0.0004, "loss": 2.7652, "step": 10 }, { "epoch": 0.010261194029850746, "grad_norm": 2.024070978164673, "learning_rate": 0.0003999496469885013, "loss": 2.8786, "step": 11 }, { "epoch": 0.011194029850746268, "grad_norm": 1.8775830268859863, "learning_rate": 0.00039979861330826294, "loss": 2.0356, "step": 12 }, { "epoch": 0.012126865671641791, "grad_norm": 1.7130846977233887, "learning_rate": 0.0003995469750092912, "loss": 2.6772, "step": 13 }, { "epoch": 0.013059701492537313, "grad_norm": 1.9945520162582397, "learning_rate": 0.00039919485879904784, "loss": 2.8959, "step": 14 }, { "epoch": 0.013992537313432836, "grad_norm": 1.6973563432693481, "learning_rate": 0.00039874244197864856, "loss": 2.3531, "step": 15 }, { "epoch": 0.014925373134328358, "grad_norm": 1.5936486721038818, "learning_rate": 0.00039818995235358696, "loss": 1.9481, "step": 16 }, { "epoch": 0.01585820895522388, "grad_norm": 1.828995943069458, "learning_rate": 0.00039753766811902755, "loss": 3.6787, "step": 17 }, { "epoch": 0.016791044776119403, "grad_norm": 1.553421139717102, "learning_rate": 0.0003967859177197259, "loss": 2.2696, "step": 18 }, { "epoch": 0.017723880597014924, "grad_norm": 1.2893372774124146, "learning_rate": 0.00039593507968464716, "loss": 1.9452, "step": 19 }, { "epoch": 0.018656716417910446, "grad_norm": 1.674631953239441, "learning_rate": 0.0003949855824363647, "loss": 2.2916, "step": 20 }, { "epoch": 0.01958955223880597, "grad_norm": 1.4569449424743652, "learning_rate": 0.0003939379040753374, "loss": 2.1701, "step": 21 }, { "epoch": 0.020522388059701493, "grad_norm": 1.438828468322754, "learning_rate": 0.00039279257213917066, "loss": 2.5621, "step": 22 }, { "epoch": 0.021455223880597014, "grad_norm": 1.3770123720169067, "learning_rate": 0.0003915501633369861, "loss": 2.1366, "step": 23 }, { "epoch": 0.022388059701492536, "grad_norm": 1.4712828397750854, "learning_rate": 0.00039021130325903074, "loss": 2.319, "step": 24 }, { "epoch": 0.02332089552238806, "grad_norm": 
1.465248465538025, "learning_rate": 0.00038877666606167355, "loss": 2.3959, "step": 25 }, { "epoch": 0.024253731343283583, "grad_norm": 1.4384740591049194, "learning_rate": 0.00038724697412794747, "loss": 2.097, "step": 26 }, { "epoch": 0.025186567164179104, "grad_norm": 1.3539812564849854, "learning_rate": 0.0003856229977038078, "loss": 2.2313, "step": 27 }, { "epoch": 0.026119402985074626, "grad_norm": 1.3618801832199097, "learning_rate": 0.0003839055545102902, "loss": 2.1054, "step": 28 }, { "epoch": 0.027052238805970148, "grad_norm": 1.422633409500122, "learning_rate": 0.00038209550933176323, "loss": 2.276, "step": 29 }, { "epoch": 0.027985074626865673, "grad_norm": 1.4232622385025024, "learning_rate": 0.0003801937735804838, "loss": 2.1735, "step": 30 }, { "epoch": 0.028917910447761194, "grad_norm": 1.4555679559707642, "learning_rate": 0.0003782013048376736, "loss": 2.2706, "step": 31 }, { "epoch": 0.029850746268656716, "grad_norm": 1.2929563522338867, "learning_rate": 0.0003761191063713476, "loss": 1.9037, "step": 32 }, { "epoch": 0.030783582089552237, "grad_norm": 1.2687627077102661, "learning_rate": 0.0003739482266311391, "loss": 2.1032, "step": 33 }, { "epoch": 0.03171641791044776, "grad_norm": 1.2993357181549072, "learning_rate": 0.00037168975872037323, "loss": 2.0062, "step": 34 }, { "epoch": 0.03264925373134328, "grad_norm": 1.3507018089294434, "learning_rate": 0.00036934483984565685, "loss": 2.1522, "step": 35 }, { "epoch": 0.033582089552238806, "grad_norm": 1.4183921813964844, "learning_rate": 0.00036691465074426054, "loss": 1.845, "step": 36 }, { "epoch": 0.03451492537313433, "grad_norm": 1.3370906114578247, "learning_rate": 0.00036440041508958203, "loss": 1.9448, "step": 37 }, { "epoch": 0.03544776119402985, "grad_norm": 1.4249347448349, "learning_rate": 0.0003618033988749895, "loss": 2.4594, "step": 38 }, { "epoch": 0.03544776119402985, "eval_loss": 0.5035107135772705, "eval_runtime": 60.2427, "eval_samples_per_second": 7.503, "eval_steps_per_second": 3.751, "step": 38 } ], "logging_steps": 1, "max_steps": 150, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 38, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4981999911567360.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }