{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1697072549851506, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016970725498515061, "eval_loss": 0.7674897909164429, "eval_runtime": 71.4945, "eval_samples_per_second": 13.889, "eval_steps_per_second": 1.748, "step": 1 }, { "epoch": 0.005091217649554518, "grad_norm": 0.7084022164344788, "learning_rate": 1.5e-05, "loss": 0.7782, "step": 3 }, { "epoch": 0.010182435299109036, "grad_norm": 0.799297034740448, "learning_rate": 3e-05, "loss": 0.7424, "step": 6 }, { "epoch": 0.015273652948663556, "grad_norm": 0.63190096616745, "learning_rate": 4.5e-05, "loss": 0.6727, "step": 9 }, { "epoch": 0.015273652948663556, "eval_loss": 0.6583356261253357, "eval_runtime": 72.6186, "eval_samples_per_second": 13.674, "eval_steps_per_second": 1.721, "step": 9 }, { "epoch": 0.020364870598218072, "grad_norm": 0.6133125424385071, "learning_rate": 4.993910125649561e-05, "loss": 0.6068, "step": 12 }, { "epoch": 0.025456088247772592, "grad_norm": 0.6916378736495972, "learning_rate": 4.962019382530521e-05, "loss": 0.5521, "step": 15 }, { "epoch": 0.030547305897327112, "grad_norm": 0.5799879431724548, "learning_rate": 4.9031542398457974e-05, "loss": 0.5013, "step": 18 }, { "epoch": 0.030547305897327112, "eval_loss": 0.5111116170883179, "eval_runtime": 72.6202, "eval_samples_per_second": 13.674, "eval_steps_per_second": 1.721, "step": 18 }, { "epoch": 0.03563852354688163, "grad_norm": 0.5423109531402588, "learning_rate": 4.817959636416969e-05, "loss": 0.4864, "step": 21 }, { "epoch": 0.040729741196436145, "grad_norm": 0.5172947645187378, "learning_rate": 4.707368982147318e-05, "loss": 0.4619, "step": 24 }, { "epoch": 0.04582095884599067, "grad_norm": 0.48413389921188354, "learning_rate": 4.572593931387604e-05, "loss": 0.4352, "step": 27 }, { "epoch": 0.04582095884599067, "eval_loss": 0.4471208155155182, "eval_runtime": 72.6272, "eval_samples_per_second": 13.673, "eval_steps_per_second": 1.721, "step": 27 }, { "epoch": 0.050912176495545185, "grad_norm": 0.4537797272205353, "learning_rate": 4.415111107797445e-05, "loss": 0.4472, "step": 30 }, { "epoch": 0.0560033941450997, "grad_norm": 0.5130160450935364, "learning_rate": 4.2366459261474933e-05, "loss": 0.432, "step": 33 }, { "epoch": 0.061094611794654224, "grad_norm": 0.42606261372566223, "learning_rate": 4.039153688314145e-05, "loss": 0.4192, "step": 36 }, { "epoch": 0.061094611794654224, "eval_loss": 0.4150956869125366, "eval_runtime": 72.6093, "eval_samples_per_second": 13.676, "eval_steps_per_second": 1.722, "step": 36 }, { "epoch": 0.06618582944420874, "grad_norm": 0.4711809754371643, "learning_rate": 3.824798160583012e-05, "loss": 0.4205, "step": 39 }, { "epoch": 0.07127704709376326, "grad_norm": 0.44911667704582214, "learning_rate": 3.5959278669726935e-05, "loss": 0.3916, "step": 42 }, { "epoch": 0.07636826474331777, "grad_norm": 0.7759262323379517, "learning_rate": 3.355050358314172e-05, "loss": 0.3813, "step": 45 }, { "epoch": 0.07636826474331777, "eval_loss": 0.39792174100875854, "eval_runtime": 72.5601, "eval_samples_per_second": 13.685, "eval_steps_per_second": 1.723, "step": 45 }, { "epoch": 0.08145948239287229, "grad_norm": 0.4920569658279419, "learning_rate": 3.104804738999169e-05, "loss": 0.3736, "step": 48 }, { "epoch": 0.08655070004242682, "grad_norm": 0.4970743954181671, "learning_rate": 2.8479327524001636e-05, "loss": 0.3831, "step": 51 }, { "epoch": 0.09164191769198134, "grad_norm": 0.46638166904449463, "learning_rate": 2.587248741756253e-05, "loss": 0.3872, "step": 54 }, { "epoch": 0.09164191769198134, "eval_loss": 0.3839256465435028, "eval_runtime": 72.6322, "eval_samples_per_second": 13.672, "eval_steps_per_second": 1.721, "step": 54 }, { "epoch": 0.09673313534153585, "grad_norm": 0.5236009359359741, "learning_rate": 2.3256088156396868e-05, "loss": 0.3653, "step": 57 }, { "epoch": 0.10182435299109037, "grad_norm": 0.49318793416023254, "learning_rate": 2.0658795558326743e-05, "loss": 0.3813, "step": 60 }, { "epoch": 0.10691557064064489, "grad_norm": 0.542844831943512, "learning_rate": 1.8109066104575023e-05, "loss": 0.3749, "step": 63 }, { "epoch": 0.10691557064064489, "eval_loss": 0.3750540316104889, "eval_runtime": 72.6222, "eval_samples_per_second": 13.674, "eval_steps_per_second": 1.721, "step": 63 }, { "epoch": 0.1120067882901994, "grad_norm": 0.5431241393089294, "learning_rate": 1.56348351646022e-05, "loss": 0.3732, "step": 66 }, { "epoch": 0.11709800593975392, "grad_norm": 0.5867822766304016, "learning_rate": 1.3263210930352737e-05, "loss": 0.3811, "step": 69 }, { "epoch": 0.12218922358930845, "grad_norm": 0.5038670897483826, "learning_rate": 1.1020177413231334e-05, "loss": 0.3653, "step": 72 }, { "epoch": 0.12218922358930845, "eval_loss": 0.36931174993515015, "eval_runtime": 72.6161, "eval_samples_per_second": 13.675, "eval_steps_per_second": 1.721, "step": 72 }, { "epoch": 0.12728044123886295, "grad_norm": 0.5324311852455139, "learning_rate": 8.930309757836517e-06, "loss": 0.3728, "step": 75 }, { "epoch": 0.13237165888841748, "grad_norm": 0.47567200660705566, "learning_rate": 7.016504991533726e-06, "loss": 0.3449, "step": 78 }, { "epoch": 0.137462876537972, "grad_norm": 0.46117860078811646, "learning_rate": 5.299731159831953e-06, "loss": 0.3702, "step": 81 }, { "epoch": 0.137462876537972, "eval_loss": 0.3658164441585541, "eval_runtime": 72.5356, "eval_samples_per_second": 13.69, "eval_steps_per_second": 1.723, "step": 81 }, { "epoch": 0.14255409418752651, "grad_norm": 0.4353705644607544, "learning_rate": 3.798797596089351e-06, "loss": 0.3578, "step": 84 }, { "epoch": 0.14764531183708104, "grad_norm": 0.5827435851097107, "learning_rate": 2.5301488425208296e-06, "loss": 0.3565, "step": 87 }, { "epoch": 0.15273652948663555, "grad_norm": 0.45924094319343567, "learning_rate": 1.5076844803522922e-06, "loss": 0.3456, "step": 90 }, { "epoch": 0.15273652948663555, "eval_loss": 0.3645370602607727, "eval_runtime": 72.5734, "eval_samples_per_second": 13.683, "eval_steps_per_second": 1.722, "step": 90 }, { "epoch": 0.15782774713619008, "grad_norm": 0.5294960141181946, "learning_rate": 7.426068431000882e-07, "loss": 0.3643, "step": 93 }, { "epoch": 0.16291896478574458, "grad_norm": 0.4545383155345917, "learning_rate": 2.4329828146074095e-07, "loss": 0.3463, "step": 96 }, { "epoch": 0.1680101824352991, "grad_norm": 0.6580240726470947, "learning_rate": 1.522932452260595e-08, "loss": 0.3619, "step": 99 }, { "epoch": 0.1680101824352991, "eval_loss": 0.364297091960907, "eval_runtime": 72.5705, "eval_samples_per_second": 13.683, "eval_steps_per_second": 1.722, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4874840035386982e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }