{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.7199999999999998, "eval_steps": 25, "global_step": 48, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 91.39633178710938, "learning_rate": 0.0002, "loss": 9.8814, "step": 1 }, { "epoch": 0.16, "grad_norm": 69.82352447509766, "learning_rate": 0.0002, "loss": 9.508, "step": 2 }, { "epoch": 0.24, "grad_norm": 88.77470397949219, "learning_rate": 0.0002, "loss": 8.8757, "step": 3 }, { "epoch": 0.32, "grad_norm": 78.74607849121094, "learning_rate": 0.0002, "loss": 8.1277, "step": 4 }, { "epoch": 0.4, "grad_norm": 57.297943115234375, "learning_rate": 0.0002, "loss": 7.5538, "step": 5 }, { "epoch": 0.48, "grad_norm": 48.130455017089844, "learning_rate": 0.0002, "loss": 7.0051, "step": 6 }, { "epoch": 0.56, "grad_norm": 53.24027633666992, "learning_rate": 0.0002, "loss": 6.8888, "step": 7 }, { "epoch": 0.64, "grad_norm": 60.705360412597656, "learning_rate": 0.0002, "loss": 6.9736, "step": 8 }, { "epoch": 0.72, "grad_norm": 54.261192321777344, "learning_rate": 0.0002, "loss": 6.9427, "step": 9 }, { "epoch": 0.8, "grad_norm": 26.81684684753418, "learning_rate": 0.0002, "loss": 6.5488, "step": 10 }, { "epoch": 0.88, "grad_norm": 26.71163558959961, "learning_rate": 0.0002, "loss": 6.1613, "step": 11 }, { "epoch": 0.96, "grad_norm": 17.654388427734375, "learning_rate": 0.0002, "loss": 6.4909, "step": 12 }, { "epoch": 1.0, "grad_norm": 11.796930313110352, "learning_rate": 0.0002, "loss": 3.1317, "step": 13 }, { "epoch": 1.08, "grad_norm": 12.123869895935059, "learning_rate": 0.0002, "loss": 6.5345, "step": 14 }, { "epoch": 1.16, "grad_norm": 34.23923873901367, "learning_rate": 0.0002, "loss": 5.8903, "step": 15 }, { "epoch": 1.24, "grad_norm": 22.84559440612793, "learning_rate": 0.0002, "loss": 5.7779, "step": 16 }, { "epoch": 1.32, "grad_norm": 13.740588188171387, "learning_rate": 0.0002, "loss": 6.2512, "step": 17 }, { "epoch": 1.4, "grad_norm": 12.967787742614746, "learning_rate": 0.0002, "loss": 5.9554, "step": 18 }, { "epoch": 1.48, "grad_norm": 17.227495193481445, "learning_rate": 0.0002, "loss": 6.0653, "step": 19 }, { "epoch": 1.56, "grad_norm": 32.50784683227539, "learning_rate": 0.0002, "loss": 6.7432, "step": 20 }, { "epoch": 1.6400000000000001, "grad_norm": 14.103690147399902, "learning_rate": 0.0002, "loss": 6.324, "step": 21 }, { "epoch": 1.72, "grad_norm": 13.136197090148926, "learning_rate": 0.0002, "loss": 6.0088, "step": 22 }, { "epoch": 1.8, "grad_norm": 13.178682327270508, "learning_rate": 0.0002, "loss": 6.2658, "step": 23 }, { "epoch": 1.88, "grad_norm": 8.991060256958008, "learning_rate": 0.0002, "loss": 6.3406, "step": 24 }, { "epoch": 1.96, "grad_norm": 11.259429931640625, "learning_rate": 0.0002, "loss": 6.4484, "step": 25 }, { "epoch": 1.96, "eval_clap": 0.1398070603609085, "eval_loss": 5.790895462036133, "eval_runtime": 105.0758, "eval_samples_per_second": 0.152, "eval_steps_per_second": 0.152, "step": 25 }, { "epoch": 2.0, "grad_norm": 5.0106096267700195, "learning_rate": 0.0002, "loss": 3.2002, "step": 26 }, { "epoch": 2.08, "grad_norm": 20.214805603027344, "learning_rate": 0.0002, "loss": 5.9964, "step": 27 }, { "epoch": 2.16, "grad_norm": 9.884445190429688, "learning_rate": 0.0002, "loss": 6.158, "step": 28 }, { "epoch": 2.24, "grad_norm": 29.28801918029785, "learning_rate": 0.0002, "loss": 5.6601, "step": 29 }, { "epoch": 2.32, "grad_norm": 8.058592796325684, "learning_rate": 0.0002, "loss": 6.1302, "step": 30 }, { "epoch": 2.4, "grad_norm": 14.248044967651367, "learning_rate": 0.0002, "loss": 6.22, "step": 31 }, { "epoch": 2.48, "grad_norm": 13.470457077026367, "learning_rate": 0.0002, "loss": 6.0524, "step": 32 }, { "epoch": 2.56, "grad_norm": 14.079459190368652, "learning_rate": 0.0002, "loss": 6.2468, "step": 33 }, { "epoch": 2.64, "grad_norm": 22.368366241455078, "learning_rate": 0.0002, "loss": 6.6833, "step": 34 }, { "epoch": 2.7199999999999998, "grad_norm": 8.494449615478516, "learning_rate": 0.0002, "loss": 5.9105, "step": 35 }, { "epoch": 2.8, "grad_norm": 7.31801176071167, "learning_rate": 0.0002, "loss": 6.1115, "step": 36 }, { "epoch": 2.88, "grad_norm": 12.499704360961914, "learning_rate": 0.0002, "loss": 5.8303, "step": 37 }, { "epoch": 2.96, "grad_norm": 12.041208267211914, "learning_rate": 0.0002, "loss": 6.2806, "step": 38 }, { "epoch": 3.0, "grad_norm": 9.749319076538086, "learning_rate": 0.0002, "loss": 2.9795, "step": 39 }, { "epoch": 3.08, "grad_norm": 7.609906196594238, "learning_rate": 0.0002, "loss": 6.2322, "step": 40 }, { "epoch": 3.16, "grad_norm": 8.086981773376465, "learning_rate": 0.0002, "loss": 6.1848, "step": 41 }, { "epoch": 3.24, "grad_norm": 22.7243709564209, "learning_rate": 0.0002, "loss": 5.6357, "step": 42 }, { "epoch": 3.32, "grad_norm": 13.268460273742676, "learning_rate": 0.0002, "loss": 6.1597, "step": 43 }, { "epoch": 3.4, "grad_norm": 9.500421524047852, "learning_rate": 0.0002, "loss": 6.0353, "step": 44 }, { "epoch": 3.48, "grad_norm": 21.6617431640625, "learning_rate": 0.0002, "loss": 5.6887, "step": 45 }, { "epoch": 3.56, "grad_norm": 14.880556106567383, "learning_rate": 0.0002, "loss": 5.6862, "step": 46 }, { "epoch": 3.64, "grad_norm": 14.079529762268066, "learning_rate": 0.0002, "loss": 6.1141, "step": 47 }, { "epoch": 3.7199999999999998, "grad_norm": 13.863551139831543, "learning_rate": 0.0002, "loss": 5.5132, "step": 48 }, { "epoch": 3.7199999999999998, "step": 48, "total_flos": 84973802219964.0, "train_loss": 6.279264723261197, "train_runtime": 934.4841, "train_samples_per_second": 0.856, "train_steps_per_second": 0.051 } ], "logging_steps": 1.0, "max_steps": 48, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 84973802219964.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }