dzanbek's picture
Training in progress, step 20, checkpoint
8c7dbfd verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.002889233991837914,
"eval_steps": 2,
"global_step": 20,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001444616995918957,
"grad_norm": 29.741775512695312,
"learning_rate": 1e-05,
"loss": 21.916,
"step": 1
},
{
"epoch": 0.0001444616995918957,
"eval_loss": 4.278669834136963,
"eval_runtime": 411.0727,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 3.547,
"step": 1
},
{
"epoch": 0.0002889233991837914,
"grad_norm": 28.80156707763672,
"learning_rate": 2e-05,
"loss": 18.1093,
"step": 2
},
{
"epoch": 0.0002889233991837914,
"eval_loss": 4.272582054138184,
"eval_runtime": 412.4395,
"eval_samples_per_second": 7.068,
"eval_steps_per_second": 3.535,
"step": 2
},
{
"epoch": 0.0004333850987756871,
"grad_norm": 27.65138816833496,
"learning_rate": 3e-05,
"loss": 11.7399,
"step": 3
},
{
"epoch": 0.0005778467983675828,
"grad_norm": 33.123443603515625,
"learning_rate": 4e-05,
"loss": 15.397,
"step": 4
},
{
"epoch": 0.0005778467983675828,
"eval_loss": 4.1736063957214355,
"eval_runtime": 412.3727,
"eval_samples_per_second": 7.069,
"eval_steps_per_second": 3.536,
"step": 4
},
{
"epoch": 0.0007223084979594785,
"grad_norm": 48.433624267578125,
"learning_rate": 5e-05,
"loss": 16.1812,
"step": 5
},
{
"epoch": 0.0008667701975513742,
"grad_norm": 30.907163619995117,
"learning_rate": 6e-05,
"loss": 13.8531,
"step": 6
},
{
"epoch": 0.0008667701975513742,
"eval_loss": 3.9675278663635254,
"eval_runtime": 412.3112,
"eval_samples_per_second": 7.07,
"eval_steps_per_second": 3.536,
"step": 6
},
{
"epoch": 0.0010112318971432699,
"grad_norm": 25.792160034179688,
"learning_rate": 7e-05,
"loss": 22.1435,
"step": 7
},
{
"epoch": 0.0011556935967351656,
"grad_norm": 22.48407745361328,
"learning_rate": 8e-05,
"loss": 17.8129,
"step": 8
},
{
"epoch": 0.0011556935967351656,
"eval_loss": 3.734724998474121,
"eval_runtime": 412.2645,
"eval_samples_per_second": 7.071,
"eval_steps_per_second": 3.537,
"step": 8
},
{
"epoch": 0.0013001552963270613,
"grad_norm": 21.80668830871582,
"learning_rate": 9e-05,
"loss": 13.4756,
"step": 9
},
{
"epoch": 0.001444616995918957,
"grad_norm": 18.533721923828125,
"learning_rate": 0.0001,
"loss": 12.6123,
"step": 10
},
{
"epoch": 0.001444616995918957,
"eval_loss": 3.4023940563201904,
"eval_runtime": 412.1148,
"eval_samples_per_second": 7.073,
"eval_steps_per_second": 3.538,
"step": 10
},
{
"epoch": 0.0015890786955108527,
"grad_norm": 12.594943046569824,
"learning_rate": 9.755282581475769e-05,
"loss": 9.2502,
"step": 11
},
{
"epoch": 0.0017335403951027485,
"grad_norm": 28.27761459350586,
"learning_rate": 9.045084971874738e-05,
"loss": 14.809,
"step": 12
},
{
"epoch": 0.0017335403951027485,
"eval_loss": 2.9927456378936768,
"eval_runtime": 412.4787,
"eval_samples_per_second": 7.067,
"eval_steps_per_second": 3.535,
"step": 12
},
{
"epoch": 0.001878002094694644,
"grad_norm": 24.574901580810547,
"learning_rate": 7.938926261462366e-05,
"loss": 13.8176,
"step": 13
},
{
"epoch": 0.0020224637942865397,
"grad_norm": 49.8048210144043,
"learning_rate": 6.545084971874738e-05,
"loss": 16.2677,
"step": 14
},
{
"epoch": 0.0020224637942865397,
"eval_loss": 2.5962870121002197,
"eval_runtime": 412.0906,
"eval_samples_per_second": 7.074,
"eval_steps_per_second": 3.538,
"step": 14
},
{
"epoch": 0.0021669254938784356,
"grad_norm": 12.992769241333008,
"learning_rate": 5e-05,
"loss": 8.358,
"step": 15
},
{
"epoch": 0.002311387193470331,
"grad_norm": 10.932060241699219,
"learning_rate": 3.4549150281252636e-05,
"loss": 9.3938,
"step": 16
},
{
"epoch": 0.002311387193470331,
"eval_loss": 2.366903066635132,
"eval_runtime": 412.0962,
"eval_samples_per_second": 7.074,
"eval_steps_per_second": 3.538,
"step": 16
},
{
"epoch": 0.002455848893062227,
"grad_norm": 34.9987907409668,
"learning_rate": 2.061073738537635e-05,
"loss": 12.4402,
"step": 17
},
{
"epoch": 0.0026003105926541226,
"grad_norm": 39.801536560058594,
"learning_rate": 9.549150281252633e-06,
"loss": 9.872,
"step": 18
},
{
"epoch": 0.0026003105926541226,
"eval_loss": 2.25581955909729,
"eval_runtime": 412.3441,
"eval_samples_per_second": 7.069,
"eval_steps_per_second": 3.536,
"step": 18
},
{
"epoch": 0.002744772292246018,
"grad_norm": 41.84544372558594,
"learning_rate": 2.4471741852423237e-06,
"loss": 10.0533,
"step": 19
},
{
"epoch": 0.002889233991837914,
"grad_norm": 16.11212158203125,
"learning_rate": 0.0,
"loss": 7.1957,
"step": 20
},
{
"epoch": 0.002889233991837914,
"eval_loss": 2.2330732345581055,
"eval_runtime": 411.9801,
"eval_samples_per_second": 7.076,
"eval_steps_per_second": 3.539,
"step": 20
}
],
"logging_steps": 1,
"max_steps": 20,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6534646443540480.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}