mamung's picture
Training in progress, step 100, checkpoint
51078f9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0785391714117416,
"eval_steps": 9,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007853917141174161,
"eval_loss": 8.724822998046875,
"eval_runtime": 233.5003,
"eval_samples_per_second": 19.392,
"eval_steps_per_second": 2.424,
"step": 1
},
{
"epoch": 0.0039269585705870805,
"grad_norm": 6.884418964385986,
"learning_rate": 3.75e-05,
"loss": 8.547,
"step": 5
},
{
"epoch": 0.007068525427056745,
"eval_loss": 6.619219779968262,
"eval_runtime": 233.7681,
"eval_samples_per_second": 19.37,
"eval_steps_per_second": 2.421,
"step": 9
},
{
"epoch": 0.007853917141174161,
"grad_norm": 9.305004119873047,
"learning_rate": 7.5e-05,
"loss": 7.0279,
"step": 10
},
{
"epoch": 0.011780875711761242,
"grad_norm": 8.224003791809082,
"learning_rate": 0.0001125,
"loss": 5.2814,
"step": 15
},
{
"epoch": 0.01413705085411349,
"eval_loss": 4.465071678161621,
"eval_runtime": 233.7503,
"eval_samples_per_second": 19.371,
"eval_steps_per_second": 2.421,
"step": 18
},
{
"epoch": 0.015707834282348322,
"grad_norm": 8.007319450378418,
"learning_rate": 0.00015,
"loss": 4.5609,
"step": 20
},
{
"epoch": 0.0196347928529354,
"grad_norm": 8.696537017822266,
"learning_rate": 0.00014855889603024227,
"loss": 4.2927,
"step": 25
},
{
"epoch": 0.021205576281170233,
"eval_loss": 4.00045919418335,
"eval_runtime": 233.7374,
"eval_samples_per_second": 19.372,
"eval_steps_per_second": 2.422,
"step": 27
},
{
"epoch": 0.023561751423522483,
"grad_norm": 6.938048839569092,
"learning_rate": 0.0001442909649383465,
"loss": 4.0025,
"step": 30
},
{
"epoch": 0.027488709994109562,
"grad_norm": 7.009843826293945,
"learning_rate": 0.0001373602209226909,
"loss": 3.8531,
"step": 35
},
{
"epoch": 0.02827410170822698,
"eval_loss": 3.82582426071167,
"eval_runtime": 233.7172,
"eval_samples_per_second": 19.374,
"eval_steps_per_second": 2.422,
"step": 36
},
{
"epoch": 0.031415668564696644,
"grad_norm": 6.201370716094971,
"learning_rate": 0.00012803300858899104,
"loss": 3.719,
"step": 40
},
{
"epoch": 0.03534262713528372,
"grad_norm": 5.8150105476379395,
"learning_rate": 0.00011666776747647015,
"loss": 3.6351,
"step": 45
},
{
"epoch": 0.03534262713528372,
"eval_loss": 3.6261699199676514,
"eval_runtime": 233.6588,
"eval_samples_per_second": 19.379,
"eval_steps_per_second": 2.422,
"step": 45
},
{
"epoch": 0.0392695857058708,
"grad_norm": 4.917875289916992,
"learning_rate": 0.00010370125742738173,
"loss": 3.6745,
"step": 50
},
{
"epoch": 0.042411152562340466,
"eval_loss": 3.5405988693237305,
"eval_runtime": 233.7238,
"eval_samples_per_second": 19.373,
"eval_steps_per_second": 2.422,
"step": 54
},
{
"epoch": 0.04319654427645788,
"grad_norm": 4.494648456573486,
"learning_rate": 8.963177415120962e-05,
"loss": 3.5363,
"step": 55
},
{
"epoch": 0.04712350284704497,
"grad_norm": 6.017573833465576,
"learning_rate": 7.5e-05,
"loss": 3.2906,
"step": 60
},
{
"epoch": 0.04947967798939721,
"eval_loss": 3.450070381164551,
"eval_runtime": 233.7549,
"eval_samples_per_second": 19.371,
"eval_steps_per_second": 2.421,
"step": 63
},
{
"epoch": 0.051050461417632045,
"grad_norm": 4.84181022644043,
"learning_rate": 6.036822584879038e-05,
"loss": 3.3757,
"step": 65
},
{
"epoch": 0.054977419988219124,
"grad_norm": 6.739719390869141,
"learning_rate": 4.6298742572618266e-05,
"loss": 3.4344,
"step": 70
},
{
"epoch": 0.05654820341645396,
"eval_loss": 3.3793962001800537,
"eval_runtime": 233.8278,
"eval_samples_per_second": 19.365,
"eval_steps_per_second": 2.421,
"step": 72
},
{
"epoch": 0.0589043785588062,
"grad_norm": 4.579084873199463,
"learning_rate": 3.333223252352985e-05,
"loss": 3.3768,
"step": 75
},
{
"epoch": 0.06283133712939329,
"grad_norm": 5.169261932373047,
"learning_rate": 2.1966991411008938e-05,
"loss": 3.4602,
"step": 80
},
{
"epoch": 0.0636167288435107,
"eval_loss": 3.332181453704834,
"eval_runtime": 233.7069,
"eval_samples_per_second": 19.375,
"eval_steps_per_second": 2.422,
"step": 81
},
{
"epoch": 0.06675829569998036,
"grad_norm": 5.313036918640137,
"learning_rate": 1.2639779077309098e-05,
"loss": 3.3791,
"step": 85
},
{
"epoch": 0.07068525427056745,
"grad_norm": 4.830722332000732,
"learning_rate": 5.709035061653494e-06,
"loss": 3.0172,
"step": 90
},
{
"epoch": 0.07068525427056745,
"eval_loss": 3.3156657218933105,
"eval_runtime": 233.6444,
"eval_samples_per_second": 19.38,
"eval_steps_per_second": 2.422,
"step": 90
},
{
"epoch": 0.07461221284115453,
"grad_norm": 5.533350944519043,
"learning_rate": 1.4411039697577175e-06,
"loss": 3.4561,
"step": 95
},
{
"epoch": 0.07775377969762419,
"eval_loss": 3.3061177730560303,
"eval_runtime": 233.6519,
"eval_samples_per_second": 19.379,
"eval_steps_per_second": 2.422,
"step": 99
},
{
"epoch": 0.0785391714117416,
"grad_norm": 4.896818161010742,
"learning_rate": 0.0,
"loss": 2.9994,
"step": 100
}
],
"logging_steps": 5,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 9,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.508513578745856e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}