{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.08991310918020401,
"eval_steps": 17,
"global_step": 119,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007555723460521345,
"eval_loss": 12.450613975524902,
"eval_runtime": 61.6454,
"eval_samples_per_second": 36.175,
"eval_steps_per_second": 4.526,
"step": 1
},
{
"epoch": 0.0022667170381564035,
"grad_norm": 0.009174207225441933,
"learning_rate": 3e-05,
"loss": 12.453,
"step": 3
},
{
"epoch": 0.004533434076312807,
"grad_norm": 0.009654853492975235,
"learning_rate": 6e-05,
"loss": 12.4526,
"step": 6
},
{
"epoch": 0.00680015111446921,
"grad_norm": 0.008986242115497589,
"learning_rate": 9e-05,
"loss": 12.4507,
"step": 9
},
{
"epoch": 0.009066868152625614,
"grad_norm": 0.009755842387676239,
"learning_rate": 9.997266286704631e-05,
"loss": 12.4496,
"step": 12
},
{
"epoch": 0.011333585190782017,
"grad_norm": 0.011493120342493057,
"learning_rate": 9.98292246503335e-05,
"loss": 12.4487,
"step": 15
},
{
"epoch": 0.012844729882886286,
"eval_loss": 12.450437545776367,
"eval_runtime": 61.7871,
"eval_samples_per_second": 36.092,
"eval_steps_per_second": 4.516,
"step": 17
},
{
"epoch": 0.01360030222893842,
"grad_norm": 0.009524165652692318,
"learning_rate": 9.956320346634876e-05,
"loss": 12.454,
"step": 18
},
{
"epoch": 0.015867019267094825,
"grad_norm": 0.012806428596377373,
"learning_rate": 9.917525374361912e-05,
"loss": 12.4518,
"step": 21
},
{
"epoch": 0.018133736305251228,
"grad_norm": 0.011713715270161629,
"learning_rate": 9.86663298624003e-05,
"loss": 12.4506,
"step": 24
},
{
"epoch": 0.02040045334340763,
"grad_norm": 0.010967453010380268,
"learning_rate": 9.803768380684242e-05,
"loss": 12.4491,
"step": 27
},
{
"epoch": 0.022667170381564034,
"grad_norm": 0.0131283700466156,
"learning_rate": 9.729086208503174e-05,
"loss": 12.4512,
"step": 30
},
{
"epoch": 0.024933887419720437,
"grad_norm": 0.012522528879344463,
"learning_rate": 9.642770192448536e-05,
"loss": 12.4509,
"step": 33
},
{
"epoch": 0.02568945976577257,
"eval_loss": 12.45008373260498,
"eval_runtime": 61.8082,
"eval_samples_per_second": 36.079,
"eval_steps_per_second": 4.514,
"step": 34
},
{
"epoch": 0.02720060445787684,
"grad_norm": 0.013970930129289627,
"learning_rate": 9.545032675245813e-05,
"loss": 12.4541,
"step": 36
},
{
"epoch": 0.029467321496033247,
"grad_norm": 0.014797762967646122,
"learning_rate": 9.43611409721806e-05,
"loss": 12.4476,
"step": 39
},
{
"epoch": 0.03173403853418965,
"grad_norm": 0.01638209819793701,
"learning_rate": 9.316282404787871e-05,
"loss": 12.4498,
"step": 42
},
{
"epoch": 0.03400075557234605,
"grad_norm": 0.017414981499314308,
"learning_rate": 9.185832391312644e-05,
"loss": 12.4489,
"step": 45
},
{
"epoch": 0.036267472610502456,
"grad_norm": 0.01976948417723179,
"learning_rate": 9.045084971874738e-05,
"loss": 12.4498,
"step": 48
},
{
"epoch": 0.03853418964865886,
"grad_norm": 0.017705973237752914,
"learning_rate": 8.894386393810563e-05,
"loss": 12.4478,
"step": 51
},
{
"epoch": 0.03853418964865886,
"eval_loss": 12.449511528015137,
"eval_runtime": 61.8317,
"eval_samples_per_second": 36.066,
"eval_steps_per_second": 4.512,
"step": 51
},
{
"epoch": 0.04080090668681526,
"grad_norm": 0.018775586038827896,
"learning_rate": 8.73410738492077e-05,
"loss": 12.4476,
"step": 54
},
{
"epoch": 0.043067623724971665,
"grad_norm": 0.020082606002688408,
"learning_rate": 8.564642241456986e-05,
"loss": 12.4519,
"step": 57
},
{
"epoch": 0.04533434076312807,
"grad_norm": 0.01945601962506771,
"learning_rate": 8.386407858128706e-05,
"loss": 12.4513,
"step": 60
},
{
"epoch": 0.04760105780128447,
"grad_norm": 0.02504323236644268,
"learning_rate": 8.199842702516583e-05,
"loss": 12.451,
"step": 63
},
{
"epoch": 0.049867774839440875,
"grad_norm": 0.02477116324007511,
"learning_rate": 8.005405736415126e-05,
"loss": 12.4508,
"step": 66
},
{
"epoch": 0.05137891953154514,
"eval_loss": 12.448628425598145,
"eval_runtime": 61.7963,
"eval_samples_per_second": 36.086,
"eval_steps_per_second": 4.515,
"step": 68
},
{
"epoch": 0.05213449187759728,
"grad_norm": 0.02452562004327774,
"learning_rate": 7.803575286758364e-05,
"loss": 12.4501,
"step": 69
},
{
"epoch": 0.05440120891575368,
"grad_norm": 0.024608276784420013,
"learning_rate": 7.594847868906076e-05,
"loss": 12.449,
"step": 72
},
{
"epoch": 0.056667925953910084,
"grad_norm": 0.025729818269610405,
"learning_rate": 7.379736965185368e-05,
"loss": 12.4491,
"step": 75
},
{
"epoch": 0.058934642992066494,
"grad_norm": 0.026810096576809883,
"learning_rate": 7.158771761692464e-05,
"loss": 12.4492,
"step": 78
},
{
"epoch": 0.0612013600302229,
"grad_norm": 0.030193008482456207,
"learning_rate": 6.932495846462261e-05,
"loss": 12.4488,
"step": 81
},
{
"epoch": 0.0634680770683793,
"grad_norm": 0.0322081558406353,
"learning_rate": 6.701465872208216e-05,
"loss": 12.4457,
"step": 84
},
{
"epoch": 0.06422364941443143,
"eval_loss": 12.447479248046875,
"eval_runtime": 61.6601,
"eval_samples_per_second": 36.166,
"eval_steps_per_second": 4.525,
"step": 85
},
{
"epoch": 0.0657347941065357,
"grad_norm": 0.03805829957127571,
"learning_rate": 6.466250186922325e-05,
"loss": 12.4443,
"step": 87
},
{
"epoch": 0.0680015111446921,
"grad_norm": 0.03595505282282829,
"learning_rate": 6.227427435703997e-05,
"loss": 12.4494,
"step": 90
},
{
"epoch": 0.0702682281828485,
"grad_norm": 0.03157583624124527,
"learning_rate": 5.985585137257401e-05,
"loss": 12.446,
"step": 93
},
{
"epoch": 0.07253494522100491,
"grad_norm": 0.03606283292174339,
"learning_rate": 5.74131823855921e-05,
"loss": 12.4462,
"step": 96
},
{
"epoch": 0.07480166225916131,
"grad_norm": 0.03671179711818695,
"learning_rate": 5.495227651252315e-05,
"loss": 12.4473,
"step": 99
},
{
"epoch": 0.07706837929731772,
"grad_norm": 0.042302753776311874,
"learning_rate": 5.247918773366112e-05,
"loss": 12.4479,
"step": 102
},
{
"epoch": 0.07706837929731772,
"eval_loss": 12.446091651916504,
"eval_runtime": 61.8106,
"eval_samples_per_second": 36.078,
"eval_steps_per_second": 4.514,
"step": 102
},
{
"epoch": 0.07933509633547413,
"grad_norm": 0.04813135042786598,
"learning_rate": 5e-05,
"loss": 12.4455,
"step": 105
},
{
"epoch": 0.08160181337363052,
"grad_norm": 0.046277038753032684,
"learning_rate": 4.7520812266338885e-05,
"loss": 12.4466,
"step": 108
},
{
"epoch": 0.08386853041178693,
"grad_norm": 0.046285875141620636,
"learning_rate": 4.504772348747687e-05,
"loss": 12.4448,
"step": 111
},
{
"epoch": 0.08613524744994333,
"grad_norm": 0.051637545228004456,
"learning_rate": 4.2586817614407895e-05,
"loss": 12.4436,
"step": 114
},
{
"epoch": 0.08840196448809974,
"grad_norm": 0.04436314478516579,
"learning_rate": 4.0144148627425993e-05,
"loss": 12.4446,
"step": 117
},
{
"epoch": 0.08991310918020401,
"eval_loss": 12.444755554199219,
"eval_runtime": 61.8074,
"eval_samples_per_second": 36.08,
"eval_steps_per_second": 4.514,
"step": 119
}
],
"logging_steps": 3,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 17,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 263272267776.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}