|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.046667185190946564, |
|
"eval_steps": 13, |
|
"global_step": 150, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00031111456793964375, |
|
"eval_loss": 3.795616626739502, |
|
"eval_runtime": 387.5587, |
|
"eval_samples_per_second": 13.969, |
|
"eval_steps_per_second": 1.747, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0009333437038189313, |
|
"grad_norm": 1.8201688528060913, |
|
"learning_rate": 1.5e-05, |
|
"loss": 3.6704, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0018666874076378626, |
|
"grad_norm": 1.9469057321548462, |
|
"learning_rate": 3e-05, |
|
"loss": 3.829, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.002800031111456794, |
|
"grad_norm": 2.4936416149139404, |
|
"learning_rate": 4.5e-05, |
|
"loss": 3.6616, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.003733374815275725, |
|
"grad_norm": 3.112192392349243, |
|
"learning_rate": 4.997482666353287e-05, |
|
"loss": 3.1786, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.004044489383215369, |
|
"eval_loss": 2.5751025676727295, |
|
"eval_runtime": 389.7804, |
|
"eval_samples_per_second": 13.89, |
|
"eval_steps_per_second": 1.737, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.004666718519094656, |
|
"grad_norm": 2.7909445762634277, |
|
"learning_rate": 4.984280524733107e-05, |
|
"loss": 2.6525, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.005600062222913588, |
|
"grad_norm": 2.6461470127105713, |
|
"learning_rate": 4.959823971496574e-05, |
|
"loss": 2.2006, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.00653340592673252, |
|
"grad_norm": 2.1072444915771484, |
|
"learning_rate": 4.9242238009417175e-05, |
|
"loss": 1.9303, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.00746674963055145, |
|
"grad_norm": 1.8727749586105347, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 1.8151, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.008088978766430738, |
|
"eval_loss": 1.7247991561889648, |
|
"eval_runtime": 389.9247, |
|
"eval_samples_per_second": 13.885, |
|
"eval_steps_per_second": 1.736, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.008400093334370381, |
|
"grad_norm": 1.5817272663116455, |
|
"learning_rate": 4.820287471297598e-05, |
|
"loss": 1.6959, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.009333437038189313, |
|
"grad_norm": 1.7775505781173706, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 1.7599, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.010266780742008244, |
|
"grad_norm": 1.512369155883789, |
|
"learning_rate": 4.674352832889239e-05, |
|
"loss": 1.6668, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.011200124445827176, |
|
"grad_norm": 1.427876353263855, |
|
"learning_rate": 4.586433134303257e-05, |
|
"loss": 1.6359, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.012133468149646108, |
|
"grad_norm": 1.557162880897522, |
|
"learning_rate": 4.489061372204453e-05, |
|
"loss": 1.5442, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.012133468149646108, |
|
"eval_loss": 1.5903956890106201, |
|
"eval_runtime": 389.6183, |
|
"eval_samples_per_second": 13.896, |
|
"eval_steps_per_second": 1.738, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.01306681185346504, |
|
"grad_norm": 1.3834956884384155, |
|
"learning_rate": 4.382678665009028e-05, |
|
"loss": 1.5914, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.01400015555728397, |
|
"grad_norm": 1.2795754671096802, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 1.512, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0149334992611029, |
|
"grad_norm": 1.383254885673523, |
|
"learning_rate": 4.144846814849282e-05, |
|
"loss": 1.5425, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.015866842964921832, |
|
"grad_norm": 1.4243319034576416, |
|
"learning_rate": 4.01447510960205e-05, |
|
"loss": 1.5153, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.016177957532861477, |
|
"eval_loss": 1.5446308851242065, |
|
"eval_runtime": 390.0683, |
|
"eval_samples_per_second": 13.88, |
|
"eval_steps_per_second": 1.736, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.016800186668740762, |
|
"grad_norm": 1.3810615539550781, |
|
"learning_rate": 3.8772424536302564e-05, |
|
"loss": 1.5528, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.017733530372559696, |
|
"grad_norm": 1.4023598432540894, |
|
"learning_rate": 3.7337705451608674e-05, |
|
"loss": 1.5652, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.018666874076378626, |
|
"grad_norm": 1.2947098016738892, |
|
"learning_rate": 3.5847093477938956e-05, |
|
"loss": 1.5906, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01960021778019756, |
|
"grad_norm": 1.40787935256958, |
|
"learning_rate": 3.4307341460048633e-05, |
|
"loss": 1.4431, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.020222446916076844, |
|
"eval_loss": 1.5237524509429932, |
|
"eval_runtime": 389.7942, |
|
"eval_samples_per_second": 13.889, |
|
"eval_steps_per_second": 1.737, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.02053356148401649, |
|
"grad_norm": 1.2801828384399414, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 1.5227, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.02146690518783542, |
|
"grad_norm": 1.3553149700164795, |
|
"learning_rate": 3.110851015344735e-05, |
|
"loss": 1.5033, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.022400248891654352, |
|
"grad_norm": 1.339362621307373, |
|
"learning_rate": 2.9463922369965917e-05, |
|
"loss": 1.517, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.023333592595473282, |
|
"grad_norm": 1.307166337966919, |
|
"learning_rate": 2.7799111902582696e-05, |
|
"loss": 1.4759, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.024266936299292215, |
|
"grad_norm": 1.2301127910614014, |
|
"learning_rate": 2.6121620758762877e-05, |
|
"loss": 1.4794, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.024266936299292215, |
|
"eval_loss": 1.506866216659546, |
|
"eval_runtime": 389.7553, |
|
"eval_samples_per_second": 13.891, |
|
"eval_steps_per_second": 1.737, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.025200280003111145, |
|
"grad_norm": 1.2243834733963013, |
|
"learning_rate": 2.443904839260488e-05, |
|
"loss": 1.4497, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.02613362370693008, |
|
"grad_norm": 1.1867769956588745, |
|
"learning_rate": 2.2759017277414166e-05, |
|
"loss": 1.4718, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.02706696741074901, |
|
"grad_norm": 1.1967134475708008, |
|
"learning_rate": 2.1089138373994223e-05, |
|
"loss": 1.5324, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.02800031111456794, |
|
"grad_norm": 1.2109935283660889, |
|
"learning_rate": 1.9436976651092144e-05, |
|
"loss": 1.5148, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.028311425682507583, |
|
"eval_loss": 1.4927889108657837, |
|
"eval_runtime": 389.8573, |
|
"eval_samples_per_second": 13.887, |
|
"eval_steps_per_second": 1.737, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.028933654818386872, |
|
"grad_norm": 1.2042676210403442, |
|
"learning_rate": 1.781001681419957e-05, |
|
"loss": 1.5146, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.0298669985222058, |
|
"grad_norm": 1.2717711925506592, |
|
"learning_rate": 1.621562939796643e-05, |
|
"loss": 1.4681, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.030800342226024735, |
|
"grad_norm": 1.1750036478042603, |
|
"learning_rate": 1.466103737583699e-05, |
|
"loss": 1.5074, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.031733685929843665, |
|
"grad_norm": 1.3590319156646729, |
|
"learning_rate": 1.3153283438175034e-05, |
|
"loss": 1.5168, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.032355915065722954, |
|
"eval_loss": 1.4859765768051147, |
|
"eval_runtime": 389.9351, |
|
"eval_samples_per_second": 13.884, |
|
"eval_steps_per_second": 1.736, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.032667029633662595, |
|
"grad_norm": 1.15152907371521, |
|
"learning_rate": 1.1699198087116589e-05, |
|
"loss": 1.4838, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.033600373337481525, |
|
"grad_norm": 1.2240478992462158, |
|
"learning_rate": 1.0305368692688174e-05, |
|
"loss": 1.5484, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.03453371704130046, |
|
"grad_norm": 1.2042181491851807, |
|
"learning_rate": 8.978109650374397e-06, |
|
"loss": 1.5173, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.03546706074511939, |
|
"grad_norm": 1.205437183380127, |
|
"learning_rate": 7.723433775328384e-06, |
|
"loss": 1.472, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.03640040444893832, |
|
"grad_norm": 1.2650120258331299, |
|
"learning_rate": 6.547025062816486e-06, |
|
"loss": 1.5346, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.03640040444893832, |
|
"eval_loss": 1.4806684255599976, |
|
"eval_runtime": 390.1832, |
|
"eval_samples_per_second": 13.876, |
|
"eval_steps_per_second": 1.735, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.03733374815275725, |
|
"grad_norm": 1.1355212926864624, |
|
"learning_rate": 5.454212938299255e-06, |
|
"loss": 1.4929, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03826709185657618, |
|
"grad_norm": 1.2554372549057007, |
|
"learning_rate": 4.4499481138022544e-06, |
|
"loss": 1.4357, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.03920043556039512, |
|
"grad_norm": 1.289537787437439, |
|
"learning_rate": 3.5387801599533475e-06, |
|
"loss": 1.4257, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.04013377926421405, |
|
"grad_norm": 1.439562201499939, |
|
"learning_rate": 2.7248368952908053e-06, |
|
"loss": 1.4607, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.04044489383215369, |
|
"eval_loss": 1.4785248041152954, |
|
"eval_runtime": 390.0687, |
|
"eval_samples_per_second": 13.88, |
|
"eval_steps_per_second": 1.736, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.04106712296803298, |
|
"grad_norm": 1.2261285781860352, |
|
"learning_rate": 2.0118056862137357e-06, |
|
"loss": 1.5202, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.04200046667185191, |
|
"grad_norm": 1.3027156591415405, |
|
"learning_rate": 1.4029167422908107e-06, |
|
"loss": 1.5151, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.04293381037567084, |
|
"grad_norm": 1.1372510194778442, |
|
"learning_rate": 9.009284826036691e-07, |
|
"loss": 1.5352, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.043867154079489774, |
|
"grad_norm": 1.248124122619629, |
|
"learning_rate": 5.08115039419113e-07, |
|
"loss": 1.5216, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.044489383215369056, |
|
"eval_loss": 1.4776010513305664, |
|
"eval_runtime": 389.8727, |
|
"eval_samples_per_second": 13.887, |
|
"eval_steps_per_second": 1.736, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.044800497783308704, |
|
"grad_norm": 1.2946685552597046, |
|
"learning_rate": 2.262559558016325e-07, |
|
"loss": 1.5154, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.045733841487127634, |
|
"grad_norm": 1.1807682514190674, |
|
"learning_rate": 5.662812383859795e-08, |
|
"loss": 1.4998, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.046667185190946564, |
|
"grad_norm": 1.2081470489501953, |
|
"learning_rate": 0.0, |
|
"loss": 1.5239, |
|
"step": 150 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 150, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 13, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.225661850681344e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|