|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.16953941791466515, |
|
"eval_steps": 13, |
|
"global_step": 150, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0011302627860977678, |
|
"eval_loss": 0.7253943681716919, |
|
"eval_runtime": 222.5674, |
|
"eval_samples_per_second": 6.695, |
|
"eval_steps_per_second": 0.84, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.003390788358293303, |
|
"grad_norm": 6.4600725173950195, |
|
"learning_rate": 1.5e-05, |
|
"loss": 2.9057, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.006781576716586606, |
|
"grad_norm": 4.287564277648926, |
|
"learning_rate": 3e-05, |
|
"loss": 2.7228, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.010172365074879909, |
|
"grad_norm": 4.434852600097656, |
|
"learning_rate": 4.5e-05, |
|
"loss": 2.2412, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.013563153433173212, |
|
"grad_norm": 4.096804618835449, |
|
"learning_rate": 4.997482666353287e-05, |
|
"loss": 1.6976, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.01469341621927098, |
|
"eval_loss": 0.281572163105011, |
|
"eval_runtime": 224.6256, |
|
"eval_samples_per_second": 6.633, |
|
"eval_steps_per_second": 0.832, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.016953941791466517, |
|
"grad_norm": 3.934027910232544, |
|
"learning_rate": 4.984280524733107e-05, |
|
"loss": 1.2107, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.020344730149759818, |
|
"grad_norm": 2.3636763095855713, |
|
"learning_rate": 4.959823971496574e-05, |
|
"loss": 0.8397, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.02373551850805312, |
|
"grad_norm": 1.3671884536743164, |
|
"learning_rate": 4.9242238009417175e-05, |
|
"loss": 0.7578, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.027126306866346424, |
|
"grad_norm": 1.3218061923980713, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 0.6645, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.02938683243854196, |
|
"eval_loss": 0.16004851460456848, |
|
"eval_runtime": 224.8305, |
|
"eval_samples_per_second": 6.627, |
|
"eval_steps_per_second": 0.832, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.030517095224639728, |
|
"grad_norm": 1.757283091545105, |
|
"learning_rate": 4.820287471297598e-05, |
|
"loss": 0.6531, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.033907883582933035, |
|
"grad_norm": 1.2630192041397095, |
|
"learning_rate": 4.752422169756048e-05, |
|
"loss": 0.616, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03729867194122634, |
|
"grad_norm": 1.2325754165649414, |
|
"learning_rate": 4.674352832889239e-05, |
|
"loss": 0.6141, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.040689460299519635, |
|
"grad_norm": 1.5199618339538574, |
|
"learning_rate": 4.586433134303257e-05, |
|
"loss": 0.6026, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04408024865781294, |
|
"grad_norm": 1.2060375213623047, |
|
"learning_rate": 4.489061372204453e-05, |
|
"loss": 0.6114, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.04408024865781294, |
|
"eval_loss": 0.14285726845264435, |
|
"eval_runtime": 224.8051, |
|
"eval_samples_per_second": 6.628, |
|
"eval_steps_per_second": 0.832, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.04747103701610624, |
|
"grad_norm": 1.2959660291671753, |
|
"learning_rate": 4.382678665009028e-05, |
|
"loss": 0.576, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.050861825374399545, |
|
"grad_norm": 1.1517972946166992, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 0.5597, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05425261373269285, |
|
"grad_norm": 1.2231593132019043, |
|
"learning_rate": 4.144846814849282e-05, |
|
"loss": 0.5803, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.05764340209098615, |
|
"grad_norm": 1.1372201442718506, |
|
"learning_rate": 4.01447510960205e-05, |
|
"loss": 0.5299, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.05877366487708392, |
|
"eval_loss": 0.13713335990905762, |
|
"eval_runtime": 224.8683, |
|
"eval_samples_per_second": 6.626, |
|
"eval_steps_per_second": 0.832, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.061034190449279456, |
|
"grad_norm": 1.0566143989562988, |
|
"learning_rate": 3.8772424536302564e-05, |
|
"loss": 0.555, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.06442497880757277, |
|
"grad_norm": 1.0105335712432861, |
|
"learning_rate": 3.7337705451608674e-05, |
|
"loss": 0.5633, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.06781576716586607, |
|
"grad_norm": 1.0776220560073853, |
|
"learning_rate": 3.5847093477938956e-05, |
|
"loss": 0.5251, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07120655552415937, |
|
"grad_norm": 1.2005013227462769, |
|
"learning_rate": 3.4307341460048633e-05, |
|
"loss": 0.5479, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.0734670810963549, |
|
"eval_loss": 0.13397091627120972, |
|
"eval_runtime": 224.8, |
|
"eval_samples_per_second": 6.628, |
|
"eval_steps_per_second": 0.832, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07459734388245268, |
|
"grad_norm": 0.9956369996070862, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 0.5392, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.07798813224074597, |
|
"grad_norm": 1.1790428161621094, |
|
"learning_rate": 3.110851015344735e-05, |
|
"loss": 0.5354, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.08137892059903927, |
|
"grad_norm": 1.041911005973816, |
|
"learning_rate": 2.9463922369965917e-05, |
|
"loss": 0.5366, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.08476970895733257, |
|
"grad_norm": 1.101559042930603, |
|
"learning_rate": 2.7799111902582696e-05, |
|
"loss": 0.5158, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08816049731562588, |
|
"grad_norm": 0.9680448174476624, |
|
"learning_rate": 2.6121620758762877e-05, |
|
"loss": 0.4675, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.08816049731562588, |
|
"eval_loss": 0.13185884058475494, |
|
"eval_runtime": 224.7208, |
|
"eval_samples_per_second": 6.63, |
|
"eval_steps_per_second": 0.832, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.09155128567391918, |
|
"grad_norm": 1.1247650384902954, |
|
"learning_rate": 2.443904839260488e-05, |
|
"loss": 0.5192, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.09494207403221248, |
|
"grad_norm": 1.0689295530319214, |
|
"learning_rate": 2.2759017277414166e-05, |
|
"loss": 0.5174, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.09833286239050579, |
|
"grad_norm": 1.2236093282699585, |
|
"learning_rate": 2.1089138373994223e-05, |
|
"loss": 0.5135, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.10172365074879909, |
|
"grad_norm": 1.1371599435806274, |
|
"learning_rate": 1.9436976651092144e-05, |
|
"loss": 0.5161, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.10285391353489687, |
|
"eval_loss": 0.13008707761764526, |
|
"eval_runtime": 224.8789, |
|
"eval_samples_per_second": 6.626, |
|
"eval_steps_per_second": 0.832, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.1051144391070924, |
|
"grad_norm": 0.9614408612251282, |
|
"learning_rate": 1.781001681419957e-05, |
|
"loss": 0.5149, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.1085052274653857, |
|
"grad_norm": 0.8851366639137268, |
|
"learning_rate": 1.621562939796643e-05, |
|
"loss": 0.5307, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.111896015823679, |
|
"grad_norm": 1.0845979452133179, |
|
"learning_rate": 1.466103737583699e-05, |
|
"loss": 0.5183, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.1152868041819723, |
|
"grad_norm": 1.0732598304748535, |
|
"learning_rate": 1.3153283438175034e-05, |
|
"loss": 0.5277, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.11754732975416785, |
|
"eval_loss": 0.12890119850635529, |
|
"eval_runtime": 225.0833, |
|
"eval_samples_per_second": 6.62, |
|
"eval_steps_per_second": 0.831, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.11867759254026561, |
|
"grad_norm": 1.0611317157745361, |
|
"learning_rate": 1.1699198087116589e-05, |
|
"loss": 0.5579, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.12206838089855891, |
|
"grad_norm": 1.7777539491653442, |
|
"learning_rate": 1.0305368692688174e-05, |
|
"loss": 0.5391, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.12545916925685222, |
|
"grad_norm": 1.0344315767288208, |
|
"learning_rate": 8.978109650374397e-06, |
|
"loss": 0.4931, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.12884995761514553, |
|
"grad_norm": 1.0285758972167969, |
|
"learning_rate": 7.723433775328384e-06, |
|
"loss": 0.5289, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.13224074597343882, |
|
"grad_norm": 0.9961230158805847, |
|
"learning_rate": 6.547025062816486e-06, |
|
"loss": 0.5086, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.13224074597343882, |
|
"eval_loss": 0.12818647921085358, |
|
"eval_runtime": 224.9913, |
|
"eval_samples_per_second": 6.622, |
|
"eval_steps_per_second": 0.831, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.13563153433173214, |
|
"grad_norm": 1.7210478782653809, |
|
"learning_rate": 5.454212938299255e-06, |
|
"loss": 0.5592, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13902232269002543, |
|
"grad_norm": 1.005109429359436, |
|
"learning_rate": 4.4499481138022544e-06, |
|
"loss": 0.4997, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.14241311104831875, |
|
"grad_norm": 1.0120928287506104, |
|
"learning_rate": 3.5387801599533475e-06, |
|
"loss": 0.543, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.14580389940661204, |
|
"grad_norm": 0.9547510743141174, |
|
"learning_rate": 2.7248368952908053e-06, |
|
"loss": 0.5363, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.1469341621927098, |
|
"eval_loss": 0.12771442532539368, |
|
"eval_runtime": 224.7527, |
|
"eval_samples_per_second": 6.63, |
|
"eval_steps_per_second": 0.832, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14919468776490535, |
|
"grad_norm": 1.0372916460037231, |
|
"learning_rate": 2.0118056862137357e-06, |
|
"loss": 0.4873, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.15258547612319864, |
|
"grad_norm": 1.11273992061615, |
|
"learning_rate": 1.4029167422908107e-06, |
|
"loss": 0.5262, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.15597626448149193, |
|
"grad_norm": 0.9660788774490356, |
|
"learning_rate": 9.009284826036691e-07, |
|
"loss": 0.5585, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.15936705283978525, |
|
"grad_norm": 1.1789458990097046, |
|
"learning_rate": 5.08115039419113e-07, |
|
"loss": 0.4771, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.16162757841198078, |
|
"eval_loss": 0.127611443400383, |
|
"eval_runtime": 225.1337, |
|
"eval_samples_per_second": 6.618, |
|
"eval_steps_per_second": 0.831, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.16275784119807854, |
|
"grad_norm": 0.9032691717147827, |
|
"learning_rate": 2.262559558016325e-07, |
|
"loss": 0.4993, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.16614862955637186, |
|
"grad_norm": 1.033461332321167, |
|
"learning_rate": 5.662812383859795e-08, |
|
"loss": 0.4605, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.16953941791466515, |
|
"grad_norm": 0.9426572322845459, |
|
"learning_rate": 0.0, |
|
"loss": 0.4887, |
|
"step": 150 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 150, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 13, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.218781823926272e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|