|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.5, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.18797126412391663, |
|
"learning_rate": 9.949748743718594e-05, |
|
"loss": 0.6604, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.15986427664756775, |
|
"learning_rate": 9.84924623115578e-05, |
|
"loss": 0.6109, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.14691925048828125, |
|
"learning_rate": 9.748743718592965e-05, |
|
"loss": 0.6119, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.12482919543981552, |
|
"learning_rate": 9.64824120603015e-05, |
|
"loss": 0.6518, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.1446041613817215, |
|
"learning_rate": 9.547738693467337e-05, |
|
"loss": 0.5964, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.11769542098045349, |
|
"learning_rate": 9.447236180904523e-05, |
|
"loss": 0.5714, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.1492375284433365, |
|
"learning_rate": 9.34673366834171e-05, |
|
"loss": 0.6074, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.16033588349819183, |
|
"learning_rate": 9.246231155778895e-05, |
|
"loss": 0.5441, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.12141509354114532, |
|
"learning_rate": 9.14572864321608e-05, |
|
"loss": 0.5898, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.14080357551574707, |
|
"learning_rate": 9.045226130653267e-05, |
|
"loss": 0.5599, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.14213934540748596, |
|
"learning_rate": 8.944723618090453e-05, |
|
"loss": 0.5713, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.1625017374753952, |
|
"learning_rate": 8.84422110552764e-05, |
|
"loss": 0.543, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.16236303746700287, |
|
"learning_rate": 8.743718592964825e-05, |
|
"loss": 0.5461, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.14767853915691376, |
|
"learning_rate": 8.64321608040201e-05, |
|
"loss": 0.615, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.1415889859199524, |
|
"learning_rate": 8.542713567839196e-05, |
|
"loss": 0.5717, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.14877167344093323, |
|
"learning_rate": 8.442211055276383e-05, |
|
"loss": 0.5413, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.15324772894382477, |
|
"learning_rate": 8.341708542713568e-05, |
|
"loss": 0.5564, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.14583168923854828, |
|
"learning_rate": 8.241206030150754e-05, |
|
"loss": 0.549, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.17882034182548523, |
|
"learning_rate": 8.14070351758794e-05, |
|
"loss": 0.5495, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.13743259012699127, |
|
"learning_rate": 8.040201005025126e-05, |
|
"loss": 0.5545, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.1426534354686737, |
|
"learning_rate": 7.939698492462313e-05, |
|
"loss": 0.5137, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.16195817291736603, |
|
"learning_rate": 7.839195979899498e-05, |
|
"loss": 0.5241, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.17723768949508667, |
|
"learning_rate": 7.738693467336684e-05, |
|
"loss": 0.5201, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.1717960685491562, |
|
"learning_rate": 7.638190954773869e-05, |
|
"loss": 0.4599, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.17444512248039246, |
|
"learning_rate": 7.537688442211056e-05, |
|
"loss": 0.5151, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.19015884399414062, |
|
"learning_rate": 7.437185929648241e-05, |
|
"loss": 0.5413, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.1704825609922409, |
|
"learning_rate": 7.336683417085427e-05, |
|
"loss": 0.5352, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.18583402037620544, |
|
"learning_rate": 7.236180904522614e-05, |
|
"loss": 0.4971, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.1627408117055893, |
|
"learning_rate": 7.135678391959799e-05, |
|
"loss": 0.4678, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.167761892080307, |
|
"learning_rate": 7.035175879396985e-05, |
|
"loss": 0.5322, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.18980734050273895, |
|
"learning_rate": 6.93467336683417e-05, |
|
"loss": 0.5084, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.18042020499706268, |
|
"learning_rate": 6.834170854271357e-05, |
|
"loss": 0.5538, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.2088419497013092, |
|
"learning_rate": 6.733668341708544e-05, |
|
"loss": 0.4912, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.15689465403556824, |
|
"learning_rate": 6.633165829145729e-05, |
|
"loss": 0.5273, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.18145041167736053, |
|
"learning_rate": 6.532663316582915e-05, |
|
"loss": 0.5564, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.1750493198633194, |
|
"learning_rate": 6.4321608040201e-05, |
|
"loss": 0.4913, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.19720715284347534, |
|
"learning_rate": 6.331658291457287e-05, |
|
"loss": 0.5309, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.1915196031332016, |
|
"learning_rate": 6.231155778894473e-05, |
|
"loss": 0.5385, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.18675631284713745, |
|
"learning_rate": 6.130653266331658e-05, |
|
"loss": 0.5007, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.19217941164970398, |
|
"learning_rate": 6.030150753768844e-05, |
|
"loss": 0.4999, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.1979852020740509, |
|
"learning_rate": 5.929648241206031e-05, |
|
"loss": 0.4839, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.2420262098312378, |
|
"learning_rate": 5.829145728643216e-05, |
|
"loss": 0.4727, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.21145766973495483, |
|
"learning_rate": 5.728643216080403e-05, |
|
"loss": 0.4942, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.23028914630413055, |
|
"learning_rate": 5.628140703517588e-05, |
|
"loss": 0.4984, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.19866284728050232, |
|
"learning_rate": 5.527638190954774e-05, |
|
"loss": 0.4909, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.18788225948810577, |
|
"learning_rate": 5.4271356783919604e-05, |
|
"loss": 0.4889, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.20647984743118286, |
|
"learning_rate": 5.3266331658291455e-05, |
|
"loss": 0.4844, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.2238943725824356, |
|
"learning_rate": 5.226130653266332e-05, |
|
"loss": 0.4531, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.2138652801513672, |
|
"learning_rate": 5.125628140703518e-05, |
|
"loss": 0.4631, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.2267480194568634, |
|
"learning_rate": 5.0251256281407036e-05, |
|
"loss": 0.4617, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.069113651395789e+17, |
|
"train_batch_size": 10, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|