{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9956108266276518, "eval_steps": 500, "global_step": 1023, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029261155815654718, "grad_norm": 2.214355966256512, "learning_rate": 5e-06, "loss": 0.7976, "step": 10 }, { "epoch": 0.058522311631309436, "grad_norm": 1.281388597648664, "learning_rate": 5e-06, "loss": 0.7383, "step": 20 }, { "epoch": 0.08778346744696415, "grad_norm": 1.2874847374087468, "learning_rate": 5e-06, "loss": 0.7068, "step": 30 }, { "epoch": 0.11704462326261887, "grad_norm": 1.0911342229904486, "learning_rate": 5e-06, "loss": 0.6963, "step": 40 }, { "epoch": 0.14630577907827358, "grad_norm": 1.1433555201488177, "learning_rate": 5e-06, "loss": 0.6869, "step": 50 }, { "epoch": 0.1755669348939283, "grad_norm": 0.9269750204811369, "learning_rate": 5e-06, "loss": 0.6765, "step": 60 }, { "epoch": 0.20482809070958302, "grad_norm": 0.5953508309644502, "learning_rate": 5e-06, "loss": 0.67, "step": 70 }, { "epoch": 0.23408924652523774, "grad_norm": 0.41718068806926895, "learning_rate": 5e-06, "loss": 0.6648, "step": 80 }, { "epoch": 0.26335040234089246, "grad_norm": 0.46544885140833864, "learning_rate": 5e-06, "loss": 0.662, "step": 90 }, { "epoch": 0.29261155815654716, "grad_norm": 0.4427594704326977, "learning_rate": 5e-06, "loss": 0.6545, "step": 100 }, { "epoch": 0.3218727139722019, "grad_norm": 0.4255743712663812, "learning_rate": 5e-06, "loss": 0.6633, "step": 110 }, { "epoch": 0.3511338697878566, "grad_norm": 0.37382775787193123, "learning_rate": 5e-06, "loss": 0.6682, "step": 120 }, { "epoch": 0.38039502560351135, "grad_norm": 0.38055941012143985, "learning_rate": 5e-06, "loss": 0.652, "step": 130 }, { "epoch": 0.40965618141916604, "grad_norm": 0.365314239083999, "learning_rate": 5e-06, "loss": 0.6527, "step": 140 }, { "epoch": 0.4389173372348208, "grad_norm": 0.34589597918392456, "learning_rate": 5e-06, "loss": 0.6481, "step": 150 }, { "epoch": 0.4681784930504755, "grad_norm": 0.3891716866109797, "learning_rate": 5e-06, "loss": 0.65, "step": 160 }, { "epoch": 0.49743964886613024, "grad_norm": 0.3629334963604037, "learning_rate": 5e-06, "loss": 0.655, "step": 170 }, { "epoch": 0.5267008046817849, "grad_norm": 0.33370615827223327, "learning_rate": 5e-06, "loss": 0.6546, "step": 180 }, { "epoch": 0.5559619604974396, "grad_norm": 0.36451231535366374, "learning_rate": 5e-06, "loss": 0.6432, "step": 190 }, { "epoch": 0.5852231163130943, "grad_norm": 0.3433862105922731, "learning_rate": 5e-06, "loss": 0.6444, "step": 200 }, { "epoch": 0.6144842721287491, "grad_norm": 0.3402627825523363, "learning_rate": 5e-06, "loss": 0.6509, "step": 210 }, { "epoch": 0.6437454279444038, "grad_norm": 0.34977805775534565, "learning_rate": 5e-06, "loss": 0.6389, "step": 220 }, { "epoch": 0.6730065837600585, "grad_norm": 0.36328360263674203, "learning_rate": 5e-06, "loss": 0.6389, "step": 230 }, { "epoch": 0.7022677395757132, "grad_norm": 0.3269783340514597, "learning_rate": 5e-06, "loss": 0.6457, "step": 240 }, { "epoch": 0.731528895391368, "grad_norm": 0.34709148478302637, "learning_rate": 5e-06, "loss": 0.6441, "step": 250 }, { "epoch": 0.7607900512070227, "grad_norm": 0.3386809515446319, "learning_rate": 5e-06, "loss": 0.6467, "step": 260 }, { "epoch": 0.7900512070226774, "grad_norm": 0.3708217576585751, "learning_rate": 5e-06, "loss": 0.649, "step": 270 }, { "epoch": 0.8193123628383321, "grad_norm": 0.3476600949918196, "learning_rate": 5e-06, "loss": 0.6478, "step": 280 }, { "epoch": 0.8485735186539868, "grad_norm": 0.3461817230116753, "learning_rate": 5e-06, "loss": 0.64, "step": 290 }, { "epoch": 0.8778346744696416, "grad_norm": 0.36176209446732716, "learning_rate": 5e-06, "loss": 0.632, "step": 300 }, { "epoch": 0.9070958302852963, "grad_norm": 0.35122909092020915, "learning_rate": 5e-06, "loss": 0.645, "step": 310 }, { "epoch": 0.936356986100951, "grad_norm": 0.3350984536401145, "learning_rate": 5e-06, "loss": 0.637, "step": 320 }, { "epoch": 0.9656181419166057, "grad_norm": 0.37466191333850035, "learning_rate": 5e-06, "loss": 0.6402, "step": 330 }, { "epoch": 0.9948792977322605, "grad_norm": 0.350268525150869, "learning_rate": 5e-06, "loss": 0.6362, "step": 340 }, { "epoch": 0.9978054133138259, "eval_loss": 0.6407278180122375, "eval_runtime": 345.7581, "eval_samples_per_second": 26.634, "eval_steps_per_second": 0.416, "step": 341 }, { "epoch": 1.025237746891002, "grad_norm": 0.3726855231058378, "learning_rate": 5e-06, "loss": 0.6515, "step": 350 }, { "epoch": 1.054498902706657, "grad_norm": 0.3528475470359308, "learning_rate": 5e-06, "loss": 0.6057, "step": 360 }, { "epoch": 1.0837600585223117, "grad_norm": 0.40430278120075436, "learning_rate": 5e-06, "loss": 0.6091, "step": 370 }, { "epoch": 1.1130212143379663, "grad_norm": 0.33594394270737427, "learning_rate": 5e-06, "loss": 0.6065, "step": 380 }, { "epoch": 1.142282370153621, "grad_norm": 0.3984462164005305, "learning_rate": 5e-06, "loss": 0.6118, "step": 390 }, { "epoch": 1.1715435259692757, "grad_norm": 0.3613121293262451, "learning_rate": 5e-06, "loss": 0.6133, "step": 400 }, { "epoch": 1.2008046817849305, "grad_norm": 0.35518722154006915, "learning_rate": 5e-06, "loss": 0.6129, "step": 410 }, { "epoch": 1.2300658376005853, "grad_norm": 0.40502626596849745, "learning_rate": 5e-06, "loss": 0.6011, "step": 420 }, { "epoch": 1.2593269934162399, "grad_norm": 0.3612357129760361, "learning_rate": 5e-06, "loss": 0.6113, "step": 430 }, { "epoch": 1.2885881492318947, "grad_norm": 0.3444217486315101, "learning_rate": 5e-06, "loss": 0.6038, "step": 440 }, { "epoch": 1.3178493050475493, "grad_norm": 0.36580965939804105, "learning_rate": 5e-06, "loss": 0.6076, "step": 450 }, { "epoch": 1.347110460863204, "grad_norm": 0.37654968983720283, "learning_rate": 5e-06, "loss": 0.6053, "step": 460 }, { "epoch": 1.3763716166788589, "grad_norm": 0.3223916565231839, "learning_rate": 5e-06, "loss": 0.6026, "step": 470 }, { "epoch": 1.4056327724945135, "grad_norm": 0.3686887217181575, "learning_rate": 5e-06, "loss": 0.6035, "step": 480 }, { "epoch": 1.4348939283101683, "grad_norm": 0.34546616075721737, "learning_rate": 5e-06, "loss": 0.6098, "step": 490 }, { "epoch": 1.464155084125823, "grad_norm": 0.34062759074897847, "learning_rate": 5e-06, "loss": 0.5982, "step": 500 }, { "epoch": 1.4934162399414777, "grad_norm": 0.34984758972682745, "learning_rate": 5e-06, "loss": 0.6062, "step": 510 }, { "epoch": 1.5226773957571325, "grad_norm": 0.38653752701593264, "learning_rate": 5e-06, "loss": 0.6085, "step": 520 }, { "epoch": 1.5519385515727873, "grad_norm": 0.34980776029961624, "learning_rate": 5e-06, "loss": 0.6071, "step": 530 }, { "epoch": 1.5811997073884418, "grad_norm": 0.36760348585346686, "learning_rate": 5e-06, "loss": 0.6137, "step": 540 }, { "epoch": 1.6104608632040964, "grad_norm": 0.3547689325544471, "learning_rate": 5e-06, "loss": 0.6048, "step": 550 }, { "epoch": 1.6397220190197512, "grad_norm": 0.3543839427681781, "learning_rate": 5e-06, "loss": 0.6065, "step": 560 }, { "epoch": 1.668983174835406, "grad_norm": 0.32142999725489213, "learning_rate": 5e-06, "loss": 0.5971, "step": 570 }, { "epoch": 1.6982443306510606, "grad_norm": 0.3492941780396127, "learning_rate": 5e-06, "loss": 0.6064, "step": 580 }, { "epoch": 1.7275054864667154, "grad_norm": 0.4044986438848018, "learning_rate": 5e-06, "loss": 0.6066, "step": 590 }, { "epoch": 1.7567666422823702, "grad_norm": 0.33644823828470133, "learning_rate": 5e-06, "loss": 0.6035, "step": 600 }, { "epoch": 1.7860277980980248, "grad_norm": 0.3426307215328823, "learning_rate": 5e-06, "loss": 0.606, "step": 610 }, { "epoch": 1.8152889539136796, "grad_norm": 0.3228193342410931, "learning_rate": 5e-06, "loss": 0.6064, "step": 620 }, { "epoch": 1.8445501097293344, "grad_norm": 0.35917298085170585, "learning_rate": 5e-06, "loss": 0.6055, "step": 630 }, { "epoch": 1.873811265544989, "grad_norm": 0.3266610800239347, "learning_rate": 5e-06, "loss": 0.6003, "step": 640 }, { "epoch": 1.9030724213606436, "grad_norm": 0.3663755934667714, "learning_rate": 5e-06, "loss": 0.606, "step": 650 }, { "epoch": 1.9323335771762986, "grad_norm": 0.3377853190846914, "learning_rate": 5e-06, "loss": 0.6065, "step": 660 }, { "epoch": 1.9615947329919532, "grad_norm": 0.3309604588866561, "learning_rate": 5e-06, "loss": 0.5954, "step": 670 }, { "epoch": 1.9908558888076078, "grad_norm": 0.3506238935132811, "learning_rate": 5e-06, "loss": 0.6098, "step": 680 }, { "epoch": 1.9967081199707388, "eval_loss": 0.6332589387893677, "eval_runtime": 344.3907, "eval_samples_per_second": 26.74, "eval_steps_per_second": 0.418, "step": 682 }, { "epoch": 2.0212143379663496, "grad_norm": 0.38883682801871317, "learning_rate": 5e-06, "loss": 0.6199, "step": 690 }, { "epoch": 2.050475493782004, "grad_norm": 0.4419278583439316, "learning_rate": 5e-06, "loss": 0.5676, "step": 700 }, { "epoch": 2.0797366495976592, "grad_norm": 0.35170054794891586, "learning_rate": 5e-06, "loss": 0.5696, "step": 710 }, { "epoch": 2.108997805413314, "grad_norm": 0.3754626962728891, "learning_rate": 5e-06, "loss": 0.5706, "step": 720 }, { "epoch": 2.1382589612289684, "grad_norm": 0.34912008172674025, "learning_rate": 5e-06, "loss": 0.5725, "step": 730 }, { "epoch": 2.1675201170446234, "grad_norm": 0.3477889316265806, "learning_rate": 5e-06, "loss": 0.5777, "step": 740 }, { "epoch": 2.196781272860278, "grad_norm": 0.3159196077934605, "learning_rate": 5e-06, "loss": 0.5765, "step": 750 }, { "epoch": 2.2260424286759326, "grad_norm": 0.36710603345755066, "learning_rate": 5e-06, "loss": 0.5739, "step": 760 }, { "epoch": 2.255303584491587, "grad_norm": 0.3287401830878657, "learning_rate": 5e-06, "loss": 0.5727, "step": 770 }, { "epoch": 2.284564740307242, "grad_norm": 0.3338056309536922, "learning_rate": 5e-06, "loss": 0.5718, "step": 780 }, { "epoch": 2.313825896122897, "grad_norm": 0.3284098603353004, "learning_rate": 5e-06, "loss": 0.5701, "step": 790 }, { "epoch": 2.3430870519385514, "grad_norm": 0.3604858195175541, "learning_rate": 5e-06, "loss": 0.5779, "step": 800 }, { "epoch": 2.3723482077542064, "grad_norm": 0.3372639613027786, "learning_rate": 5e-06, "loss": 0.573, "step": 810 }, { "epoch": 2.401609363569861, "grad_norm": 0.33077310241977137, "learning_rate": 5e-06, "loss": 0.5793, "step": 820 }, { "epoch": 2.4308705193855156, "grad_norm": 0.317174042992232, "learning_rate": 5e-06, "loss": 0.5716, "step": 830 }, { "epoch": 2.4601316752011706, "grad_norm": 0.35720787387470887, "learning_rate": 5e-06, "loss": 0.572, "step": 840 }, { "epoch": 2.489392831016825, "grad_norm": 0.34716951034356963, "learning_rate": 5e-06, "loss": 0.5743, "step": 850 }, { "epoch": 2.5186539868324798, "grad_norm": 0.3548928579154118, "learning_rate": 5e-06, "loss": 0.5673, "step": 860 }, { "epoch": 2.547915142648135, "grad_norm": 0.39809013054159753, "learning_rate": 5e-06, "loss": 0.5698, "step": 870 }, { "epoch": 2.5771762984637894, "grad_norm": 0.34975749471054834, "learning_rate": 5e-06, "loss": 0.574, "step": 880 }, { "epoch": 2.606437454279444, "grad_norm": 0.3597960490980802, "learning_rate": 5e-06, "loss": 0.5733, "step": 890 }, { "epoch": 2.6356986100950985, "grad_norm": 0.3387827525529806, "learning_rate": 5e-06, "loss": 0.5779, "step": 900 }, { "epoch": 2.6649597659107536, "grad_norm": 0.3595617862544763, "learning_rate": 5e-06, "loss": 0.578, "step": 910 }, { "epoch": 2.694220921726408, "grad_norm": 0.35719839432443745, "learning_rate": 5e-06, "loss": 0.5724, "step": 920 }, { "epoch": 2.723482077542063, "grad_norm": 0.352313951049118, "learning_rate": 5e-06, "loss": 0.576, "step": 930 }, { "epoch": 2.7527432333577178, "grad_norm": 0.34939286466829683, "learning_rate": 5e-06, "loss": 0.575, "step": 940 }, { "epoch": 2.7820043891733723, "grad_norm": 0.34251101062213785, "learning_rate": 5e-06, "loss": 0.5737, "step": 950 }, { "epoch": 2.811265544989027, "grad_norm": 0.3751942224920435, "learning_rate": 5e-06, "loss": 0.5776, "step": 960 }, { "epoch": 2.840526700804682, "grad_norm": 0.34536278066429604, "learning_rate": 5e-06, "loss": 0.5733, "step": 970 }, { "epoch": 2.8697878566203365, "grad_norm": 0.3744437847623413, "learning_rate": 5e-06, "loss": 0.5731, "step": 980 }, { "epoch": 2.899049012435991, "grad_norm": 0.32468861500765744, "learning_rate": 5e-06, "loss": 0.5661, "step": 990 }, { "epoch": 2.928310168251646, "grad_norm": 0.32492564537112933, "learning_rate": 5e-06, "loss": 0.5739, "step": 1000 }, { "epoch": 2.9575713240673007, "grad_norm": 0.3516943430167048, "learning_rate": 5e-06, "loss": 0.5656, "step": 1010 }, { "epoch": 2.9868324798829553, "grad_norm": 0.3370882708288318, "learning_rate": 5e-06, "loss": 0.5773, "step": 1020 }, { "epoch": 2.9956108266276518, "eval_loss": 0.6331183314323425, "eval_runtime": 344.6885, "eval_samples_per_second": 26.717, "eval_steps_per_second": 0.418, "step": 1023 }, { "epoch": 2.9956108266276518, "step": 1023, "total_flos": 2144987064041472.0, "train_loss": 0.614284630744688, "train_runtime": 55138.6251, "train_samples_per_second": 9.519, "train_steps_per_second": 0.019 } ], "logging_steps": 10, "max_steps": 1023, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2144987064041472.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }