{ "best_metric": 0.646575927734375, "best_model_checkpoint": "miner_id_24/checkpoint-1840", "epoch": 2.798001427551749, "eval_steps": 40, "global_step": 1960, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014275517487508922, "eval_loss": 2.6241602897644043, "eval_runtime": 3.454, "eval_samples_per_second": 33.295, "eval_steps_per_second": 1.158, "step": 1 }, { "epoch": 0.05710206995003569, "eval_loss": 2.2859995365142822, "eval_runtime": 3.4369, "eval_samples_per_second": 33.461, "eval_steps_per_second": 1.164, "step": 40 }, { "epoch": 0.11420413990007137, "eval_loss": 1.9066081047058105, "eval_runtime": 3.569, "eval_samples_per_second": 32.222, "eval_steps_per_second": 1.121, "step": 80 }, { "epoch": 0.14275517487508924, "grad_norm": 6.376378536224365, "learning_rate": 0.00013761467889908255, "loss": 4.9536, "step": 100 }, { "epoch": 0.17130620985010706, "eval_loss": 1.7338049411773682, "eval_runtime": 3.5172, "eval_samples_per_second": 32.696, "eval_steps_per_second": 1.137, "step": 120 }, { "epoch": 0.22840827980014275, "eval_loss": 1.6253185272216797, "eval_runtime": 3.4638, "eval_samples_per_second": 33.201, "eval_steps_per_second": 1.155, "step": 160 }, { "epoch": 0.28551034975017847, "grad_norm": 3.5151774883270264, "learning_rate": 0.0002752293577981651, "loss": 3.6406, "step": 200 }, { "epoch": 0.28551034975017847, "eval_loss": 1.5643784999847412, "eval_runtime": 3.5326, "eval_samples_per_second": 32.554, "eval_steps_per_second": 1.132, "step": 200 }, { "epoch": 0.3426124197002141, "eval_loss": 1.4933531284332275, "eval_runtime": 3.5491, "eval_samples_per_second": 32.402, "eval_steps_per_second": 1.127, "step": 240 }, { "epoch": 0.3997144896502498, "eval_loss": 1.4442161321640015, "eval_runtime": 3.5073, "eval_samples_per_second": 32.789, "eval_steps_per_second": 1.14, "step": 280 }, { "epoch": 0.4282655246252677, "grad_norm": 3.7754976749420166, "learning_rate": 0.0002999958858736361, "loss": 3.1356, "step": 300 }, { "epoch": 0.4568165596002855, "eval_loss": 1.3713915348052979, "eval_runtime": 3.4547, "eval_samples_per_second": 33.288, "eval_steps_per_second": 1.158, "step": 320 }, { "epoch": 0.5139186295503212, "eval_loss": 1.3405636548995972, "eval_runtime": 3.4457, "eval_samples_per_second": 33.375, "eval_steps_per_second": 1.161, "step": 360 }, { "epoch": 0.5710206995003569, "grad_norm": 4.63564920425415, "learning_rate": 0.00029997973321299517, "loss": 2.8521, "step": 400 }, { "epoch": 0.5710206995003569, "eval_loss": 1.3139711618423462, "eval_runtime": 3.5109, "eval_samples_per_second": 32.756, "eval_steps_per_second": 1.139, "step": 400 }, { "epoch": 0.6281227694503926, "eval_loss": 1.229024052619934, "eval_runtime": 3.4196, "eval_samples_per_second": 33.63, "eval_steps_per_second": 1.17, "step": 440 }, { "epoch": 0.6852248394004282, "eval_loss": 1.1974313259124756, "eval_runtime": 3.4703, "eval_samples_per_second": 33.138, "eval_steps_per_second": 1.153, "step": 480 }, { "epoch": 0.7137758743754461, "grad_norm": 3.106452226638794, "learning_rate": 0.0002999513450932977, "loss": 2.7573, "step": 500 }, { "epoch": 0.742326909350464, "eval_loss": 1.1787874698638916, "eval_runtime": 3.47, "eval_samples_per_second": 33.141, "eval_steps_per_second": 1.153, "step": 520 }, { "epoch": 0.7994289793004996, "eval_loss": 1.1294182538986206, "eval_runtime": 3.4194, "eval_samples_per_second": 33.632, "eval_steps_per_second": 1.17, "step": 560 }, { "epoch": 0.8565310492505354, "grad_norm": 4.655078411102295, "learning_rate": 0.00029991072383046797, "loss": 2.3985, "step": 600 }, { "epoch": 0.8565310492505354, "eval_loss": 1.1245826482772827, "eval_runtime": 3.472, "eval_samples_per_second": 33.122, "eval_steps_per_second": 1.152, "step": 600 }, { "epoch": 0.913633119200571, "eval_loss": 1.0636686086654663, "eval_runtime": 3.4569, "eval_samples_per_second": 33.267, "eval_steps_per_second": 1.157, "step": 640 }, { "epoch": 0.9707351891506067, "eval_loss": 1.0166677236557007, "eval_runtime": 3.6167, "eval_samples_per_second": 31.797, "eval_steps_per_second": 1.106, "step": 680 }, { "epoch": 0.9992862241256245, "grad_norm": 4.026608467102051, "learning_rate": 0.0002998578727384189, "loss": 2.2745, "step": 700 }, { "epoch": 1.0278372591006424, "eval_loss": 1.0103121995925903, "eval_runtime": 3.4673, "eval_samples_per_second": 33.167, "eval_steps_per_second": 1.154, "step": 720 }, { "epoch": 1.0849393290506781, "eval_loss": 0.9873223900794983, "eval_runtime": 3.4608, "eval_samples_per_second": 33.229, "eval_steps_per_second": 1.156, "step": 760 }, { "epoch": 1.1420413990007137, "grad_norm": 6.209025859832764, "learning_rate": 0.00029979279612878226, "loss": 1.8141, "step": 800 }, { "epoch": 1.1420413990007137, "eval_loss": 0.9796751141548157, "eval_runtime": 3.5186, "eval_samples_per_second": 32.683, "eval_steps_per_second": 1.137, "step": 800 }, { "epoch": 1.1991434689507494, "eval_loss": 0.9596832990646362, "eval_runtime": 3.4646, "eval_samples_per_second": 33.193, "eval_steps_per_second": 1.155, "step": 840 }, { "epoch": 1.2562455389007852, "eval_loss": 0.945811927318573, "eval_runtime": 3.4319, "eval_samples_per_second": 33.509, "eval_steps_per_second": 1.166, "step": 880 }, { "epoch": 1.284796573875803, "grad_norm": 4.082645416259766, "learning_rate": 0.0002997154993105566, "loss": 1.5585, "step": 900 }, { "epoch": 1.313347608850821, "eval_loss": 0.9203804135322571, "eval_runtime": 3.4359, "eval_samples_per_second": 33.47, "eval_steps_per_second": 1.164, "step": 920 }, { "epoch": 1.3704496788008567, "eval_loss": 0.9172277450561523, "eval_runtime": 3.479, "eval_samples_per_second": 33.056, "eval_steps_per_second": 1.15, "step": 960 }, { "epoch": 1.4275517487508922, "grad_norm": 4.086517333984375, "learning_rate": 0.0002996259885896743, "loss": 1.5745, "step": 1000 }, { "epoch": 1.4275517487508922, "eval_loss": 0.9019652009010315, "eval_runtime": 3.5068, "eval_samples_per_second": 32.793, "eval_steps_per_second": 1.141, "step": 1000 }, { "epoch": 1.484653818700928, "eval_loss": 0.8774219751358032, "eval_runtime": 3.5022, "eval_samples_per_second": 32.836, "eval_steps_per_second": 1.142, "step": 1040 }, { "epoch": 1.5417558886509637, "eval_loss": 0.8411704897880554, "eval_runtime": 3.4781, "eval_samples_per_second": 33.064, "eval_steps_per_second": 1.15, "step": 1080 }, { "epoch": 1.5703069236259815, "grad_norm": 1.3416516780853271, "learning_rate": 0.0002995242712684871, "loss": 1.3623, "step": 1100 }, { "epoch": 1.5988579586009992, "eval_loss": 0.8341473340988159, "eval_runtime": 3.5251, "eval_samples_per_second": 32.623, "eval_steps_per_second": 1.135, "step": 1120 }, { "epoch": 1.655960028551035, "eval_loss": 0.8323267102241516, "eval_runtime": 3.4934, "eval_samples_per_second": 32.919, "eval_steps_per_second": 1.145, "step": 1160 }, { "epoch": 1.7130620985010707, "grad_norm": 5.395486354827881, "learning_rate": 0.0002994103556451703, "loss": 1.3262, "step": 1200 }, { "epoch": 1.7130620985010707, "eval_loss": 0.7996125221252441, "eval_runtime": 3.4858, "eval_samples_per_second": 32.991, "eval_steps_per_second": 1.147, "step": 1200 }, { "epoch": 1.7701641684511062, "eval_loss": 0.8066531419754028, "eval_runtime": 3.4788, "eval_samples_per_second": 33.057, "eval_steps_per_second": 1.15, "step": 1240 }, { "epoch": 1.827266238401142, "eval_loss": 0.7751766443252563, "eval_runtime": 3.4563, "eval_samples_per_second": 33.273, "eval_steps_per_second": 1.157, "step": 1280 }, { "epoch": 1.85581727337616, "grad_norm": 1.97977876663208, "learning_rate": 0.00029928425101304583, "loss": 1.2514, "step": 1300 }, { "epoch": 1.8843683083511777, "eval_loss": 0.7176135778427124, "eval_runtime": 3.4289, "eval_samples_per_second": 33.539, "eval_steps_per_second": 1.167, "step": 1320 }, { "epoch": 1.9414703783012133, "eval_loss": 0.7170359492301941, "eval_runtime": 3.4671, "eval_samples_per_second": 33.168, "eval_steps_per_second": 1.154, "step": 1360 }, { "epoch": 1.9985724482512492, "grad_norm": 3.208996057510376, "learning_rate": 0.0002991459676598241, "loss": 1.2277, "step": 1400 }, { "epoch": 1.9985724482512492, "eval_loss": 0.7038947939872742, "eval_runtime": 3.4967, "eval_samples_per_second": 32.888, "eval_steps_per_second": 1.144, "step": 1400 }, { "epoch": 2.0556745182012848, "eval_loss": 0.7032578587532043, "eval_runtime": 3.4286, "eval_samples_per_second": 33.542, "eval_steps_per_second": 1.167, "step": 1440 }, { "epoch": 2.1127765881513203, "eval_loss": 0.7024741172790527, "eval_runtime": 3.4401, "eval_samples_per_second": 33.429, "eval_steps_per_second": 1.163, "step": 1480 }, { "epoch": 2.1413276231263385, "grad_norm": 5.024557113647461, "learning_rate": 0.0002989955168667647, "loss": 0.68, "step": 1500 }, { "epoch": 2.1698786581013563, "eval_loss": 0.7079240083694458, "eval_runtime": 3.4687, "eval_samples_per_second": 33.154, "eval_steps_per_second": 1.153, "step": 1520 }, { "epoch": 2.226980728051392, "eval_loss": 0.7214464545249939, "eval_runtime": 3.4537, "eval_samples_per_second": 33.297, "eval_steps_per_second": 1.158, "step": 1560 }, { "epoch": 2.2840827980014273, "grad_norm": 2.2797622680664062, "learning_rate": 0.0002988329109077561, "loss": 0.809, "step": 1600 }, { "epoch": 2.2840827980014273, "eval_loss": 0.6896761059761047, "eval_runtime": 3.4437, "eval_samples_per_second": 33.394, "eval_steps_per_second": 1.162, "step": 1600 }, { "epoch": 2.3411848679514633, "eval_loss": 0.7046868801116943, "eval_runtime": 3.4498, "eval_samples_per_second": 33.335, "eval_steps_per_second": 1.159, "step": 1640 }, { "epoch": 2.398286937901499, "eval_loss": 0.7042288780212402, "eval_runtime": 3.425, "eval_samples_per_second": 33.577, "eval_steps_per_second": 1.168, "step": 1680 }, { "epoch": 2.4268379728765166, "grad_norm": 2.373941421508789, "learning_rate": 0.00029865816304831436, "loss": 0.7931, "step": 1700 }, { "epoch": 2.455389007851535, "eval_loss": 0.6791558861732483, "eval_runtime": 3.4601, "eval_samples_per_second": 33.236, "eval_steps_per_second": 1.156, "step": 1720 }, { "epoch": 2.5124910778015703, "eval_loss": 0.6776650547981262, "eval_runtime": 3.4939, "eval_samples_per_second": 32.915, "eval_steps_per_second": 1.145, "step": 1760 }, { "epoch": 2.569593147751606, "grad_norm": 3.9278931617736816, "learning_rate": 0.0002984712875445008, "loss": 0.6957, "step": 1800 }, { "epoch": 2.569593147751606, "eval_loss": 0.6809150576591492, "eval_runtime": 3.4802, "eval_samples_per_second": 33.044, "eval_steps_per_second": 1.149, "step": 1800 }, { "epoch": 2.626695217701642, "eval_loss": 0.646575927734375, "eval_runtime": 3.462, "eval_samples_per_second": 33.218, "eval_steps_per_second": 1.155, "step": 1840 }, { "epoch": 2.6837972876516774, "eval_loss": 0.6704702377319336, "eval_runtime": 3.4863, "eval_samples_per_second": 32.986, "eval_steps_per_second": 1.147, "step": 1880 }, { "epoch": 2.712348322626695, "grad_norm": 3.325489044189453, "learning_rate": 0.0002982722996417592, "loss": 0.7611, "step": 1900 }, { "epoch": 2.7408993576017133, "eval_loss": 0.6548537015914917, "eval_runtime": 3.5385, "eval_samples_per_second": 32.5, "eval_steps_per_second": 1.13, "step": 1920 }, { "epoch": 2.798001427551749, "eval_loss": 0.6539962887763977, "eval_runtime": 3.4712, "eval_samples_per_second": 33.13, "eval_steps_per_second": 1.152, "step": 1960 } ], "logging_steps": 100, "max_steps": 35000, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 40, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.3002531516841984e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }