|
{ |
|
"best_metric": 0.646575927734375, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-1840", |
|
"epoch": 2.798001427551749, |
|
"eval_steps": 40, |
|
"global_step": 1960, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0014275517487508922, |
|
"eval_loss": 2.6241602897644043, |
|
"eval_runtime": 3.454, |
|
"eval_samples_per_second": 33.295, |
|
"eval_steps_per_second": 1.158, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.05710206995003569, |
|
"eval_loss": 2.2859995365142822, |
|
"eval_runtime": 3.4369, |
|
"eval_samples_per_second": 33.461, |
|
"eval_steps_per_second": 1.164, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11420413990007137, |
|
"eval_loss": 1.9066081047058105, |
|
"eval_runtime": 3.569, |
|
"eval_samples_per_second": 32.222, |
|
"eval_steps_per_second": 1.121, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14275517487508924, |
|
"grad_norm": 6.376378536224365, |
|
"learning_rate": 0.00013761467889908255, |
|
"loss": 4.9536, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.17130620985010706, |
|
"eval_loss": 1.7338049411773682, |
|
"eval_runtime": 3.5172, |
|
"eval_samples_per_second": 32.696, |
|
"eval_steps_per_second": 1.137, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.22840827980014275, |
|
"eval_loss": 1.6253185272216797, |
|
"eval_runtime": 3.4638, |
|
"eval_samples_per_second": 33.201, |
|
"eval_steps_per_second": 1.155, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.28551034975017847, |
|
"grad_norm": 3.5151774883270264, |
|
"learning_rate": 0.0002752293577981651, |
|
"loss": 3.6406, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.28551034975017847, |
|
"eval_loss": 1.5643784999847412, |
|
"eval_runtime": 3.5326, |
|
"eval_samples_per_second": 32.554, |
|
"eval_steps_per_second": 1.132, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3426124197002141, |
|
"eval_loss": 1.4933531284332275, |
|
"eval_runtime": 3.5491, |
|
"eval_samples_per_second": 32.402, |
|
"eval_steps_per_second": 1.127, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3997144896502498, |
|
"eval_loss": 1.4442161321640015, |
|
"eval_runtime": 3.5073, |
|
"eval_samples_per_second": 32.789, |
|
"eval_steps_per_second": 1.14, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4282655246252677, |
|
"grad_norm": 3.7754976749420166, |
|
"learning_rate": 0.0002999958858736361, |
|
"loss": 3.1356, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4568165596002855, |
|
"eval_loss": 1.3713915348052979, |
|
"eval_runtime": 3.4547, |
|
"eval_samples_per_second": 33.288, |
|
"eval_steps_per_second": 1.158, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5139186295503212, |
|
"eval_loss": 1.3405636548995972, |
|
"eval_runtime": 3.4457, |
|
"eval_samples_per_second": 33.375, |
|
"eval_steps_per_second": 1.161, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5710206995003569, |
|
"grad_norm": 4.63564920425415, |
|
"learning_rate": 0.00029997973321299517, |
|
"loss": 2.8521, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5710206995003569, |
|
"eval_loss": 1.3139711618423462, |
|
"eval_runtime": 3.5109, |
|
"eval_samples_per_second": 32.756, |
|
"eval_steps_per_second": 1.139, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6281227694503926, |
|
"eval_loss": 1.229024052619934, |
|
"eval_runtime": 3.4196, |
|
"eval_samples_per_second": 33.63, |
|
"eval_steps_per_second": 1.17, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6852248394004282, |
|
"eval_loss": 1.1974313259124756, |
|
"eval_runtime": 3.4703, |
|
"eval_samples_per_second": 33.138, |
|
"eval_steps_per_second": 1.153, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7137758743754461, |
|
"grad_norm": 3.106452226638794, |
|
"learning_rate": 0.0002999513450932977, |
|
"loss": 2.7573, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.742326909350464, |
|
"eval_loss": 1.1787874698638916, |
|
"eval_runtime": 3.47, |
|
"eval_samples_per_second": 33.141, |
|
"eval_steps_per_second": 1.153, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7994289793004996, |
|
"eval_loss": 1.1294182538986206, |
|
"eval_runtime": 3.4194, |
|
"eval_samples_per_second": 33.632, |
|
"eval_steps_per_second": 1.17, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8565310492505354, |
|
"grad_norm": 4.655078411102295, |
|
"learning_rate": 0.00029991072383046797, |
|
"loss": 2.3985, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8565310492505354, |
|
"eval_loss": 1.1245826482772827, |
|
"eval_runtime": 3.472, |
|
"eval_samples_per_second": 33.122, |
|
"eval_steps_per_second": 1.152, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.913633119200571, |
|
"eval_loss": 1.0636686086654663, |
|
"eval_runtime": 3.4569, |
|
"eval_samples_per_second": 33.267, |
|
"eval_steps_per_second": 1.157, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9707351891506067, |
|
"eval_loss": 1.0166677236557007, |
|
"eval_runtime": 3.6167, |
|
"eval_samples_per_second": 31.797, |
|
"eval_steps_per_second": 1.106, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.9992862241256245, |
|
"grad_norm": 4.026608467102051, |
|
"learning_rate": 0.0002998578727384189, |
|
"loss": 2.2745, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0278372591006424, |
|
"eval_loss": 1.0103121995925903, |
|
"eval_runtime": 3.4673, |
|
"eval_samples_per_second": 33.167, |
|
"eval_steps_per_second": 1.154, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.0849393290506781, |
|
"eval_loss": 0.9873223900794983, |
|
"eval_runtime": 3.4608, |
|
"eval_samples_per_second": 33.229, |
|
"eval_steps_per_second": 1.156, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1420413990007137, |
|
"grad_norm": 6.209025859832764, |
|
"learning_rate": 0.00029979279612878226, |
|
"loss": 1.8141, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1420413990007137, |
|
"eval_loss": 0.9796751141548157, |
|
"eval_runtime": 3.5186, |
|
"eval_samples_per_second": 32.683, |
|
"eval_steps_per_second": 1.137, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1991434689507494, |
|
"eval_loss": 0.9596832990646362, |
|
"eval_runtime": 3.4646, |
|
"eval_samples_per_second": 33.193, |
|
"eval_steps_per_second": 1.155, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.2562455389007852, |
|
"eval_loss": 0.945811927318573, |
|
"eval_runtime": 3.4319, |
|
"eval_samples_per_second": 33.509, |
|
"eval_steps_per_second": 1.166, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.284796573875803, |
|
"grad_norm": 4.082645416259766, |
|
"learning_rate": 0.0002997154993105566, |
|
"loss": 1.5585, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.313347608850821, |
|
"eval_loss": 0.9203804135322571, |
|
"eval_runtime": 3.4359, |
|
"eval_samples_per_second": 33.47, |
|
"eval_steps_per_second": 1.164, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.3704496788008567, |
|
"eval_loss": 0.9172277450561523, |
|
"eval_runtime": 3.479, |
|
"eval_samples_per_second": 33.056, |
|
"eval_steps_per_second": 1.15, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4275517487508922, |
|
"grad_norm": 4.086517333984375, |
|
"learning_rate": 0.0002996259885896743, |
|
"loss": 1.5745, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.4275517487508922, |
|
"eval_loss": 0.9019652009010315, |
|
"eval_runtime": 3.5068, |
|
"eval_samples_per_second": 32.793, |
|
"eval_steps_per_second": 1.141, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.484653818700928, |
|
"eval_loss": 0.8774219751358032, |
|
"eval_runtime": 3.5022, |
|
"eval_samples_per_second": 32.836, |
|
"eval_steps_per_second": 1.142, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.5417558886509637, |
|
"eval_loss": 0.8411704897880554, |
|
"eval_runtime": 3.4781, |
|
"eval_samples_per_second": 33.064, |
|
"eval_steps_per_second": 1.15, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.5703069236259815, |
|
"grad_norm": 1.3416516780853271, |
|
"learning_rate": 0.0002995242712684871, |
|
"loss": 1.3623, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.5988579586009992, |
|
"eval_loss": 0.8341473340988159, |
|
"eval_runtime": 3.5251, |
|
"eval_samples_per_second": 32.623, |
|
"eval_steps_per_second": 1.135, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.655960028551035, |
|
"eval_loss": 0.8323267102241516, |
|
"eval_runtime": 3.4934, |
|
"eval_samples_per_second": 32.919, |
|
"eval_steps_per_second": 1.145, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.7130620985010707, |
|
"grad_norm": 5.395486354827881, |
|
"learning_rate": 0.0002994103556451703, |
|
"loss": 1.3262, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7130620985010707, |
|
"eval_loss": 0.7996125221252441, |
|
"eval_runtime": 3.4858, |
|
"eval_samples_per_second": 32.991, |
|
"eval_steps_per_second": 1.147, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7701641684511062, |
|
"eval_loss": 0.8066531419754028, |
|
"eval_runtime": 3.4788, |
|
"eval_samples_per_second": 33.057, |
|
"eval_steps_per_second": 1.15, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.827266238401142, |
|
"eval_loss": 0.7751766443252563, |
|
"eval_runtime": 3.4563, |
|
"eval_samples_per_second": 33.273, |
|
"eval_steps_per_second": 1.157, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.85581727337616, |
|
"grad_norm": 1.97977876663208, |
|
"learning_rate": 0.00029928425101304583, |
|
"loss": 1.2514, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.8843683083511777, |
|
"eval_loss": 0.7176135778427124, |
|
"eval_runtime": 3.4289, |
|
"eval_samples_per_second": 33.539, |
|
"eval_steps_per_second": 1.167, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.9414703783012133, |
|
"eval_loss": 0.7170359492301941, |
|
"eval_runtime": 3.4671, |
|
"eval_samples_per_second": 33.168, |
|
"eval_steps_per_second": 1.154, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.9985724482512492, |
|
"grad_norm": 3.208996057510376, |
|
"learning_rate": 0.0002991459676598241, |
|
"loss": 1.2277, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.9985724482512492, |
|
"eval_loss": 0.7038947939872742, |
|
"eval_runtime": 3.4967, |
|
"eval_samples_per_second": 32.888, |
|
"eval_steps_per_second": 1.144, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.0556745182012848, |
|
"eval_loss": 0.7032578587532043, |
|
"eval_runtime": 3.4286, |
|
"eval_samples_per_second": 33.542, |
|
"eval_steps_per_second": 1.167, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.1127765881513203, |
|
"eval_loss": 0.7024741172790527, |
|
"eval_runtime": 3.4401, |
|
"eval_samples_per_second": 33.429, |
|
"eval_steps_per_second": 1.163, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.1413276231263385, |
|
"grad_norm": 5.024557113647461, |
|
"learning_rate": 0.0002989955168667647, |
|
"loss": 0.68, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.1698786581013563, |
|
"eval_loss": 0.7079240083694458, |
|
"eval_runtime": 3.4687, |
|
"eval_samples_per_second": 33.154, |
|
"eval_steps_per_second": 1.153, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.226980728051392, |
|
"eval_loss": 0.7214464545249939, |
|
"eval_runtime": 3.4537, |
|
"eval_samples_per_second": 33.297, |
|
"eval_steps_per_second": 1.158, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.2840827980014273, |
|
"grad_norm": 2.2797622680664062, |
|
"learning_rate": 0.0002988329109077561, |
|
"loss": 0.809, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.2840827980014273, |
|
"eval_loss": 0.6896761059761047, |
|
"eval_runtime": 3.4437, |
|
"eval_samples_per_second": 33.394, |
|
"eval_steps_per_second": 1.162, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.3411848679514633, |
|
"eval_loss": 0.7046868801116943, |
|
"eval_runtime": 3.4498, |
|
"eval_samples_per_second": 33.335, |
|
"eval_steps_per_second": 1.159, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.398286937901499, |
|
"eval_loss": 0.7042288780212402, |
|
"eval_runtime": 3.425, |
|
"eval_samples_per_second": 33.577, |
|
"eval_steps_per_second": 1.168, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.4268379728765166, |
|
"grad_norm": 2.373941421508789, |
|
"learning_rate": 0.00029865816304831436, |
|
"loss": 0.7931, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.455389007851535, |
|
"eval_loss": 0.6791558861732483, |
|
"eval_runtime": 3.4601, |
|
"eval_samples_per_second": 33.236, |
|
"eval_steps_per_second": 1.156, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.5124910778015703, |
|
"eval_loss": 0.6776650547981262, |
|
"eval_runtime": 3.4939, |
|
"eval_samples_per_second": 32.915, |
|
"eval_steps_per_second": 1.145, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.569593147751606, |
|
"grad_norm": 3.9278931617736816, |
|
"learning_rate": 0.0002984712875445008, |
|
"loss": 0.6957, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.569593147751606, |
|
"eval_loss": 0.6809150576591492, |
|
"eval_runtime": 3.4802, |
|
"eval_samples_per_second": 33.044, |
|
"eval_steps_per_second": 1.149, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.626695217701642, |
|
"eval_loss": 0.646575927734375, |
|
"eval_runtime": 3.462, |
|
"eval_samples_per_second": 33.218, |
|
"eval_steps_per_second": 1.155, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.6837972876516774, |
|
"eval_loss": 0.6704702377319336, |
|
"eval_runtime": 3.4863, |
|
"eval_samples_per_second": 32.986, |
|
"eval_steps_per_second": 1.147, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.712348322626695, |
|
"grad_norm": 3.325489044189453, |
|
"learning_rate": 0.0002982722996417592, |
|
"loss": 0.7611, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.7408993576017133, |
|
"eval_loss": 0.6548537015914917, |
|
"eval_runtime": 3.5385, |
|
"eval_samples_per_second": 32.5, |
|
"eval_steps_per_second": 1.13, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.798001427551749, |
|
"eval_loss": 0.6539962887763977, |
|
"eval_runtime": 3.4712, |
|
"eval_samples_per_second": 33.13, |
|
"eval_steps_per_second": 1.152, |
|
"step": 1960 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 35000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 40, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 3 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.3002531516841984e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|