mrferr3t's picture
Training in progress, step 1960, checkpoint
e01f90a verified
raw
history blame
15 kB
{
"best_metric": 0.646575927734375,
"best_model_checkpoint": "miner_id_24/checkpoint-1840",
"epoch": 2.798001427551749,
"eval_steps": 40,
"global_step": 1960,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014275517487508922,
"eval_loss": 2.6241602897644043,
"eval_runtime": 3.454,
"eval_samples_per_second": 33.295,
"eval_steps_per_second": 1.158,
"step": 1
},
{
"epoch": 0.05710206995003569,
"eval_loss": 2.2859995365142822,
"eval_runtime": 3.4369,
"eval_samples_per_second": 33.461,
"eval_steps_per_second": 1.164,
"step": 40
},
{
"epoch": 0.11420413990007137,
"eval_loss": 1.9066081047058105,
"eval_runtime": 3.569,
"eval_samples_per_second": 32.222,
"eval_steps_per_second": 1.121,
"step": 80
},
{
"epoch": 0.14275517487508924,
"grad_norm": 6.376378536224365,
"learning_rate": 0.00013761467889908255,
"loss": 4.9536,
"step": 100
},
{
"epoch": 0.17130620985010706,
"eval_loss": 1.7338049411773682,
"eval_runtime": 3.5172,
"eval_samples_per_second": 32.696,
"eval_steps_per_second": 1.137,
"step": 120
},
{
"epoch": 0.22840827980014275,
"eval_loss": 1.6253185272216797,
"eval_runtime": 3.4638,
"eval_samples_per_second": 33.201,
"eval_steps_per_second": 1.155,
"step": 160
},
{
"epoch": 0.28551034975017847,
"grad_norm": 3.5151774883270264,
"learning_rate": 0.0002752293577981651,
"loss": 3.6406,
"step": 200
},
{
"epoch": 0.28551034975017847,
"eval_loss": 1.5643784999847412,
"eval_runtime": 3.5326,
"eval_samples_per_second": 32.554,
"eval_steps_per_second": 1.132,
"step": 200
},
{
"epoch": 0.3426124197002141,
"eval_loss": 1.4933531284332275,
"eval_runtime": 3.5491,
"eval_samples_per_second": 32.402,
"eval_steps_per_second": 1.127,
"step": 240
},
{
"epoch": 0.3997144896502498,
"eval_loss": 1.4442161321640015,
"eval_runtime": 3.5073,
"eval_samples_per_second": 32.789,
"eval_steps_per_second": 1.14,
"step": 280
},
{
"epoch": 0.4282655246252677,
"grad_norm": 3.7754976749420166,
"learning_rate": 0.0002999958858736361,
"loss": 3.1356,
"step": 300
},
{
"epoch": 0.4568165596002855,
"eval_loss": 1.3713915348052979,
"eval_runtime": 3.4547,
"eval_samples_per_second": 33.288,
"eval_steps_per_second": 1.158,
"step": 320
},
{
"epoch": 0.5139186295503212,
"eval_loss": 1.3405636548995972,
"eval_runtime": 3.4457,
"eval_samples_per_second": 33.375,
"eval_steps_per_second": 1.161,
"step": 360
},
{
"epoch": 0.5710206995003569,
"grad_norm": 4.63564920425415,
"learning_rate": 0.00029997973321299517,
"loss": 2.8521,
"step": 400
},
{
"epoch": 0.5710206995003569,
"eval_loss": 1.3139711618423462,
"eval_runtime": 3.5109,
"eval_samples_per_second": 32.756,
"eval_steps_per_second": 1.139,
"step": 400
},
{
"epoch": 0.6281227694503926,
"eval_loss": 1.229024052619934,
"eval_runtime": 3.4196,
"eval_samples_per_second": 33.63,
"eval_steps_per_second": 1.17,
"step": 440
},
{
"epoch": 0.6852248394004282,
"eval_loss": 1.1974313259124756,
"eval_runtime": 3.4703,
"eval_samples_per_second": 33.138,
"eval_steps_per_second": 1.153,
"step": 480
},
{
"epoch": 0.7137758743754461,
"grad_norm": 3.106452226638794,
"learning_rate": 0.0002999513450932977,
"loss": 2.7573,
"step": 500
},
{
"epoch": 0.742326909350464,
"eval_loss": 1.1787874698638916,
"eval_runtime": 3.47,
"eval_samples_per_second": 33.141,
"eval_steps_per_second": 1.153,
"step": 520
},
{
"epoch": 0.7994289793004996,
"eval_loss": 1.1294182538986206,
"eval_runtime": 3.4194,
"eval_samples_per_second": 33.632,
"eval_steps_per_second": 1.17,
"step": 560
},
{
"epoch": 0.8565310492505354,
"grad_norm": 4.655078411102295,
"learning_rate": 0.00029991072383046797,
"loss": 2.3985,
"step": 600
},
{
"epoch": 0.8565310492505354,
"eval_loss": 1.1245826482772827,
"eval_runtime": 3.472,
"eval_samples_per_second": 33.122,
"eval_steps_per_second": 1.152,
"step": 600
},
{
"epoch": 0.913633119200571,
"eval_loss": 1.0636686086654663,
"eval_runtime": 3.4569,
"eval_samples_per_second": 33.267,
"eval_steps_per_second": 1.157,
"step": 640
},
{
"epoch": 0.9707351891506067,
"eval_loss": 1.0166677236557007,
"eval_runtime": 3.6167,
"eval_samples_per_second": 31.797,
"eval_steps_per_second": 1.106,
"step": 680
},
{
"epoch": 0.9992862241256245,
"grad_norm": 4.026608467102051,
"learning_rate": 0.0002998578727384189,
"loss": 2.2745,
"step": 700
},
{
"epoch": 1.0278372591006424,
"eval_loss": 1.0103121995925903,
"eval_runtime": 3.4673,
"eval_samples_per_second": 33.167,
"eval_steps_per_second": 1.154,
"step": 720
},
{
"epoch": 1.0849393290506781,
"eval_loss": 0.9873223900794983,
"eval_runtime": 3.4608,
"eval_samples_per_second": 33.229,
"eval_steps_per_second": 1.156,
"step": 760
},
{
"epoch": 1.1420413990007137,
"grad_norm": 6.209025859832764,
"learning_rate": 0.00029979279612878226,
"loss": 1.8141,
"step": 800
},
{
"epoch": 1.1420413990007137,
"eval_loss": 0.9796751141548157,
"eval_runtime": 3.5186,
"eval_samples_per_second": 32.683,
"eval_steps_per_second": 1.137,
"step": 800
},
{
"epoch": 1.1991434689507494,
"eval_loss": 0.9596832990646362,
"eval_runtime": 3.4646,
"eval_samples_per_second": 33.193,
"eval_steps_per_second": 1.155,
"step": 840
},
{
"epoch": 1.2562455389007852,
"eval_loss": 0.945811927318573,
"eval_runtime": 3.4319,
"eval_samples_per_second": 33.509,
"eval_steps_per_second": 1.166,
"step": 880
},
{
"epoch": 1.284796573875803,
"grad_norm": 4.082645416259766,
"learning_rate": 0.0002997154993105566,
"loss": 1.5585,
"step": 900
},
{
"epoch": 1.313347608850821,
"eval_loss": 0.9203804135322571,
"eval_runtime": 3.4359,
"eval_samples_per_second": 33.47,
"eval_steps_per_second": 1.164,
"step": 920
},
{
"epoch": 1.3704496788008567,
"eval_loss": 0.9172277450561523,
"eval_runtime": 3.479,
"eval_samples_per_second": 33.056,
"eval_steps_per_second": 1.15,
"step": 960
},
{
"epoch": 1.4275517487508922,
"grad_norm": 4.086517333984375,
"learning_rate": 0.0002996259885896743,
"loss": 1.5745,
"step": 1000
},
{
"epoch": 1.4275517487508922,
"eval_loss": 0.9019652009010315,
"eval_runtime": 3.5068,
"eval_samples_per_second": 32.793,
"eval_steps_per_second": 1.141,
"step": 1000
},
{
"epoch": 1.484653818700928,
"eval_loss": 0.8774219751358032,
"eval_runtime": 3.5022,
"eval_samples_per_second": 32.836,
"eval_steps_per_second": 1.142,
"step": 1040
},
{
"epoch": 1.5417558886509637,
"eval_loss": 0.8411704897880554,
"eval_runtime": 3.4781,
"eval_samples_per_second": 33.064,
"eval_steps_per_second": 1.15,
"step": 1080
},
{
"epoch": 1.5703069236259815,
"grad_norm": 1.3416516780853271,
"learning_rate": 0.0002995242712684871,
"loss": 1.3623,
"step": 1100
},
{
"epoch": 1.5988579586009992,
"eval_loss": 0.8341473340988159,
"eval_runtime": 3.5251,
"eval_samples_per_second": 32.623,
"eval_steps_per_second": 1.135,
"step": 1120
},
{
"epoch": 1.655960028551035,
"eval_loss": 0.8323267102241516,
"eval_runtime": 3.4934,
"eval_samples_per_second": 32.919,
"eval_steps_per_second": 1.145,
"step": 1160
},
{
"epoch": 1.7130620985010707,
"grad_norm": 5.395486354827881,
"learning_rate": 0.0002994103556451703,
"loss": 1.3262,
"step": 1200
},
{
"epoch": 1.7130620985010707,
"eval_loss": 0.7996125221252441,
"eval_runtime": 3.4858,
"eval_samples_per_second": 32.991,
"eval_steps_per_second": 1.147,
"step": 1200
},
{
"epoch": 1.7701641684511062,
"eval_loss": 0.8066531419754028,
"eval_runtime": 3.4788,
"eval_samples_per_second": 33.057,
"eval_steps_per_second": 1.15,
"step": 1240
},
{
"epoch": 1.827266238401142,
"eval_loss": 0.7751766443252563,
"eval_runtime": 3.4563,
"eval_samples_per_second": 33.273,
"eval_steps_per_second": 1.157,
"step": 1280
},
{
"epoch": 1.85581727337616,
"grad_norm": 1.97977876663208,
"learning_rate": 0.00029928425101304583,
"loss": 1.2514,
"step": 1300
},
{
"epoch": 1.8843683083511777,
"eval_loss": 0.7176135778427124,
"eval_runtime": 3.4289,
"eval_samples_per_second": 33.539,
"eval_steps_per_second": 1.167,
"step": 1320
},
{
"epoch": 1.9414703783012133,
"eval_loss": 0.7170359492301941,
"eval_runtime": 3.4671,
"eval_samples_per_second": 33.168,
"eval_steps_per_second": 1.154,
"step": 1360
},
{
"epoch": 1.9985724482512492,
"grad_norm": 3.208996057510376,
"learning_rate": 0.0002991459676598241,
"loss": 1.2277,
"step": 1400
},
{
"epoch": 1.9985724482512492,
"eval_loss": 0.7038947939872742,
"eval_runtime": 3.4967,
"eval_samples_per_second": 32.888,
"eval_steps_per_second": 1.144,
"step": 1400
},
{
"epoch": 2.0556745182012848,
"eval_loss": 0.7032578587532043,
"eval_runtime": 3.4286,
"eval_samples_per_second": 33.542,
"eval_steps_per_second": 1.167,
"step": 1440
},
{
"epoch": 2.1127765881513203,
"eval_loss": 0.7024741172790527,
"eval_runtime": 3.4401,
"eval_samples_per_second": 33.429,
"eval_steps_per_second": 1.163,
"step": 1480
},
{
"epoch": 2.1413276231263385,
"grad_norm": 5.024557113647461,
"learning_rate": 0.0002989955168667647,
"loss": 0.68,
"step": 1500
},
{
"epoch": 2.1698786581013563,
"eval_loss": 0.7079240083694458,
"eval_runtime": 3.4687,
"eval_samples_per_second": 33.154,
"eval_steps_per_second": 1.153,
"step": 1520
},
{
"epoch": 2.226980728051392,
"eval_loss": 0.7214464545249939,
"eval_runtime": 3.4537,
"eval_samples_per_second": 33.297,
"eval_steps_per_second": 1.158,
"step": 1560
},
{
"epoch": 2.2840827980014273,
"grad_norm": 2.2797622680664062,
"learning_rate": 0.0002988329109077561,
"loss": 0.809,
"step": 1600
},
{
"epoch": 2.2840827980014273,
"eval_loss": 0.6896761059761047,
"eval_runtime": 3.4437,
"eval_samples_per_second": 33.394,
"eval_steps_per_second": 1.162,
"step": 1600
},
{
"epoch": 2.3411848679514633,
"eval_loss": 0.7046868801116943,
"eval_runtime": 3.4498,
"eval_samples_per_second": 33.335,
"eval_steps_per_second": 1.159,
"step": 1640
},
{
"epoch": 2.398286937901499,
"eval_loss": 0.7042288780212402,
"eval_runtime": 3.425,
"eval_samples_per_second": 33.577,
"eval_steps_per_second": 1.168,
"step": 1680
},
{
"epoch": 2.4268379728765166,
"grad_norm": 2.373941421508789,
"learning_rate": 0.00029865816304831436,
"loss": 0.7931,
"step": 1700
},
{
"epoch": 2.455389007851535,
"eval_loss": 0.6791558861732483,
"eval_runtime": 3.4601,
"eval_samples_per_second": 33.236,
"eval_steps_per_second": 1.156,
"step": 1720
},
{
"epoch": 2.5124910778015703,
"eval_loss": 0.6776650547981262,
"eval_runtime": 3.4939,
"eval_samples_per_second": 32.915,
"eval_steps_per_second": 1.145,
"step": 1760
},
{
"epoch": 2.569593147751606,
"grad_norm": 3.9278931617736816,
"learning_rate": 0.0002984712875445008,
"loss": 0.6957,
"step": 1800
},
{
"epoch": 2.569593147751606,
"eval_loss": 0.6809150576591492,
"eval_runtime": 3.4802,
"eval_samples_per_second": 33.044,
"eval_steps_per_second": 1.149,
"step": 1800
},
{
"epoch": 2.626695217701642,
"eval_loss": 0.646575927734375,
"eval_runtime": 3.462,
"eval_samples_per_second": 33.218,
"eval_steps_per_second": 1.155,
"step": 1840
},
{
"epoch": 2.6837972876516774,
"eval_loss": 0.6704702377319336,
"eval_runtime": 3.4863,
"eval_samples_per_second": 32.986,
"eval_steps_per_second": 1.147,
"step": 1880
},
{
"epoch": 2.712348322626695,
"grad_norm": 3.325489044189453,
"learning_rate": 0.0002982722996417592,
"loss": 0.7611,
"step": 1900
},
{
"epoch": 2.7408993576017133,
"eval_loss": 0.6548537015914917,
"eval_runtime": 3.5385,
"eval_samples_per_second": 32.5,
"eval_steps_per_second": 1.13,
"step": 1920
},
{
"epoch": 2.798001427551749,
"eval_loss": 0.6539962887763977,
"eval_runtime": 3.4712,
"eval_samples_per_second": 33.13,
"eval_steps_per_second": 1.152,
"step": 1960
}
],
"logging_steps": 100,
"max_steps": 35000,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 40,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.3002531516841984e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}