{
"best_metric": 0.07799232006072998,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 0.07573247503194964,
"eval_steps": 100,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007573247503194964,
"grad_norm": 0.9691388607025146,
"learning_rate": 2e-05,
"loss": 1.3196,
"step": 1
},
{
"epoch": 0.0007573247503194964,
"eval_loss": 1.60433828830719,
"eval_runtime": 661.4874,
"eval_samples_per_second": 6.724,
"eval_steps_per_second": 1.681,
"step": 1
},
{
"epoch": 0.0015146495006389928,
"grad_norm": 1.2349046468734741,
"learning_rate": 4e-05,
"loss": 1.5521,
"step": 2
},
{
"epoch": 0.002271974250958489,
"grad_norm": 1.330872654914856,
"learning_rate": 6e-05,
"loss": 1.4937,
"step": 3
},
{
"epoch": 0.0030292990012779856,
"grad_norm": 1.3608745336532593,
"learning_rate": 8e-05,
"loss": 1.5829,
"step": 4
},
{
"epoch": 0.003786623751597482,
"grad_norm": 1.3821661472320557,
"learning_rate": 0.0001,
"loss": 1.4121,
"step": 5
},
{
"epoch": 0.004543948501916978,
"grad_norm": 1.752526879310608,
"learning_rate": 0.00012,
"loss": 1.353,
"step": 6
},
{
"epoch": 0.005301273252236474,
"grad_norm": 1.3683583736419678,
"learning_rate": 0.00014,
"loss": 1.2337,
"step": 7
},
{
"epoch": 0.006058598002555971,
"grad_norm": 1.4449743032455444,
"learning_rate": 0.00016,
"loss": 0.9753,
"step": 8
},
{
"epoch": 0.006815922752875467,
"grad_norm": 1.432096242904663,
"learning_rate": 0.00018,
"loss": 0.8742,
"step": 9
},
{
"epoch": 0.007573247503194964,
"grad_norm": 1.2310497760772705,
"learning_rate": 0.0002,
"loss": 0.7185,
"step": 10
},
{
"epoch": 0.00833057225351446,
"grad_norm": 1.158161997795105,
"learning_rate": 0.00019996203070249516,
"loss": 0.633,
"step": 11
},
{
"epoch": 0.009087897003833957,
"grad_norm": 1.1543669700622559,
"learning_rate": 0.00019984815164333163,
"loss": 0.5801,
"step": 12
},
{
"epoch": 0.009845221754153453,
"grad_norm": 0.9982590079307556,
"learning_rate": 0.000199658449300667,
"loss": 0.5354,
"step": 13
},
{
"epoch": 0.010602546504472949,
"grad_norm": 0.8578788638114929,
"learning_rate": 0.00019939306773179497,
"loss": 0.5006,
"step": 14
},
{
"epoch": 0.011359871254792446,
"grad_norm": 0.5724056363105774,
"learning_rate": 0.00019905220846375032,
"loss": 0.4145,
"step": 15
},
{
"epoch": 0.012117196005111942,
"grad_norm": 0.5412246584892273,
"learning_rate": 0.00019863613034027224,
"loss": 0.4199,
"step": 16
},
{
"epoch": 0.012874520755431438,
"grad_norm": 0.4610505700111389,
"learning_rate": 0.0001981451493252418,
"loss": 0.4001,
"step": 17
},
{
"epoch": 0.013631845505750934,
"grad_norm": 0.47394484281539917,
"learning_rate": 0.00019757963826274357,
"loss": 0.3721,
"step": 18
},
{
"epoch": 0.014389170256070432,
"grad_norm": 0.43060505390167236,
"learning_rate": 0.00019694002659393305,
"loss": 0.3345,
"step": 19
},
{
"epoch": 0.015146495006389928,
"grad_norm": 0.46104079484939575,
"learning_rate": 0.00019622680003092503,
"loss": 0.4648,
"step": 20
},
{
"epoch": 0.015903819756709424,
"grad_norm": 0.38638371229171753,
"learning_rate": 0.00019544050018795075,
"loss": 0.2985,
"step": 21
},
{
"epoch": 0.01666114450702892,
"grad_norm": 0.4296252727508545,
"learning_rate": 0.00019458172417006347,
"loss": 0.2837,
"step": 22
},
{
"epoch": 0.017418469257348416,
"grad_norm": 0.4201738238334656,
"learning_rate": 0.0001936511241197055,
"loss": 0.2629,
"step": 23
},
{
"epoch": 0.018175794007667914,
"grad_norm": 0.46103158593177795,
"learning_rate": 0.00019264940672148018,
"loss": 0.2894,
"step": 24
},
{
"epoch": 0.01893311875798741,
"grad_norm": 0.36362382769584656,
"learning_rate": 0.00019157733266550575,
"loss": 0.2094,
"step": 25
},
{
"epoch": 0.019690443508306905,
"grad_norm": 0.42857611179351807,
"learning_rate": 0.00019043571606975777,
"loss": 0.2518,
"step": 26
},
{
"epoch": 0.020447768258626403,
"grad_norm": 0.7166701555252075,
"learning_rate": 0.0001892254238618394,
"loss": 0.2777,
"step": 27
},
{
"epoch": 0.021205093008945897,
"grad_norm": 0.3727664053440094,
"learning_rate": 0.0001879473751206489,
"loss": 0.2025,
"step": 28
},
{
"epoch": 0.021962417759265395,
"grad_norm": 0.3549087941646576,
"learning_rate": 0.00018660254037844388,
"loss": 0.1908,
"step": 29
},
{
"epoch": 0.022719742509584893,
"grad_norm": 0.33737561106681824,
"learning_rate": 0.00018519194088383273,
"loss": 0.1957,
"step": 30
},
{
"epoch": 0.023477067259904387,
"grad_norm": 0.34675121307373047,
"learning_rate": 0.00018371664782625287,
"loss": 0.1625,
"step": 31
},
{
"epoch": 0.024234392010223885,
"grad_norm": 0.3653319478034973,
"learning_rate": 0.0001821777815225245,
"loss": 0.1832,
"step": 32
},
{
"epoch": 0.02499171676054338,
"grad_norm": 0.31412753462791443,
"learning_rate": 0.00018057651056609784,
"loss": 0.1717,
"step": 33
},
{
"epoch": 0.025749041510862877,
"grad_norm": 0.3502964675426483,
"learning_rate": 0.00017891405093963938,
"loss": 0.159,
"step": 34
},
{
"epoch": 0.026506366261182374,
"grad_norm": 0.3261137008666992,
"learning_rate": 0.0001771916650916321,
"loss": 0.1488,
"step": 35
},
{
"epoch": 0.02726369101150187,
"grad_norm": 0.31498923897743225,
"learning_rate": 0.00017541066097768963,
"loss": 0.156,
"step": 36
},
{
"epoch": 0.028021015761821366,
"grad_norm": 0.3408859670162201,
"learning_rate": 0.00017357239106731317,
"loss": 0.1598,
"step": 37
},
{
"epoch": 0.028778340512140864,
"grad_norm": 0.324367880821228,
"learning_rate": 0.00017167825131684513,
"loss": 0.145,
"step": 38
},
{
"epoch": 0.029535665262460358,
"grad_norm": 0.31265532970428467,
"learning_rate": 0.00016972968010939954,
"loss": 0.1514,
"step": 39
},
{
"epoch": 0.030292990012779856,
"grad_norm": 0.34943684935569763,
"learning_rate": 0.00016772815716257412,
"loss": 0.1219,
"step": 40
},
{
"epoch": 0.03105031476309935,
"grad_norm": 0.3439270853996277,
"learning_rate": 0.00016567520240477344,
"loss": 0.135,
"step": 41
},
{
"epoch": 0.03180763951341885,
"grad_norm": 0.29284727573394775,
"learning_rate": 0.00016357237482099684,
"loss": 0.1091,
"step": 42
},
{
"epoch": 0.032564964263738345,
"grad_norm": 0.3866511881351471,
"learning_rate": 0.0001614212712689668,
"loss": 0.1463,
"step": 43
},
{
"epoch": 0.03332228901405784,
"grad_norm": 0.35192635655403137,
"learning_rate": 0.00015922352526649803,
"loss": 0.1351,
"step": 44
},
{
"epoch": 0.034079613764377334,
"grad_norm": 0.3105197846889496,
"learning_rate": 0.00015698080575102661,
"loss": 0.125,
"step": 45
},
{
"epoch": 0.03483693851469683,
"grad_norm": 0.4554622173309326,
"learning_rate": 0.00015469481581224272,
"loss": 0.1594,
"step": 46
},
{
"epoch": 0.03559426326501633,
"grad_norm": 0.34930619597435,
"learning_rate": 0.00015236729139878782,
"loss": 0.1222,
"step": 47
},
{
"epoch": 0.03635158801533583,
"grad_norm": 0.35271599888801575,
"learning_rate": 0.00015000000000000001,
"loss": 0.116,
"step": 48
},
{
"epoch": 0.037108912765655325,
"grad_norm": 0.29121342301368713,
"learning_rate": 0.00014759473930370736,
"loss": 0.1188,
"step": 49
},
{
"epoch": 0.03786623751597482,
"grad_norm": 0.320047527551651,
"learning_rate": 0.00014515333583108896,
"loss": 0.1343,
"step": 50
},
{
"epoch": 0.03862356226629431,
"grad_norm": 0.38673272728919983,
"learning_rate": 0.00014267764354964038,
"loss": 0.1142,
"step": 51
},
{
"epoch": 0.03938088701661381,
"grad_norm": 0.5377465486526489,
"learning_rate": 0.00014016954246529696,
"loss": 0.3442,
"step": 52
},
{
"epoch": 0.04013821176693331,
"grad_norm": 0.33489689230918884,
"learning_rate": 0.00013763093719478358,
"loss": 0.113,
"step": 53
},
{
"epoch": 0.040895536517252806,
"grad_norm": 0.29426538944244385,
"learning_rate": 0.00013506375551927547,
"loss": 0.092,
"step": 54
},
{
"epoch": 0.041652861267572304,
"grad_norm": 0.2807617485523224,
"learning_rate": 0.00013246994692046836,
"loss": 0.1682,
"step": 55
},
{
"epoch": 0.042410186017891795,
"grad_norm": 0.32574039697647095,
"learning_rate": 0.00012985148110016947,
"loss": 0.1055,
"step": 56
},
{
"epoch": 0.04316751076821129,
"grad_norm": 0.37014421820640564,
"learning_rate": 0.00012721034648453353,
"loss": 0.1322,
"step": 57
},
{
"epoch": 0.04392483551853079,
"grad_norm": 0.2937864065170288,
"learning_rate": 0.00012454854871407994,
"loss": 0.0958,
"step": 58
},
{
"epoch": 0.04468216026885029,
"grad_norm": 0.2736242711544037,
"learning_rate": 0.0001218681091206376,
"loss": 0.0937,
"step": 59
},
{
"epoch": 0.045439485019169785,
"grad_norm": 0.2477613240480423,
"learning_rate": 0.00011917106319237386,
"loss": 0.0855,
"step": 60
},
{
"epoch": 0.046196809769489276,
"grad_norm": 0.34489548206329346,
"learning_rate": 0.00011645945902807341,
"loss": 0.0924,
"step": 61
},
{
"epoch": 0.046954134519808774,
"grad_norm": 0.28233641386032104,
"learning_rate": 0.00011373535578184082,
"loss": 0.0963,
"step": 62
},
{
"epoch": 0.04771145927012827,
"grad_norm": 0.259147584438324,
"learning_rate": 0.00011100082209940795,
"loss": 0.0803,
"step": 63
},
{
"epoch": 0.04846878402044777,
"grad_norm": 0.2993817627429962,
"learning_rate": 0.00010825793454723325,
"loss": 0.1052,
"step": 64
},
{
"epoch": 0.04922610877076727,
"grad_norm": 0.30414098501205444,
"learning_rate": 0.00010550877603558655,
"loss": 0.0841,
"step": 65
},
{
"epoch": 0.04998343352108676,
"grad_norm": 0.39788779616355896,
"learning_rate": 0.00010275543423681621,
"loss": 0.0723,
"step": 66
},
{
"epoch": 0.050740758271406255,
"grad_norm": 0.28072524070739746,
"learning_rate": 0.0001,
"loss": 0.0866,
"step": 67
},
{
"epoch": 0.05149808302172575,
"grad_norm": 0.2475721836090088,
"learning_rate": 9.724456576318381e-05,
"loss": 0.089,
"step": 68
},
{
"epoch": 0.05225540777204525,
"grad_norm": 0.4116728901863098,
"learning_rate": 9.449122396441345e-05,
"loss": 0.3701,
"step": 69
},
{
"epoch": 0.05301273252236475,
"grad_norm": 0.3344607353210449,
"learning_rate": 9.174206545276677e-05,
"loss": 0.1114,
"step": 70
},
{
"epoch": 0.053770057272684246,
"grad_norm": 0.27849143743515015,
"learning_rate": 8.899917790059208e-05,
"loss": 0.0977,
"step": 71
},
{
"epoch": 0.05452738202300374,
"grad_norm": 0.3085162341594696,
"learning_rate": 8.626464421815919e-05,
"loss": 0.0901,
"step": 72
},
{
"epoch": 0.055284706773323235,
"grad_norm": 0.28304585814476013,
"learning_rate": 8.35405409719266e-05,
"loss": 0.0881,
"step": 73
},
{
"epoch": 0.05604203152364273,
"grad_norm": 0.31556403636932373,
"learning_rate": 8.082893680762619e-05,
"loss": 0.0758,
"step": 74
},
{
"epoch": 0.05679935627396223,
"grad_norm": 0.25185614824295044,
"learning_rate": 7.813189087936243e-05,
"loss": 0.0932,
"step": 75
},
{
"epoch": 0.05755668102428173,
"grad_norm": 0.31156933307647705,
"learning_rate": 7.54514512859201e-05,
"loss": 0.0927,
"step": 76
},
{
"epoch": 0.05831400577460122,
"grad_norm": 0.29737532138824463,
"learning_rate": 7.278965351546648e-05,
"loss": 0.0872,
"step": 77
},
{
"epoch": 0.059071330524920716,
"grad_norm": 0.3499886393547058,
"learning_rate": 7.014851889983057e-05,
"loss": 0.0822,
"step": 78
},
{
"epoch": 0.059828655275240214,
"grad_norm": 0.31357136368751526,
"learning_rate": 6.753005307953167e-05,
"loss": 0.0902,
"step": 79
},
{
"epoch": 0.06058598002555971,
"grad_norm": 0.2909204065799713,
"learning_rate": 6.493624448072457e-05,
"loss": 0.0752,
"step": 80
},
{
"epoch": 0.06134330477587921,
"grad_norm": 0.2916364371776581,
"learning_rate": 6.236906280521646e-05,
"loss": 0.0905,
"step": 81
},
{
"epoch": 0.0621006295261987,
"grad_norm": 0.3733891546726227,
"learning_rate": 5.983045753470308e-05,
"loss": 0.0775,
"step": 82
},
{
"epoch": 0.0628579542765182,
"grad_norm": 0.23621852695941925,
"learning_rate": 5.732235645035964e-05,
"loss": 0.075,
"step": 83
},
{
"epoch": 0.0636152790268377,
"grad_norm": 0.2937829792499542,
"learning_rate": 5.484666416891109e-05,
"loss": 0.1098,
"step": 84
},
{
"epoch": 0.06437260377715719,
"grad_norm": 0.24242699146270752,
"learning_rate": 5.240526069629265e-05,
"loss": 0.0769,
"step": 85
},
{
"epoch": 0.06512992852747669,
"grad_norm": 0.47885003685951233,
"learning_rate": 5.000000000000002e-05,
"loss": 0.225,
"step": 86
},
{
"epoch": 0.06588725327779618,
"grad_norm": 0.2480865865945816,
"learning_rate": 4.763270860121222e-05,
"loss": 0.0739,
"step": 87
},
{
"epoch": 0.06664457802811569,
"grad_norm": 0.33025848865509033,
"learning_rate": 4.530518418775733e-05,
"loss": 0.0982,
"step": 88
},
{
"epoch": 0.06740190277843518,
"grad_norm": 0.26735973358154297,
"learning_rate": 4.301919424897338e-05,
"loss": 0.0774,
"step": 89
},
{
"epoch": 0.06815922752875467,
"grad_norm": 0.3508649170398712,
"learning_rate": 4.077647473350201e-05,
"loss": 0.0848,
"step": 90
},
{
"epoch": 0.06891655227907417,
"grad_norm": 0.2726826071739197,
"learning_rate": 3.857872873103322e-05,
"loss": 0.0847,
"step": 91
},
{
"epoch": 0.06967387702939366,
"grad_norm": 0.2948499023914337,
"learning_rate": 3.642762517900322e-05,
"loss": 0.076,
"step": 92
},
{
"epoch": 0.07043120177971317,
"grad_norm": 0.28020283579826355,
"learning_rate": 3.4324797595226565e-05,
"loss": 0.0618,
"step": 93
},
{
"epoch": 0.07118852653003266,
"grad_norm": 0.293235182762146,
"learning_rate": 3.227184283742591e-05,
"loss": 0.0931,
"step": 94
},
{
"epoch": 0.07194585128035215,
"grad_norm": 0.22709383070468903,
"learning_rate": 3.0270319890600462e-05,
"loss": 0.0661,
"step": 95
},
{
"epoch": 0.07270317603067165,
"grad_norm": 0.2769714593887329,
"learning_rate": 2.8321748683154893e-05,
"loss": 0.075,
"step": 96
},
{
"epoch": 0.07346050078099114,
"grad_norm": 0.2676548957824707,
"learning_rate": 2.6427608932686843e-05,
"loss": 0.0885,
"step": 97
},
{
"epoch": 0.07421782553131065,
"grad_norm": 0.2752656936645508,
"learning_rate": 2.4589339022310386e-05,
"loss": 0.0751,
"step": 98
},
{
"epoch": 0.07497515028163014,
"grad_norm": 0.32387983798980713,
"learning_rate": 2.2808334908367914e-05,
"loss": 0.0886,
"step": 99
},
{
"epoch": 0.07573247503194964,
"grad_norm": 0.3434562683105469,
"learning_rate": 2.1085949060360654e-05,
"loss": 0.1105,
"step": 100
},
{
"epoch": 0.07573247503194964,
"eval_loss": 0.07799232006072998,
"eval_runtime": 665.6799,
"eval_samples_per_second": 6.682,
"eval_steps_per_second": 1.67,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 124,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 2,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.025880320459407e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}