{
"best_metric": 1.041056513786316,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 3.0273224043715845,
"eval_steps": 50,
"global_step": 137,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02185792349726776,
"grad_norm": 0.054219260811805725,
"learning_rate": 1.16e-05,
"loss": 1.0221,
"step": 1
},
{
"epoch": 0.02185792349726776,
"eval_loss": 1.3064125776290894,
"eval_runtime": 1.5023,
"eval_samples_per_second": 409.374,
"eval_steps_per_second": 13.313,
"step": 1
},
{
"epoch": 0.04371584699453552,
"grad_norm": 0.07374625653028488,
"learning_rate": 2.32e-05,
"loss": 1.13,
"step": 2
},
{
"epoch": 0.06557377049180328,
"grad_norm": 0.08744122087955475,
"learning_rate": 3.48e-05,
"loss": 1.2471,
"step": 3
},
{
"epoch": 0.08743169398907104,
"grad_norm": 0.1099563017487526,
"learning_rate": 4.64e-05,
"loss": 1.3512,
"step": 4
},
{
"epoch": 0.1092896174863388,
"grad_norm": 0.14091312885284424,
"learning_rate": 5.8e-05,
"loss": 1.382,
"step": 5
},
{
"epoch": 0.13114754098360656,
"grad_norm": 0.19244275987148285,
"learning_rate": 6.96e-05,
"loss": 1.524,
"step": 6
},
{
"epoch": 0.15300546448087432,
"grad_norm": 0.052936580032110214,
"learning_rate": 8.12e-05,
"loss": 1.0329,
"step": 7
},
{
"epoch": 0.17486338797814208,
"grad_norm": 0.06494678556919098,
"learning_rate": 9.28e-05,
"loss": 1.1503,
"step": 8
},
{
"epoch": 0.19672131147540983,
"grad_norm": 0.07551469653844833,
"learning_rate": 0.0001044,
"loss": 1.2085,
"step": 9
},
{
"epoch": 0.2185792349726776,
"grad_norm": 0.08664041757583618,
"learning_rate": 0.000116,
"loss": 1.2444,
"step": 10
},
{
"epoch": 0.24043715846994534,
"grad_norm": 0.10655322670936584,
"learning_rate": 0.00011598225532067881,
"loss": 1.3136,
"step": 11
},
{
"epoch": 0.26229508196721313,
"grad_norm": 0.14484980702400208,
"learning_rate": 0.00011592903214042715,
"loss": 1.3774,
"step": 12
},
{
"epoch": 0.28415300546448086,
"grad_norm": 0.049404121935367584,
"learning_rate": 0.00011584036302573693,
"loss": 0.9998,
"step": 13
},
{
"epoch": 0.30601092896174864,
"grad_norm": 0.05533352494239807,
"learning_rate": 0.0001157163022319532,
"loss": 1.077,
"step": 14
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.06618451327085495,
"learning_rate": 0.00011555692567007598,
"loss": 1.1209,
"step": 15
},
{
"epoch": 0.34972677595628415,
"grad_norm": 0.07199019938707352,
"learning_rate": 0.00011536233086031157,
"loss": 1.2181,
"step": 16
},
{
"epoch": 0.37158469945355194,
"grad_norm": 0.08229127526283264,
"learning_rate": 0.00011513263687240126,
"loss": 1.2544,
"step": 17
},
{
"epoch": 0.39344262295081966,
"grad_norm": 0.10118231177330017,
"learning_rate": 0.00011486798425276428,
"loss": 1.3167,
"step": 18
},
{
"epoch": 0.41530054644808745,
"grad_norm": 0.06382325291633606,
"learning_rate": 0.00011456853493849944,
"loss": 0.9757,
"step": 19
},
{
"epoch": 0.4371584699453552,
"grad_norm": 0.06287430226802826,
"learning_rate": 0.0001142344721582983,
"loss": 1.0141,
"step": 20
},
{
"epoch": 0.45901639344262296,
"grad_norm": 0.061046287417411804,
"learning_rate": 0.00011386600032033012,
"loss": 1.1142,
"step": 21
},
{
"epoch": 0.4808743169398907,
"grad_norm": 0.05975975841283798,
"learning_rate": 0.0001134633448871674,
"loss": 1.172,
"step": 22
},
{
"epoch": 0.5027322404371585,
"grad_norm": 0.06590148061513901,
"learning_rate": 0.00011302675223782873,
"loss": 1.1934,
"step": 23
},
{
"epoch": 0.5245901639344263,
"grad_norm": 0.07652608305215836,
"learning_rate": 0.00011255648951702296,
"loss": 1.2285,
"step": 24
},
{
"epoch": 0.546448087431694,
"grad_norm": 0.11880210041999817,
"learning_rate": 0.0001120528444716872,
"loss": 1.2294,
"step": 25
},
{
"epoch": 0.5683060109289617,
"grad_norm": 0.04327382519841194,
"learning_rate": 0.00011151612527491878,
"loss": 0.9457,
"step": 26
},
{
"epoch": 0.5901639344262295,
"grad_norm": 0.05113707482814789,
"learning_rate": 0.00011094666033740846,
"loss": 1.0301,
"step": 27
},
{
"epoch": 0.6120218579234973,
"grad_norm": 0.04633456468582153,
"learning_rate": 0.00011034479810649071,
"loss": 1.1369,
"step": 28
},
{
"epoch": 0.6338797814207651,
"grad_norm": 0.052176687866449356,
"learning_rate": 0.00010971090685293396,
"loss": 1.1575,
"step": 29
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.05911482125520706,
"learning_rate": 0.00010904537444560093,
"loss": 1.1915,
"step": 30
},
{
"epoch": 0.6775956284153005,
"grad_norm": 0.08560285717248917,
"learning_rate": 0.0001083486081141173,
"loss": 1.1844,
"step": 31
},
{
"epoch": 0.6994535519125683,
"grad_norm": 0.0443929098546505,
"learning_rate": 0.00010762103419969393,
"loss": 0.9784,
"step": 32
},
{
"epoch": 0.7213114754098361,
"grad_norm": 0.04982827231287956,
"learning_rate": 0.00010686309789425474,
"loss": 1.0368,
"step": 33
},
{
"epoch": 0.7431693989071039,
"grad_norm": 0.04613876715302467,
"learning_rate": 0.00010607526296803026,
"loss": 1.0534,
"step": 34
},
{
"epoch": 0.7650273224043715,
"grad_norm": 0.04624936357140541,
"learning_rate": 0.00010525801148578341,
"loss": 1.1136,
"step": 35
},
{
"epoch": 0.7868852459016393,
"grad_norm": 0.050727903842926025,
"learning_rate": 0.000104411843511841,
"loss": 1.1563,
"step": 36
},
{
"epoch": 0.8087431693989071,
"grad_norm": 0.07218360155820847,
"learning_rate": 0.00010353727680411158,
"loss": 1.148,
"step": 37
},
{
"epoch": 0.8306010928961749,
"grad_norm": 0.04049117863178253,
"learning_rate": 0.00010263484649727705,
"loss": 0.9096,
"step": 38
},
{
"epoch": 0.8524590163934426,
"grad_norm": 0.0455789640545845,
"learning_rate": 0.00010170510477535133,
"loss": 1.0006,
"step": 39
},
{
"epoch": 0.8743169398907104,
"grad_norm": 0.039463143795728683,
"learning_rate": 0.00010074862053380711,
"loss": 1.0411,
"step": 40
},
{
"epoch": 0.8961748633879781,
"grad_norm": 0.042614974081516266,
"learning_rate": 9.976597903147682e-05,
"loss": 1.1396,
"step": 41
},
{
"epoch": 0.9180327868852459,
"grad_norm": 0.04930881783366203,
"learning_rate": 9.875778153244143e-05,
"loss": 1.1744,
"step": 42
},
{
"epoch": 0.9398907103825137,
"grad_norm": 0.06974472105503082,
"learning_rate": 9.772464493812549e-05,
"loss": 1.15,
"step": 43
},
{
"epoch": 0.9617486338797814,
"grad_norm": 0.04092060774564743,
"learning_rate": 9.66672014098242e-05,
"loss": 0.9676,
"step": 44
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.0392816998064518,
"learning_rate": 9.558609798189311e-05,
"loss": 1.0893,
"step": 45
},
{
"epoch": 1.0163934426229508,
"grad_norm": 0.08897832781076431,
"learning_rate": 9.448199616583707e-05,
"loss": 1.8898,
"step": 46
},
{
"epoch": 1.0382513661202186,
"grad_norm": 0.03982605040073395,
"learning_rate": 9.335557154554105e-05,
"loss": 0.9943,
"step": 47
},
{
"epoch": 1.0601092896174864,
"grad_norm": 0.03858646750450134,
"learning_rate": 9.220751336389013e-05,
"loss": 1.0459,
"step": 48
},
{
"epoch": 1.0819672131147542,
"grad_norm": 0.040587618947029114,
"learning_rate": 9.10385241010317e-05,
"loss": 1.1494,
"step": 49
},
{
"epoch": 1.1038251366120218,
"grad_norm": 0.052482884377241135,
"learning_rate": 8.984931904453821e-05,
"loss": 1.1475,
"step": 50
},
{
"epoch": 1.1038251366120218,
"eval_loss": 1.0756638050079346,
"eval_runtime": 1.9721,
"eval_samples_per_second": 311.855,
"eval_steps_per_second": 10.142,
"step": 50
},
{
"epoch": 1.1256830601092895,
"grad_norm": 0.07459885627031326,
"learning_rate": 8.864062585173286e-05,
"loss": 1.1567,
"step": 51
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.03736037015914917,
"learning_rate": 8.741318410444684e-05,
"loss": 0.9095,
"step": 52
},
{
"epoch": 1.169398907103825,
"grad_norm": 0.04274572804570198,
"learning_rate": 8.616774485647986e-05,
"loss": 1.0274,
"step": 53
},
{
"epoch": 1.1912568306010929,
"grad_norm": 0.03897716477513313,
"learning_rate": 8.49050701740412e-05,
"loss": 1.041,
"step": 54
},
{
"epoch": 1.2131147540983607,
"grad_norm": 0.042398180812597275,
"learning_rate": 8.362593266945242e-05,
"loss": 1.0944,
"step": 55
},
{
"epoch": 1.2349726775956285,
"grad_norm": 0.04643435403704643,
"learning_rate": 8.233111502839728e-05,
"loss": 1.1559,
"step": 56
},
{
"epoch": 1.2568306010928962,
"grad_norm": 0.06617248058319092,
"learning_rate": 8.102140953100746e-05,
"loss": 1.1503,
"step": 57
},
{
"epoch": 1.278688524590164,
"grad_norm": 0.03723934665322304,
"learning_rate": 7.969761756707802e-05,
"loss": 0.7836,
"step": 58
},
{
"epoch": 1.3005464480874318,
"grad_norm": 0.04756947606801987,
"learning_rate": 7.83605491457085e-05,
"loss": 1.0577,
"step": 59
},
{
"epoch": 1.3224043715846996,
"grad_norm": 0.04186735302209854,
"learning_rate": 7.701102239967025e-05,
"loss": 0.9977,
"step": 60
},
{
"epoch": 1.3442622950819672,
"grad_norm": 0.04028384014964104,
"learning_rate": 7.564986308480269e-05,
"loss": 1.0792,
"step": 61
},
{
"epoch": 1.366120218579235,
"grad_norm": 0.04677554965019226,
"learning_rate": 7.42779040747454e-05,
"loss": 1.1321,
"step": 62
},
{
"epoch": 1.3879781420765027,
"grad_norm": 0.06085206940770149,
"learning_rate": 7.289598485131474e-05,
"loss": 1.1295,
"step": 63
},
{
"epoch": 1.4098360655737705,
"grad_norm": 0.039397455751895905,
"learning_rate": 7.15049509908372e-05,
"loss": 0.429,
"step": 64
},
{
"epoch": 1.4316939890710383,
"grad_norm": 0.06355661898851395,
"learning_rate": 7.010565364675344e-05,
"loss": 1.4871,
"step": 65
},
{
"epoch": 1.453551912568306,
"grad_norm": 0.037176258862018585,
"learning_rate": 6.869894902880984e-05,
"loss": 0.969,
"step": 66
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.038282133638858795,
"learning_rate": 6.728569787915627e-05,
"loss": 1.072,
"step": 67
},
{
"epoch": 1.4972677595628414,
"grad_norm": 0.044967859983444214,
"learning_rate": 6.586676494567028e-05,
"loss": 1.0984,
"step": 68
},
{
"epoch": 1.5191256830601092,
"grad_norm": 0.05643809214234352,
"learning_rate": 6.444301845283067e-05,
"loss": 1.1205,
"step": 69
},
{
"epoch": 1.540983606557377,
"grad_norm": 0.09565304219722748,
"learning_rate": 6.301532957046325e-05,
"loss": 1.1622,
"step": 70
},
{
"epoch": 1.5628415300546448,
"grad_norm": 0.04223218932747841,
"learning_rate": 6.15845718806849e-05,
"loss": 0.9231,
"step": 71
},
{
"epoch": 1.5846994535519126,
"grad_norm": 0.040753450244665146,
"learning_rate": 6.01516208433711e-05,
"loss": 0.9777,
"step": 72
},
{
"epoch": 1.6065573770491803,
"grad_norm": 0.03819667920470238,
"learning_rate": 5.871735326047505e-05,
"loss": 1.0239,
"step": 73
},
{
"epoch": 1.6284153005464481,
"grad_norm": 0.04432765766978264,
"learning_rate": 5.728264673952495e-05,
"loss": 1.0698,
"step": 74
},
{
"epoch": 1.650273224043716,
"grad_norm": 0.05093759670853615,
"learning_rate": 5.58483791566289e-05,
"loss": 1.0708,
"step": 75
},
{
"epoch": 1.6721311475409837,
"grad_norm": 0.07523038983345032,
"learning_rate": 5.441542811931513e-05,
"loss": 1.118,
"step": 76
},
{
"epoch": 1.6939890710382515,
"grad_norm": 0.03937802463769913,
"learning_rate": 5.298467042953676e-05,
"loss": 0.895,
"step": 77
},
{
"epoch": 1.7158469945355193,
"grad_norm": 0.046086255460977554,
"learning_rate": 5.1556981547169334e-05,
"loss": 1.0295,
"step": 78
},
{
"epoch": 1.737704918032787,
"grad_norm": 0.044464047998189926,
"learning_rate": 5.013323505432971e-05,
"loss": 1.0139,
"step": 79
},
{
"epoch": 1.7595628415300546,
"grad_norm": 0.049823347479104996,
"learning_rate": 4.871430212084374e-05,
"loss": 1.0398,
"step": 80
},
{
"epoch": 1.7814207650273224,
"grad_norm": 0.05059857666492462,
"learning_rate": 4.730105097119016e-05,
"loss": 1.1453,
"step": 81
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.0664261057972908,
"learning_rate": 4.5894346353246564e-05,
"loss": 1.0989,
"step": 82
},
{
"epoch": 1.825136612021858,
"grad_norm": 0.035497602075338364,
"learning_rate": 4.44950490091628e-05,
"loss": 0.7209,
"step": 83
},
{
"epoch": 1.8469945355191257,
"grad_norm": 0.048219550400972366,
"learning_rate": 4.310401514868527e-05,
"loss": 1.1382,
"step": 84
},
{
"epoch": 1.8688524590163933,
"grad_norm": 0.04112359508872032,
"learning_rate": 4.1722095925254615e-05,
"loss": 0.9578,
"step": 85
},
{
"epoch": 1.890710382513661,
"grad_norm": 0.04152638092637062,
"learning_rate": 4.0350136915197304e-05,
"loss": 1.042,
"step": 86
},
{
"epoch": 1.9125683060109289,
"grad_norm": 0.044837482273578644,
"learning_rate": 3.898897760032974e-05,
"loss": 1.0759,
"step": 87
},
{
"epoch": 1.9344262295081966,
"grad_norm": 0.05834497883915901,
"learning_rate": 3.76394508542915e-05,
"loss": 1.0805,
"step": 88
},
{
"epoch": 1.9562841530054644,
"grad_norm": 0.045810725539922714,
"learning_rate": 3.6302382432922e-05,
"loss": 0.5017,
"step": 89
},
{
"epoch": 1.9781420765027322,
"grad_norm": 0.056167762726545334,
"learning_rate": 3.497859046899255e-05,
"loss": 1.4732,
"step": 90
},
{
"epoch": 2.010928961748634,
"grad_norm": 0.10958977788686752,
"learning_rate": 3.366888497160273e-05,
"loss": 1.8592,
"step": 91
},
{
"epoch": 2.0327868852459017,
"grad_norm": 0.03569335490465164,
"learning_rate": 3.2374067330547576e-05,
"loss": 0.8658,
"step": 92
},
{
"epoch": 2.0546448087431695,
"grad_norm": 0.03896716982126236,
"learning_rate": 3.109492982595882e-05,
"loss": 0.9701,
"step": 93
},
{
"epoch": 2.0765027322404372,
"grad_norm": 0.0439588725566864,
"learning_rate": 2.9832255143520147e-05,
"loss": 1.0359,
"step": 94
},
{
"epoch": 2.098360655737705,
"grad_norm": 0.05064794421195984,
"learning_rate": 2.8586815895553156e-05,
"loss": 1.1003,
"step": 95
},
{
"epoch": 2.120218579234973,
"grad_norm": 0.07673317193984985,
"learning_rate": 2.735937414826714e-05,
"loss": 1.0672,
"step": 96
},
{
"epoch": 2.1420765027322406,
"grad_norm": 0.055169906467199326,
"learning_rate": 2.6150680955461813e-05,
"loss": 0.83,
"step": 97
},
{
"epoch": 2.1639344262295084,
"grad_norm": 0.04568566754460335,
"learning_rate": 2.4961475898968298e-05,
"loss": 1.0704,
"step": 98
},
{
"epoch": 2.185792349726776,
"grad_norm": 0.0388328842818737,
"learning_rate": 2.3792486636109876e-05,
"loss": 0.9818,
"step": 99
},
{
"epoch": 2.2076502732240435,
"grad_norm": 0.0437370240688324,
"learning_rate": 2.2644428454458946e-05,
"loss": 1.0655,
"step": 100
},
{
"epoch": 2.2076502732240435,
"eval_loss": 1.041056513786316,
"eval_runtime": 1.9941,
"eval_samples_per_second": 308.415,
"eval_steps_per_second": 10.03,
"step": 100
},
{
"epoch": 2.2295081967213113,
"grad_norm": 0.04856366664171219,
"learning_rate": 2.1518003834162954e-05,
"loss": 1.0995,
"step": 101
},
{
"epoch": 2.251366120218579,
"grad_norm": 0.0619901567697525,
"learning_rate": 2.0413902018106895e-05,
"loss": 1.0795,
"step": 102
},
{
"epoch": 2.273224043715847,
"grad_norm": 0.053943440318107605,
"learning_rate": 1.9332798590175797e-05,
"loss": 0.5584,
"step": 103
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.06046655401587486,
"learning_rate": 1.8275355061874515e-05,
"loss": 1.3933,
"step": 104
},
{
"epoch": 2.3169398907103824,
"grad_norm": 0.04138614237308502,
"learning_rate": 1.724221846755858e-05,
"loss": 0.9474,
"step": 105
},
{
"epoch": 2.33879781420765,
"grad_norm": 0.0419883206486702,
"learning_rate": 1.623402096852318e-05,
"loss": 1.0178,
"step": 106
},
{
"epoch": 2.360655737704918,
"grad_norm": 0.04966486990451813,
"learning_rate": 1.5251379466192902e-05,
"loss": 1.1369,
"step": 107
},
{
"epoch": 2.3825136612021858,
"grad_norm": 0.05595370754599571,
"learning_rate": 1.4294895224648664e-05,
"loss": 1.1341,
"step": 108
},
{
"epoch": 2.4043715846994536,
"grad_norm": 0.05111997202038765,
"learning_rate": 1.3365153502722967e-05,
"loss": 0.6285,
"step": 109
},
{
"epoch": 2.4262295081967213,
"grad_norm": 0.052137341350317,
"learning_rate": 1.2462723195888415e-05,
"loss": 1.2566,
"step": 110
},
{
"epoch": 2.448087431693989,
"grad_norm": 0.03865412250161171,
"learning_rate": 1.1588156488159008e-05,
"loss": 0.9759,
"step": 111
},
{
"epoch": 2.469945355191257,
"grad_norm": 0.03843948617577553,
"learning_rate": 1.074198851421659e-05,
"loss": 1.0035,
"step": 112
},
{
"epoch": 2.4918032786885247,
"grad_norm": 0.04497023671865463,
"learning_rate": 9.924737031969744e-06,
"loss": 1.0914,
"step": 113
},
{
"epoch": 2.5136612021857925,
"grad_norm": 0.05429847911000252,
"learning_rate": 9.136902105745273e-06,
"loss": 1.1209,
"step": 114
},
{
"epoch": 2.5355191256830603,
"grad_norm": 0.08493578433990479,
"learning_rate": 8.378965800306078e-06,
"loss": 1.1134,
"step": 115
},
{
"epoch": 2.557377049180328,
"grad_norm": 0.06231605261564255,
"learning_rate": 7.651391885882701e-06,
"loss": 0.965,
"step": 116
},
{
"epoch": 2.579234972677596,
"grad_norm": 0.03692341595888138,
"learning_rate": 6.954625554399086e-06,
"loss": 0.8894,
"step": 117
},
{
"epoch": 2.6010928961748636,
"grad_norm": 0.04275006055831909,
"learning_rate": 6.289093147066023e-06,
"loss": 1.0013,
"step": 118
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.04334869980812073,
"learning_rate": 5.655201893509272e-06,
"loss": 1.0516,
"step": 119
},
{
"epoch": 2.644808743169399,
"grad_norm": 0.052981842309236526,
"learning_rate": 5.053339662591549e-06,
"loss": 1.0457,
"step": 120
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.0736251100897789,
"learning_rate": 4.483874725081219e-06,
"loss": 1.1736,
"step": 121
},
{
"epoch": 2.6885245901639343,
"grad_norm": 0.057101909071207047,
"learning_rate": 3.9471555283128005e-06,
"loss": 0.8181,
"step": 122
},
{
"epoch": 2.710382513661202,
"grad_norm": 0.04115651920437813,
"learning_rate": 3.4435104829770587e-06,
"loss": 1.0691,
"step": 123
},
{
"epoch": 2.73224043715847,
"grad_norm": 0.038360998034477234,
"learning_rate": 2.9732477621712853e-06,
"loss": 0.986,
"step": 124
},
{
"epoch": 2.7540983606557377,
"grad_norm": 0.0409964919090271,
"learning_rate": 2.53665511283261e-06,
"loss": 1.0381,
"step": 125
},
{
"epoch": 2.7759562841530054,
"grad_norm": 0.04703905060887337,
"learning_rate": 2.1339996796698887e-06,
"loss": 1.0692,
"step": 126
},
{
"epoch": 2.797814207650273,
"grad_norm": 0.06132422015070915,
"learning_rate": 1.7655278417016956e-06,
"loss": 1.117,
"step": 127
},
{
"epoch": 2.819672131147541,
"grad_norm": 0.05300451070070267,
"learning_rate": 1.4314650615005687e-06,
"loss": 0.543,
"step": 128
},
{
"epoch": 2.841530054644809,
"grad_norm": 0.055577926337718964,
"learning_rate": 1.1320157472357307e-06,
"loss": 1.3244,
"step": 129
},
{
"epoch": 2.8633879781420766,
"grad_norm": 0.04094787687063217,
"learning_rate": 8.673631275987297e-07,
"loss": 0.9801,
"step": 130
},
{
"epoch": 2.8852459016393444,
"grad_norm": 0.0408557653427124,
"learning_rate": 6.376691396884168e-07,
"loss": 1.0152,
"step": 131
},
{
"epoch": 2.907103825136612,
"grad_norm": 0.047284748405218124,
"learning_rate": 4.430743299240307e-07,
"loss": 1.0816,
"step": 132
},
{
"epoch": 2.92896174863388,
"grad_norm": 0.055055923759937286,
"learning_rate": 2.836977680468222e-07,
"loss": 1.0597,
"step": 133
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.05081977695226669,
"learning_rate": 1.5963697426306723e-07,
"loss": 0.6892,
"step": 134
},
{
"epoch": 2.972677595628415,
"grad_norm": 0.055216483771800995,
"learning_rate": 7.096785957284602e-08,
"loss": 1.341,
"step": 135
},
{
"epoch": 3.0054644808743167,
"grad_norm": 0.10709080845117569,
"learning_rate": 1.774467932117818e-08,
"loss": 1.761,
"step": 136
},
{
"epoch": 3.0273224043715845,
"grad_norm": 0.03700735419988632,
"learning_rate": 0.0,
"loss": 0.8896,
"step": 137
}
],
"logging_steps": 1,
"max_steps": 137,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.74466163399721e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}