{
"best_metric": 0.28237149119377136,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.04105933073290905,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00020529665366454526,
"grad_norm": 2.6672608852386475,
"learning_rate": 1.004e-05,
"loss": 0.7063,
"step": 1
},
{
"epoch": 0.00020529665366454526,
"eval_loss": 1.4657493829727173,
"eval_runtime": 283.5709,
"eval_samples_per_second": 7.233,
"eval_steps_per_second": 1.809,
"step": 1
},
{
"epoch": 0.0004105933073290905,
"grad_norm": 2.4921810626983643,
"learning_rate": 2.008e-05,
"loss": 0.7398,
"step": 2
},
{
"epoch": 0.0006158899609936358,
"grad_norm": 2.315711498260498,
"learning_rate": 3.012e-05,
"loss": 0.6311,
"step": 3
},
{
"epoch": 0.000821186614658181,
"grad_norm": 2.6371049880981445,
"learning_rate": 4.016e-05,
"loss": 0.7873,
"step": 4
},
{
"epoch": 0.0010264832683227264,
"grad_norm": 2.293344259262085,
"learning_rate": 5.02e-05,
"loss": 0.7672,
"step": 5
},
{
"epoch": 0.0012317799219872716,
"grad_norm": 2.3576221466064453,
"learning_rate": 6.024e-05,
"loss": 0.8368,
"step": 6
},
{
"epoch": 0.0014370765756518168,
"grad_norm": 1.5463850498199463,
"learning_rate": 7.028e-05,
"loss": 0.6442,
"step": 7
},
{
"epoch": 0.001642373229316362,
"grad_norm": 1.3786958456039429,
"learning_rate": 8.032e-05,
"loss": 0.5664,
"step": 8
},
{
"epoch": 0.0018476698829809075,
"grad_norm": 1.4701026678085327,
"learning_rate": 9.036000000000001e-05,
"loss": 0.5766,
"step": 9
},
{
"epoch": 0.0020529665366454527,
"grad_norm": 1.3166791200637817,
"learning_rate": 0.0001004,
"loss": 0.4609,
"step": 10
},
{
"epoch": 0.0022582631903099977,
"grad_norm": 1.1514837741851807,
"learning_rate": 9.987157894736842e-05,
"loss": 0.3202,
"step": 11
},
{
"epoch": 0.002463559843974543,
"grad_norm": 1.2850422859191895,
"learning_rate": 9.934315789473684e-05,
"loss": 0.4206,
"step": 12
},
{
"epoch": 0.0026688564976390886,
"grad_norm": 1.5056142807006836,
"learning_rate": 9.881473684210525e-05,
"loss": 0.3428,
"step": 13
},
{
"epoch": 0.0028741531513036336,
"grad_norm": 1.1962010860443115,
"learning_rate": 9.828631578947369e-05,
"loss": 0.3687,
"step": 14
},
{
"epoch": 0.003079449804968179,
"grad_norm": 1.5500695705413818,
"learning_rate": 9.77578947368421e-05,
"loss": 0.381,
"step": 15
},
{
"epoch": 0.003284746458632724,
"grad_norm": 1.4160789251327515,
"learning_rate": 9.722947368421052e-05,
"loss": 0.3512,
"step": 16
},
{
"epoch": 0.0034900431122972695,
"grad_norm": 1.4462215900421143,
"learning_rate": 9.670105263157895e-05,
"loss": 0.353,
"step": 17
},
{
"epoch": 0.003695339765961815,
"grad_norm": 1.0240310430526733,
"learning_rate": 9.617263157894737e-05,
"loss": 0.322,
"step": 18
},
{
"epoch": 0.00390063641962636,
"grad_norm": 1.1923655271530151,
"learning_rate": 9.564421052631579e-05,
"loss": 0.3566,
"step": 19
},
{
"epoch": 0.0041059330732909054,
"grad_norm": 1.6432424783706665,
"learning_rate": 9.511578947368421e-05,
"loss": 0.3615,
"step": 20
},
{
"epoch": 0.0043112297269554505,
"grad_norm": 1.3957858085632324,
"learning_rate": 9.458736842105264e-05,
"loss": 0.3816,
"step": 21
},
{
"epoch": 0.0045165263806199955,
"grad_norm": 1.0589792728424072,
"learning_rate": 9.405894736842106e-05,
"loss": 0.3055,
"step": 22
},
{
"epoch": 0.004721823034284541,
"grad_norm": 1.3135740756988525,
"learning_rate": 9.353052631578947e-05,
"loss": 0.3828,
"step": 23
},
{
"epoch": 0.004927119687949086,
"grad_norm": 0.984417736530304,
"learning_rate": 9.300210526315789e-05,
"loss": 0.3207,
"step": 24
},
{
"epoch": 0.005132416341613631,
"grad_norm": 1.0034253597259521,
"learning_rate": 9.247368421052631e-05,
"loss": 0.2155,
"step": 25
},
{
"epoch": 0.005337712995278177,
"grad_norm": 0.9280335903167725,
"learning_rate": 9.194526315789473e-05,
"loss": 0.288,
"step": 26
},
{
"epoch": 0.005543009648942722,
"grad_norm": 1.225964903831482,
"learning_rate": 9.141684210526316e-05,
"loss": 0.3412,
"step": 27
},
{
"epoch": 0.005748306302607267,
"grad_norm": 1.1848243474960327,
"learning_rate": 9.088842105263158e-05,
"loss": 0.3534,
"step": 28
},
{
"epoch": 0.005953602956271813,
"grad_norm": 0.9669045209884644,
"learning_rate": 9.036000000000001e-05,
"loss": 0.2628,
"step": 29
},
{
"epoch": 0.006158899609936358,
"grad_norm": 1.0077804327011108,
"learning_rate": 8.983157894736843e-05,
"loss": 0.3054,
"step": 30
},
{
"epoch": 0.006364196263600903,
"grad_norm": 1.309670090675354,
"learning_rate": 8.930315789473684e-05,
"loss": 0.3558,
"step": 31
},
{
"epoch": 0.006569492917265448,
"grad_norm": 1.3912608623504639,
"learning_rate": 8.877473684210526e-05,
"loss": 0.3219,
"step": 32
},
{
"epoch": 0.006774789570929994,
"grad_norm": 2.0321879386901855,
"learning_rate": 8.824631578947368e-05,
"loss": 0.3565,
"step": 33
},
{
"epoch": 0.006980086224594539,
"grad_norm": 1.429619312286377,
"learning_rate": 8.771789473684211e-05,
"loss": 0.4488,
"step": 34
},
{
"epoch": 0.007185382878259084,
"grad_norm": 1.5244200229644775,
"learning_rate": 8.718947368421053e-05,
"loss": 0.3932,
"step": 35
},
{
"epoch": 0.00739067953192363,
"grad_norm": 1.9189207553863525,
"learning_rate": 8.666105263157895e-05,
"loss": 0.325,
"step": 36
},
{
"epoch": 0.007595976185588175,
"grad_norm": 1.4107446670532227,
"learning_rate": 8.613263157894737e-05,
"loss": 0.2386,
"step": 37
},
{
"epoch": 0.00780127283925272,
"grad_norm": 1.8684407472610474,
"learning_rate": 8.560421052631578e-05,
"loss": 0.5354,
"step": 38
},
{
"epoch": 0.008006569492917266,
"grad_norm": 1.7079691886901855,
"learning_rate": 8.50757894736842e-05,
"loss": 0.3189,
"step": 39
},
{
"epoch": 0.008211866146581811,
"grad_norm": 1.4461852312088013,
"learning_rate": 8.454736842105263e-05,
"loss": 0.322,
"step": 40
},
{
"epoch": 0.008417162800246356,
"grad_norm": 1.3741295337677002,
"learning_rate": 8.401894736842106e-05,
"loss": 0.3541,
"step": 41
},
{
"epoch": 0.008622459453910901,
"grad_norm": 1.7616922855377197,
"learning_rate": 8.349052631578948e-05,
"loss": 0.3264,
"step": 42
},
{
"epoch": 0.008827756107575446,
"grad_norm": 1.8308717012405396,
"learning_rate": 8.29621052631579e-05,
"loss": 0.306,
"step": 43
},
{
"epoch": 0.009033052761239991,
"grad_norm": 1.9519275426864624,
"learning_rate": 8.243368421052632e-05,
"loss": 0.3909,
"step": 44
},
{
"epoch": 0.009238349414904538,
"grad_norm": 1.9473401308059692,
"learning_rate": 8.190526315789474e-05,
"loss": 0.4332,
"step": 45
},
{
"epoch": 0.009443646068569083,
"grad_norm": 1.8211240768432617,
"learning_rate": 8.137684210526315e-05,
"loss": 0.4124,
"step": 46
},
{
"epoch": 0.009648942722233628,
"grad_norm": 1.6044187545776367,
"learning_rate": 8.084842105263157e-05,
"loss": 0.2797,
"step": 47
},
{
"epoch": 0.009854239375898173,
"grad_norm": 1.8763623237609863,
"learning_rate": 8.032e-05,
"loss": 0.3744,
"step": 48
},
{
"epoch": 0.010059536029562718,
"grad_norm": 2.20684814453125,
"learning_rate": 7.979157894736842e-05,
"loss": 0.5014,
"step": 49
},
{
"epoch": 0.010264832683227263,
"grad_norm": 2.2577061653137207,
"learning_rate": 7.926315789473684e-05,
"loss": 0.4693,
"step": 50
},
{
"epoch": 0.010264832683227263,
"eval_loss": 0.3905164897441864,
"eval_runtime": 283.6848,
"eval_samples_per_second": 7.23,
"eval_steps_per_second": 1.808,
"step": 50
},
{
"epoch": 0.01047012933689181,
"grad_norm": 1.0799939632415771,
"learning_rate": 7.873473684210526e-05,
"loss": 0.3619,
"step": 51
},
{
"epoch": 0.010675425990556354,
"grad_norm": 1.2252871990203857,
"learning_rate": 7.820631578947369e-05,
"loss": 0.4295,
"step": 52
},
{
"epoch": 0.0108807226442209,
"grad_norm": 0.8576818704605103,
"learning_rate": 7.76778947368421e-05,
"loss": 0.3482,
"step": 53
},
{
"epoch": 0.011086019297885445,
"grad_norm": 0.5568431615829468,
"learning_rate": 7.714947368421052e-05,
"loss": 0.286,
"step": 54
},
{
"epoch": 0.01129131595154999,
"grad_norm": 0.7329469323158264,
"learning_rate": 7.662105263157896e-05,
"loss": 0.2958,
"step": 55
},
{
"epoch": 0.011496612605214535,
"grad_norm": 0.7412177920341492,
"learning_rate": 7.609263157894737e-05,
"loss": 0.38,
"step": 56
},
{
"epoch": 0.01170190925887908,
"grad_norm": 0.569349467754364,
"learning_rate": 7.556421052631579e-05,
"loss": 0.2482,
"step": 57
},
{
"epoch": 0.011907205912543626,
"grad_norm": 0.7977584600448608,
"learning_rate": 7.503578947368421e-05,
"loss": 0.3199,
"step": 58
},
{
"epoch": 0.012112502566208171,
"grad_norm": 0.6417466998100281,
"learning_rate": 7.450736842105263e-05,
"loss": 0.2742,
"step": 59
},
{
"epoch": 0.012317799219872716,
"grad_norm": 0.7626402378082275,
"learning_rate": 7.397894736842105e-05,
"loss": 0.3268,
"step": 60
},
{
"epoch": 0.012523095873537261,
"grad_norm": 0.6782163381576538,
"learning_rate": 7.345052631578948e-05,
"loss": 0.3317,
"step": 61
},
{
"epoch": 0.012728392527201806,
"grad_norm": 0.7630534172058105,
"learning_rate": 7.29221052631579e-05,
"loss": 0.3016,
"step": 62
},
{
"epoch": 0.012933689180866351,
"grad_norm": 0.7664050459861755,
"learning_rate": 7.239368421052631e-05,
"loss": 0.379,
"step": 63
},
{
"epoch": 0.013138985834530896,
"grad_norm": 0.720039427280426,
"learning_rate": 7.186526315789474e-05,
"loss": 0.2917,
"step": 64
},
{
"epoch": 0.013344282488195443,
"grad_norm": 0.681718111038208,
"learning_rate": 7.133684210526316e-05,
"loss": 0.3043,
"step": 65
},
{
"epoch": 0.013549579141859988,
"grad_norm": 0.7472444772720337,
"learning_rate": 7.080842105263158e-05,
"loss": 0.3209,
"step": 66
},
{
"epoch": 0.013754875795524533,
"grad_norm": 0.821285605430603,
"learning_rate": 7.028e-05,
"loss": 0.3087,
"step": 67
},
{
"epoch": 0.013960172449189078,
"grad_norm": 0.7269453406333923,
"learning_rate": 6.975157894736843e-05,
"loss": 0.2669,
"step": 68
},
{
"epoch": 0.014165469102853623,
"grad_norm": 0.8093358278274536,
"learning_rate": 6.922315789473685e-05,
"loss": 0.3617,
"step": 69
},
{
"epoch": 0.014370765756518168,
"grad_norm": 0.672995924949646,
"learning_rate": 6.869473684210527e-05,
"loss": 0.2406,
"step": 70
},
{
"epoch": 0.014576062410182713,
"grad_norm": 0.7517425417900085,
"learning_rate": 6.816631578947368e-05,
"loss": 0.3277,
"step": 71
},
{
"epoch": 0.01478135906384726,
"grad_norm": 0.8492874503135681,
"learning_rate": 6.76378947368421e-05,
"loss": 0.3203,
"step": 72
},
{
"epoch": 0.014986655717511805,
"grad_norm": 0.7951849699020386,
"learning_rate": 6.710947368421052e-05,
"loss": 0.2832,
"step": 73
},
{
"epoch": 0.01519195237117635,
"grad_norm": 0.8031511902809143,
"learning_rate": 6.658105263157894e-05,
"loss": 0.3054,
"step": 74
},
{
"epoch": 0.015397249024840895,
"grad_norm": 0.9755042195320129,
"learning_rate": 6.605263157894737e-05,
"loss": 0.3527,
"step": 75
},
{
"epoch": 0.01560254567850544,
"grad_norm": 0.7796215415000916,
"learning_rate": 6.55242105263158e-05,
"loss": 0.2549,
"step": 76
},
{
"epoch": 0.015807842332169985,
"grad_norm": 0.8745598793029785,
"learning_rate": 6.499578947368422e-05,
"loss": 0.3623,
"step": 77
},
{
"epoch": 0.016013138985834532,
"grad_norm": 0.8187890648841858,
"learning_rate": 6.446736842105264e-05,
"loss": 0.2409,
"step": 78
},
{
"epoch": 0.016218435639499075,
"grad_norm": 0.7740800380706787,
"learning_rate": 6.393894736842105e-05,
"loss": 0.2695,
"step": 79
},
{
"epoch": 0.016423732293163622,
"grad_norm": 0.7411985397338867,
"learning_rate": 6.341052631578947e-05,
"loss": 0.2926,
"step": 80
},
{
"epoch": 0.016629028946828165,
"grad_norm": 0.6864801645278931,
"learning_rate": 6.288210526315789e-05,
"loss": 0.2298,
"step": 81
},
{
"epoch": 0.016834325600492712,
"grad_norm": 1.017356276512146,
"learning_rate": 6.235368421052632e-05,
"loss": 0.3427,
"step": 82
},
{
"epoch": 0.01703962225415726,
"grad_norm": 1.3836476802825928,
"learning_rate": 6.182526315789474e-05,
"loss": 0.2717,
"step": 83
},
{
"epoch": 0.017244918907821802,
"grad_norm": 0.8246234655380249,
"learning_rate": 6.129684210526316e-05,
"loss": 0.236,
"step": 84
},
{
"epoch": 0.01745021556148635,
"grad_norm": 0.8777735829353333,
"learning_rate": 6.076842105263158e-05,
"loss": 0.2285,
"step": 85
},
{
"epoch": 0.017655512215150892,
"grad_norm": 1.1864032745361328,
"learning_rate": 6.024e-05,
"loss": 0.3507,
"step": 86
},
{
"epoch": 0.01786080886881544,
"grad_norm": 1.2794263362884521,
"learning_rate": 5.971157894736842e-05,
"loss": 0.3693,
"step": 87
},
{
"epoch": 0.018066105522479982,
"grad_norm": 1.1183644533157349,
"learning_rate": 5.9183157894736835e-05,
"loss": 0.3053,
"step": 88
},
{
"epoch": 0.01827140217614453,
"grad_norm": 1.1237081289291382,
"learning_rate": 5.8654736842105267e-05,
"loss": 0.2993,
"step": 89
},
{
"epoch": 0.018476698829809075,
"grad_norm": 1.0422002077102661,
"learning_rate": 5.8126315789473684e-05,
"loss": 0.3116,
"step": 90
},
{
"epoch": 0.01868199548347362,
"grad_norm": 1.0887373685836792,
"learning_rate": 5.759789473684211e-05,
"loss": 0.2462,
"step": 91
},
{
"epoch": 0.018887292137138165,
"grad_norm": 1.1609467267990112,
"learning_rate": 5.706947368421053e-05,
"loss": 0.2892,
"step": 92
},
{
"epoch": 0.01909258879080271,
"grad_norm": 1.0913397073745728,
"learning_rate": 5.6541052631578945e-05,
"loss": 0.2621,
"step": 93
},
{
"epoch": 0.019297885444467255,
"grad_norm": 1.2688875198364258,
"learning_rate": 5.601263157894736e-05,
"loss": 0.2619,
"step": 94
},
{
"epoch": 0.019503182098131802,
"grad_norm": 1.8537800312042236,
"learning_rate": 5.5484210526315794e-05,
"loss": 0.3301,
"step": 95
},
{
"epoch": 0.019708478751796345,
"grad_norm": 1.5115448236465454,
"learning_rate": 5.495578947368421e-05,
"loss": 0.3174,
"step": 96
},
{
"epoch": 0.019913775405460892,
"grad_norm": 1.3447884321212769,
"learning_rate": 5.442736842105264e-05,
"loss": 0.2984,
"step": 97
},
{
"epoch": 0.020119072059125435,
"grad_norm": 1.405712604522705,
"learning_rate": 5.3898947368421055e-05,
"loss": 0.3403,
"step": 98
},
{
"epoch": 0.020324368712789982,
"grad_norm": 1.7032535076141357,
"learning_rate": 5.337052631578947e-05,
"loss": 0.352,
"step": 99
},
{
"epoch": 0.020529665366454525,
"grad_norm": 1.6720629930496216,
"learning_rate": 5.284210526315789e-05,
"loss": 0.2699,
"step": 100
},
{
"epoch": 0.020529665366454525,
"eval_loss": 0.3957465887069702,
"eval_runtime": 283.8573,
"eval_samples_per_second": 7.225,
"eval_steps_per_second": 1.807,
"step": 100
},
{
"epoch": 0.020734962020119072,
"grad_norm": 1.3754338026046753,
"learning_rate": 5.231368421052631e-05,
"loss": 0.5216,
"step": 101
},
{
"epoch": 0.02094025867378362,
"grad_norm": 0.9947471618652344,
"learning_rate": 5.178526315789474e-05,
"loss": 0.3717,
"step": 102
},
{
"epoch": 0.021145555327448162,
"grad_norm": 1.0772303342819214,
"learning_rate": 5.1256842105263165e-05,
"loss": 0.3756,
"step": 103
},
{
"epoch": 0.02135085198111271,
"grad_norm": 1.0028579235076904,
"learning_rate": 5.072842105263158e-05,
"loss": 0.3998,
"step": 104
},
{
"epoch": 0.021556148634777252,
"grad_norm": 0.9716396927833557,
"learning_rate": 5.02e-05,
"loss": 0.4231,
"step": 105
},
{
"epoch": 0.0217614452884418,
"grad_norm": 0.6188949942588806,
"learning_rate": 4.967157894736842e-05,
"loss": 0.3012,
"step": 106
},
{
"epoch": 0.021966741942106342,
"grad_norm": 0.6589746475219727,
"learning_rate": 4.914315789473684e-05,
"loss": 0.2888,
"step": 107
},
{
"epoch": 0.02217203859577089,
"grad_norm": 0.5264716148376465,
"learning_rate": 4.861473684210526e-05,
"loss": 0.2964,
"step": 108
},
{
"epoch": 0.022377335249435436,
"grad_norm": 0.5431289076805115,
"learning_rate": 4.8086315789473686e-05,
"loss": 0.2952,
"step": 109
},
{
"epoch": 0.02258263190309998,
"grad_norm": 0.7499707937240601,
"learning_rate": 4.7557894736842104e-05,
"loss": 0.311,
"step": 110
},
{
"epoch": 0.022787928556764526,
"grad_norm": 0.5410442352294922,
"learning_rate": 4.702947368421053e-05,
"loss": 0.2698,
"step": 111
},
{
"epoch": 0.02299322521042907,
"grad_norm": 0.49856746196746826,
"learning_rate": 4.6501052631578946e-05,
"loss": 0.2441,
"step": 112
},
{
"epoch": 0.023198521864093616,
"grad_norm": 0.5425069332122803,
"learning_rate": 4.5972631578947364e-05,
"loss": 0.252,
"step": 113
},
{
"epoch": 0.02340381851775816,
"grad_norm": 0.7178559899330139,
"learning_rate": 4.544421052631579e-05,
"loss": 0.3255,
"step": 114
},
{
"epoch": 0.023609115171422706,
"grad_norm": 0.5658791065216064,
"learning_rate": 4.4915789473684213e-05,
"loss": 0.2957,
"step": 115
},
{
"epoch": 0.023814411825087253,
"grad_norm": 0.5597440600395203,
"learning_rate": 4.438736842105263e-05,
"loss": 0.2893,
"step": 116
},
{
"epoch": 0.024019708478751796,
"grad_norm": 0.6386386156082153,
"learning_rate": 4.3858947368421056e-05,
"loss": 0.3598,
"step": 117
},
{
"epoch": 0.024225005132416343,
"grad_norm": 0.6502652764320374,
"learning_rate": 4.3330526315789474e-05,
"loss": 0.2804,
"step": 118
},
{
"epoch": 0.024430301786080886,
"grad_norm": 0.6374607086181641,
"learning_rate": 4.280210526315789e-05,
"loss": 0.2767,
"step": 119
},
{
"epoch": 0.024635598439745433,
"grad_norm": 0.6071303486824036,
"learning_rate": 4.2273684210526317e-05,
"loss": 0.2579,
"step": 120
},
{
"epoch": 0.024840895093409976,
"grad_norm": 0.7243674397468567,
"learning_rate": 4.174526315789474e-05,
"loss": 0.358,
"step": 121
},
{
"epoch": 0.025046191747074523,
"grad_norm": 0.658960223197937,
"learning_rate": 4.121684210526316e-05,
"loss": 0.2744,
"step": 122
},
{
"epoch": 0.02525148840073907,
"grad_norm": 0.6552606225013733,
"learning_rate": 4.068842105263158e-05,
"loss": 0.2575,
"step": 123
},
{
"epoch": 0.025456785054403613,
"grad_norm": 0.8720560669898987,
"learning_rate": 4.016e-05,
"loss": 0.3128,
"step": 124
},
{
"epoch": 0.02566208170806816,
"grad_norm": 0.7429736256599426,
"learning_rate": 3.963157894736842e-05,
"loss": 0.282,
"step": 125
},
{
"epoch": 0.025867378361732703,
"grad_norm": 0.6923242807388306,
"learning_rate": 3.9103157894736844e-05,
"loss": 0.1901,
"step": 126
},
{
"epoch": 0.02607267501539725,
"grad_norm": 0.7189604640007019,
"learning_rate": 3.857473684210526e-05,
"loss": 0.2721,
"step": 127
},
{
"epoch": 0.026277971669061793,
"grad_norm": 0.9543197751045227,
"learning_rate": 3.804631578947369e-05,
"loss": 0.2603,
"step": 128
},
{
"epoch": 0.02648326832272634,
"grad_norm": 0.764552891254425,
"learning_rate": 3.7517894736842105e-05,
"loss": 0.2741,
"step": 129
},
{
"epoch": 0.026688564976390886,
"grad_norm": 0.7686837911605835,
"learning_rate": 3.698947368421052e-05,
"loss": 0.2984,
"step": 130
},
{
"epoch": 0.02689386163005543,
"grad_norm": 0.7207261323928833,
"learning_rate": 3.646105263157895e-05,
"loss": 0.2831,
"step": 131
},
{
"epoch": 0.027099158283719976,
"grad_norm": 0.714175820350647,
"learning_rate": 3.593263157894737e-05,
"loss": 0.1969,
"step": 132
},
{
"epoch": 0.02730445493738452,
"grad_norm": 0.8730839490890503,
"learning_rate": 3.540421052631579e-05,
"loss": 0.2755,
"step": 133
},
{
"epoch": 0.027509751591049066,
"grad_norm": 0.9100469946861267,
"learning_rate": 3.4875789473684215e-05,
"loss": 0.2463,
"step": 134
},
{
"epoch": 0.02771504824471361,
"grad_norm": 0.8955867886543274,
"learning_rate": 3.434736842105263e-05,
"loss": 0.2368,
"step": 135
},
{
"epoch": 0.027920344898378156,
"grad_norm": 0.8493711352348328,
"learning_rate": 3.381894736842105e-05,
"loss": 0.2701,
"step": 136
},
{
"epoch": 0.028125641552042703,
"grad_norm": 0.8181521892547607,
"learning_rate": 3.329052631578947e-05,
"loss": 0.2705,
"step": 137
},
{
"epoch": 0.028330938205707246,
"grad_norm": 0.869841456413269,
"learning_rate": 3.27621052631579e-05,
"loss": 0.2457,
"step": 138
},
{
"epoch": 0.028536234859371793,
"grad_norm": 1.0162545442581177,
"learning_rate": 3.223368421052632e-05,
"loss": 0.3208,
"step": 139
},
{
"epoch": 0.028741531513036336,
"grad_norm": 1.2462005615234375,
"learning_rate": 3.1705263157894736e-05,
"loss": 0.3156,
"step": 140
},
{
"epoch": 0.028946828166700883,
"grad_norm": 1.145050287246704,
"learning_rate": 3.117684210526316e-05,
"loss": 0.2674,
"step": 141
},
{
"epoch": 0.029152124820365426,
"grad_norm": 1.3327499628067017,
"learning_rate": 3.064842105263158e-05,
"loss": 0.3207,
"step": 142
},
{
"epoch": 0.029357421474029973,
"grad_norm": 1.1416566371917725,
"learning_rate": 3.012e-05,
"loss": 0.298,
"step": 143
},
{
"epoch": 0.02956271812769452,
"grad_norm": 1.1980369091033936,
"learning_rate": 2.9591578947368418e-05,
"loss": 0.2707,
"step": 144
},
{
"epoch": 0.029768014781359063,
"grad_norm": 1.1596184968948364,
"learning_rate": 2.9063157894736842e-05,
"loss": 0.2976,
"step": 145
},
{
"epoch": 0.02997331143502361,
"grad_norm": 1.197420597076416,
"learning_rate": 2.8534736842105264e-05,
"loss": 0.2229,
"step": 146
},
{
"epoch": 0.030178608088688153,
"grad_norm": 1.0424596071243286,
"learning_rate": 2.800631578947368e-05,
"loss": 0.2084,
"step": 147
},
{
"epoch": 0.0303839047423527,
"grad_norm": 1.268896222114563,
"learning_rate": 2.7477894736842106e-05,
"loss": 0.3301,
"step": 148
},
{
"epoch": 0.030589201396017243,
"grad_norm": 1.5805307626724243,
"learning_rate": 2.6949473684210527e-05,
"loss": 0.2622,
"step": 149
},
{
"epoch": 0.03079449804968179,
"grad_norm": 2.11923885345459,
"learning_rate": 2.6421052631578945e-05,
"loss": 0.3621,
"step": 150
},
{
"epoch": 0.03079449804968179,
"eval_loss": 0.34410402178764343,
"eval_runtime": 284.7503,
"eval_samples_per_second": 7.203,
"eval_steps_per_second": 1.802,
"step": 150
},
{
"epoch": 0.030999794703346337,
"grad_norm": 1.0855504274368286,
"learning_rate": 2.589263157894737e-05,
"loss": 0.3438,
"step": 151
},
{
"epoch": 0.03120509135701088,
"grad_norm": 1.0626095533370972,
"learning_rate": 2.536421052631579e-05,
"loss": 0.3517,
"step": 152
},
{
"epoch": 0.03141038801067542,
"grad_norm": 1.0813599824905396,
"learning_rate": 2.483578947368421e-05,
"loss": 0.4109,
"step": 153
},
{
"epoch": 0.03161568466433997,
"grad_norm": 1.1301844120025635,
"learning_rate": 2.430736842105263e-05,
"loss": 0.366,
"step": 154
},
{
"epoch": 0.03182098131800452,
"grad_norm": 0.83632493019104,
"learning_rate": 2.3778947368421052e-05,
"loss": 0.3065,
"step": 155
},
{
"epoch": 0.032026277971669063,
"grad_norm": 0.8896894454956055,
"learning_rate": 2.3250526315789473e-05,
"loss": 0.2917,
"step": 156
},
{
"epoch": 0.03223157462533361,
"grad_norm": 0.9853330254554749,
"learning_rate": 2.2722105263157894e-05,
"loss": 0.327,
"step": 157
},
{
"epoch": 0.03243687127899815,
"grad_norm": 0.523433268070221,
"learning_rate": 2.2193684210526316e-05,
"loss": 0.3005,
"step": 158
},
{
"epoch": 0.0326421679326627,
"grad_norm": 0.507231593132019,
"learning_rate": 2.1665263157894737e-05,
"loss": 0.2515,
"step": 159
},
{
"epoch": 0.032847464586327244,
"grad_norm": 0.4762112498283386,
"learning_rate": 2.1136842105263158e-05,
"loss": 0.2701,
"step": 160
},
{
"epoch": 0.03305276123999179,
"grad_norm": 0.700114905834198,
"learning_rate": 2.060842105263158e-05,
"loss": 0.2307,
"step": 161
},
{
"epoch": 0.03325805789365633,
"grad_norm": 0.6575700044631958,
"learning_rate": 2.008e-05,
"loss": 0.2601,
"step": 162
},
{
"epoch": 0.03346335454732088,
"grad_norm": 0.5447333455085754,
"learning_rate": 1.9551578947368422e-05,
"loss": 0.2787,
"step": 163
},
{
"epoch": 0.033668651200985424,
"grad_norm": 0.5627842545509338,
"learning_rate": 1.9023157894736843e-05,
"loss": 0.2804,
"step": 164
},
{
"epoch": 0.03387394785464997,
"grad_norm": 0.6934998035430908,
"learning_rate": 1.849473684210526e-05,
"loss": 0.3029,
"step": 165
},
{
"epoch": 0.03407924450831452,
"grad_norm": 0.608713686466217,
"learning_rate": 1.7966315789473686e-05,
"loss": 0.3032,
"step": 166
},
{
"epoch": 0.03428454116197906,
"grad_norm": 0.7269811034202576,
"learning_rate": 1.7437894736842107e-05,
"loss": 0.3062,
"step": 167
},
{
"epoch": 0.034489837815643604,
"grad_norm": 0.7339184880256653,
"learning_rate": 1.6909473684210525e-05,
"loss": 0.3572,
"step": 168
},
{
"epoch": 0.03469513446930815,
"grad_norm": 0.6014161109924316,
"learning_rate": 1.638105263157895e-05,
"loss": 0.2341,
"step": 169
},
{
"epoch": 0.0349004311229727,
"grad_norm": 0.5328456163406372,
"learning_rate": 1.5852631578947368e-05,
"loss": 0.1971,
"step": 170
},
{
"epoch": 0.035105727776637244,
"grad_norm": 0.600787341594696,
"learning_rate": 1.532421052631579e-05,
"loss": 0.2759,
"step": 171
},
{
"epoch": 0.035311024430301784,
"grad_norm": 0.6568222045898438,
"learning_rate": 1.4795789473684209e-05,
"loss": 0.2669,
"step": 172
},
{
"epoch": 0.03551632108396633,
"grad_norm": 0.8094263076782227,
"learning_rate": 1.4267368421052632e-05,
"loss": 0.3178,
"step": 173
},
{
"epoch": 0.03572161773763088,
"grad_norm": 0.7459115982055664,
"learning_rate": 1.3738947368421053e-05,
"loss": 0.3089,
"step": 174
},
{
"epoch": 0.035926914391295424,
"grad_norm": 0.5925175547599792,
"learning_rate": 1.3210526315789473e-05,
"loss": 0.2371,
"step": 175
},
{
"epoch": 0.036132211044959964,
"grad_norm": 0.7327297329902649,
"learning_rate": 1.2682105263157896e-05,
"loss": 0.2102,
"step": 176
},
{
"epoch": 0.03633750769862451,
"grad_norm": 0.829053521156311,
"learning_rate": 1.2153684210526315e-05,
"loss": 0.3248,
"step": 177
},
{
"epoch": 0.03654280435228906,
"grad_norm": 0.9253886938095093,
"learning_rate": 1.1625263157894737e-05,
"loss": 0.36,
"step": 178
},
{
"epoch": 0.036748101005953604,
"grad_norm": 0.8999148011207581,
"learning_rate": 1.1096842105263158e-05,
"loss": 0.3776,
"step": 179
},
{
"epoch": 0.03695339765961815,
"grad_norm": 0.9121159911155701,
"learning_rate": 1.0568421052631579e-05,
"loss": 0.3397,
"step": 180
},
{
"epoch": 0.03715869431328269,
"grad_norm": 0.6963509917259216,
"learning_rate": 1.004e-05,
"loss": 0.2286,
"step": 181
},
{
"epoch": 0.03736399096694724,
"grad_norm": 0.8616005182266235,
"learning_rate": 9.511578947368422e-06,
"loss": 0.3376,
"step": 182
},
{
"epoch": 0.037569287620611784,
"grad_norm": 0.9709863066673279,
"learning_rate": 8.983157894736843e-06,
"loss": 0.3455,
"step": 183
},
{
"epoch": 0.03777458427427633,
"grad_norm": 0.9001310467720032,
"learning_rate": 8.454736842105263e-06,
"loss": 0.247,
"step": 184
},
{
"epoch": 0.03797988092794088,
"grad_norm": 0.8515599966049194,
"learning_rate": 7.926315789473684e-06,
"loss": 0.2948,
"step": 185
},
{
"epoch": 0.03818517758160542,
"grad_norm": 0.9445661306381226,
"learning_rate": 7.397894736842104e-06,
"loss": 0.3226,
"step": 186
},
{
"epoch": 0.038390474235269964,
"grad_norm": 0.9558519721031189,
"learning_rate": 6.8694736842105265e-06,
"loss": 0.2099,
"step": 187
},
{
"epoch": 0.03859577088893451,
"grad_norm": 1.244987964630127,
"learning_rate": 6.341052631578948e-06,
"loss": 0.4038,
"step": 188
},
{
"epoch": 0.03880106754259906,
"grad_norm": 1.1432714462280273,
"learning_rate": 5.812631578947368e-06,
"loss": 0.3254,
"step": 189
},
{
"epoch": 0.039006364196263604,
"grad_norm": 1.165374755859375,
"learning_rate": 5.2842105263157896e-06,
"loss": 0.3425,
"step": 190
},
{
"epoch": 0.039211660849928144,
"grad_norm": 0.8977206945419312,
"learning_rate": 4.755789473684211e-06,
"loss": 0.2447,
"step": 191
},
{
"epoch": 0.03941695750359269,
"grad_norm": 1.1840242147445679,
"learning_rate": 4.227368421052631e-06,
"loss": 0.2973,
"step": 192
},
{
"epoch": 0.03962225415725724,
"grad_norm": 0.9542776942253113,
"learning_rate": 3.698947368421052e-06,
"loss": 0.2706,
"step": 193
},
{
"epoch": 0.039827550810921784,
"grad_norm": 1.2954764366149902,
"learning_rate": 3.170526315789474e-06,
"loss": 0.39,
"step": 194
},
{
"epoch": 0.040032847464586324,
"grad_norm": 1.069343090057373,
"learning_rate": 2.6421052631578948e-06,
"loss": 0.3317,
"step": 195
},
{
"epoch": 0.04023814411825087,
"grad_norm": 1.210374116897583,
"learning_rate": 2.1136842105263157e-06,
"loss": 0.2649,
"step": 196
},
{
"epoch": 0.04044344077191542,
"grad_norm": 1.4895799160003662,
"learning_rate": 1.585263157894737e-06,
"loss": 0.3533,
"step": 197
},
{
"epoch": 0.040648737425579964,
"grad_norm": 1.157914400100708,
"learning_rate": 1.0568421052631578e-06,
"loss": 0.2274,
"step": 198
},
{
"epoch": 0.04085403407924451,
"grad_norm": 1.3822752237319946,
"learning_rate": 5.284210526315789e-07,
"loss": 0.2567,
"step": 199
},
{
"epoch": 0.04105933073290905,
"grad_norm": 1.841983675956726,
"learning_rate": 0.0,
"loss": 0.3759,
"step": 200
},
{
"epoch": 0.04105933073290905,
"eval_loss": 0.28237149119377136,
"eval_runtime": 284.5717,
"eval_samples_per_second": 7.207,
"eval_steps_per_second": 1.803,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.700880338341069e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}