error577's picture
Training in progress, step 300, checkpoint
a427614 verified
raw
history blame
54.7 kB
{
"best_metric": 0.6729200482368469,
"best_model_checkpoint": "miner_id_24/checkpoint-300",
"epoch": 0.24737167594310452,
"eval_steps": 50,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008245722531436817,
"grad_norm": 0.4181750416755676,
"learning_rate": 2.9999999999999997e-05,
"loss": 0.7232,
"step": 1
},
{
"epoch": 0.0008245722531436817,
"eval_loss": 0.8246814608573914,
"eval_runtime": 42.8717,
"eval_samples_per_second": 9.237,
"eval_steps_per_second": 9.237,
"step": 1
},
{
"epoch": 0.0016491445062873633,
"grad_norm": 0.40615200996398926,
"learning_rate": 5.9999999999999995e-05,
"loss": 0.8142,
"step": 2
},
{
"epoch": 0.0024737167594310453,
"grad_norm": 0.37523695826530457,
"learning_rate": 8.999999999999999e-05,
"loss": 0.7982,
"step": 3
},
{
"epoch": 0.0032982890125747267,
"grad_norm": 0.36036789417266846,
"learning_rate": 0.00011999999999999999,
"loss": 0.836,
"step": 4
},
{
"epoch": 0.0041228612657184084,
"grad_norm": 0.394962877035141,
"learning_rate": 0.00015,
"loss": 0.8198,
"step": 5
},
{
"epoch": 0.004947433518862091,
"grad_norm": 0.416413277387619,
"learning_rate": 0.00017999999999999998,
"loss": 0.7857,
"step": 6
},
{
"epoch": 0.005772005772005772,
"grad_norm": 0.5225532650947571,
"learning_rate": 0.00020999999999999998,
"loss": 0.7836,
"step": 7
},
{
"epoch": 0.006596578025149453,
"grad_norm": 0.39612719416618347,
"learning_rate": 0.00023999999999999998,
"loss": 0.7467,
"step": 8
},
{
"epoch": 0.0074211502782931356,
"grad_norm": 0.3856079578399658,
"learning_rate": 0.00027,
"loss": 0.7699,
"step": 9
},
{
"epoch": 0.008245722531436817,
"grad_norm": 0.3804507851600647,
"learning_rate": 0.0003,
"loss": 0.7913,
"step": 10
},
{
"epoch": 0.009070294784580499,
"grad_norm": 0.3713577389717102,
"learning_rate": 0.0002999911984174669,
"loss": 0.7259,
"step": 11
},
{
"epoch": 0.009894867037724181,
"grad_norm": 0.3443286418914795,
"learning_rate": 0.0002999647947027726,
"loss": 0.7223,
"step": 12
},
{
"epoch": 0.010719439290867862,
"grad_norm": 0.32728496193885803,
"learning_rate": 0.0002999207919545099,
"loss": 0.7663,
"step": 13
},
{
"epoch": 0.011544011544011544,
"grad_norm": 0.3246591091156006,
"learning_rate": 0.0002998591953365965,
"loss": 0.749,
"step": 14
},
{
"epoch": 0.012368583797155226,
"grad_norm": 0.32717230916023254,
"learning_rate": 0.00029978001207766854,
"loss": 0.7442,
"step": 15
},
{
"epoch": 0.013193156050298907,
"grad_norm": 0.3290279805660248,
"learning_rate": 0.00029968325147023263,
"loss": 0.7572,
"step": 16
},
{
"epoch": 0.014017728303442589,
"grad_norm": 0.35251685976982117,
"learning_rate": 0.000299568924869575,
"loss": 0.7737,
"step": 17
},
{
"epoch": 0.014842300556586271,
"grad_norm": 0.3338168263435364,
"learning_rate": 0.00029943704569242917,
"loss": 0.7588,
"step": 18
},
{
"epoch": 0.01566687280972995,
"grad_norm": 0.32795682549476624,
"learning_rate": 0.0002992876294154013,
"loss": 0.7875,
"step": 19
},
{
"epoch": 0.016491445062873634,
"grad_norm": 0.3186565339565277,
"learning_rate": 0.00029912069357315393,
"loss": 0.7336,
"step": 20
},
{
"epoch": 0.017316017316017316,
"grad_norm": 0.3066052794456482,
"learning_rate": 0.00029893625775634835,
"loss": 0.7219,
"step": 21
},
{
"epoch": 0.018140589569160998,
"grad_norm": 0.31656959652900696,
"learning_rate": 0.0002987343436093454,
"loss": 0.7583,
"step": 22
},
{
"epoch": 0.01896516182230468,
"grad_norm": 0.3124566674232483,
"learning_rate": 0.00029851497482766547,
"loss": 0.7673,
"step": 23
},
{
"epoch": 0.019789734075448363,
"grad_norm": 0.30470794439315796,
"learning_rate": 0.00029827817715520773,
"loss": 0.7122,
"step": 24
},
{
"epoch": 0.02061430632859204,
"grad_norm": 0.3141001760959625,
"learning_rate": 0.0002980239783812289,
"loss": 0.8052,
"step": 25
},
{
"epoch": 0.021438878581735724,
"grad_norm": 0.32485461235046387,
"learning_rate": 0.0002977524083370822,
"loss": 0.7482,
"step": 26
},
{
"epoch": 0.022263450834879406,
"grad_norm": 0.31203949451446533,
"learning_rate": 0.00029746349889271645,
"loss": 0.6784,
"step": 27
},
{
"epoch": 0.023088023088023088,
"grad_norm": 0.32462912797927856,
"learning_rate": 0.0002971572839529358,
"loss": 0.767,
"step": 28
},
{
"epoch": 0.02391259534116677,
"grad_norm": 0.3174619972705841,
"learning_rate": 0.00029683379945342125,
"loss": 0.7283,
"step": 29
},
{
"epoch": 0.024737167594310452,
"grad_norm": 0.33072593808174133,
"learning_rate": 0.000296493083356513,
"loss": 0.8054,
"step": 30
},
{
"epoch": 0.025561739847454135,
"grad_norm": 0.31485095620155334,
"learning_rate": 0.00029613517564675565,
"loss": 0.7499,
"step": 31
},
{
"epoch": 0.026386312100597813,
"grad_norm": 0.3060151934623718,
"learning_rate": 0.0002957601183262058,
"loss": 0.7342,
"step": 32
},
{
"epoch": 0.027210884353741496,
"grad_norm": 0.31382423639297485,
"learning_rate": 0.000295367955409503,
"loss": 0.7529,
"step": 33
},
{
"epoch": 0.028035456606885178,
"grad_norm": 0.3076959550380707,
"learning_rate": 0.00029495873291870436,
"loss": 0.6956,
"step": 34
},
{
"epoch": 0.02886002886002886,
"grad_norm": 0.31915611028671265,
"learning_rate": 0.0002945324988778834,
"loss": 0.7885,
"step": 35
},
{
"epoch": 0.029684601113172542,
"grad_norm": 0.3209708034992218,
"learning_rate": 0.00029408930330749477,
"loss": 0.7536,
"step": 36
},
{
"epoch": 0.030509173366316224,
"grad_norm": 0.33205971121788025,
"learning_rate": 0.0002936291982185036,
"loss": 0.7633,
"step": 37
},
{
"epoch": 0.0313337456194599,
"grad_norm": 0.3191162049770355,
"learning_rate": 0.00029315223760628217,
"loss": 0.7437,
"step": 38
},
{
"epoch": 0.032158317872603585,
"grad_norm": 0.34463781118392944,
"learning_rate": 0.00029265847744427303,
"loss": 0.7586,
"step": 39
},
{
"epoch": 0.03298289012574727,
"grad_norm": 0.3288670778274536,
"learning_rate": 0.00029214797567742035,
"loss": 0.7812,
"step": 40
},
{
"epoch": 0.03380746237889095,
"grad_norm": 0.3351598381996155,
"learning_rate": 0.00029162079221537,
"loss": 0.785,
"step": 41
},
{
"epoch": 0.03463203463203463,
"grad_norm": 0.3358539938926697,
"learning_rate": 0.0002910769889254386,
"loss": 0.7145,
"step": 42
},
{
"epoch": 0.035456606885178314,
"grad_norm": 0.3250412344932556,
"learning_rate": 0.0002905166296253533,
"loss": 0.7333,
"step": 43
},
{
"epoch": 0.036281179138321996,
"grad_norm": 0.324008584022522,
"learning_rate": 0.0002899397800757626,
"loss": 0.7508,
"step": 44
},
{
"epoch": 0.03710575139146568,
"grad_norm": 0.3390953242778778,
"learning_rate": 0.0002893465079725187,
"loss": 0.6994,
"step": 45
},
{
"epoch": 0.03793032364460936,
"grad_norm": 0.3244946002960205,
"learning_rate": 0.0002887368829387333,
"loss": 0.6945,
"step": 46
},
{
"epoch": 0.03875489589775304,
"grad_norm": 0.34154826402664185,
"learning_rate": 0.0002881109765166071,
"loss": 0.7437,
"step": 47
},
{
"epoch": 0.039579468150896725,
"grad_norm": 0.36960309743881226,
"learning_rate": 0.00028746886215903387,
"loss": 0.7878,
"step": 48
},
{
"epoch": 0.04040404040404041,
"grad_norm": 0.39132601022720337,
"learning_rate": 0.00028681061522098047,
"loss": 0.731,
"step": 49
},
{
"epoch": 0.04122861265718408,
"grad_norm": 0.68941730260849,
"learning_rate": 0.0002861363129506435,
"loss": 0.7756,
"step": 50
},
{
"epoch": 0.04122861265718408,
"eval_loss": 0.7312861084938049,
"eval_runtime": 42.6476,
"eval_samples_per_second": 9.285,
"eval_steps_per_second": 9.285,
"step": 50
},
{
"epoch": 0.042053184910327765,
"grad_norm": 0.3118657171726227,
"learning_rate": 0.0002854460344803842,
"loss": 0.6581,
"step": 51
},
{
"epoch": 0.04287775716347145,
"grad_norm": 0.33690398931503296,
"learning_rate": 0.00028473986081744163,
"loss": 0.7433,
"step": 52
},
{
"epoch": 0.04370232941661513,
"grad_norm": 0.33627963066101074,
"learning_rate": 0.000284017874834426,
"loss": 0.73,
"step": 53
},
{
"epoch": 0.04452690166975881,
"grad_norm": 0.3204888105392456,
"learning_rate": 0.0002832801612595937,
"loss": 0.7398,
"step": 54
},
{
"epoch": 0.045351473922902494,
"grad_norm": 0.3057969808578491,
"learning_rate": 0.0002825268066669034,
"loss": 0.6966,
"step": 55
},
{
"epoch": 0.046176046176046176,
"grad_norm": 0.30388593673706055,
"learning_rate": 0.00028175789946585693,
"loss": 0.672,
"step": 56
},
{
"epoch": 0.04700061842918986,
"grad_norm": 0.31205031275749207,
"learning_rate": 0.0002809735298911234,
"loss": 0.7422,
"step": 57
},
{
"epoch": 0.04782519068233354,
"grad_norm": 0.317719042301178,
"learning_rate": 0.00028017378999195015,
"loss": 0.7359,
"step": 58
},
{
"epoch": 0.04864976293547722,
"grad_norm": 0.3247784674167633,
"learning_rate": 0.0002793587736213603,
"loss": 0.688,
"step": 59
},
{
"epoch": 0.049474335188620905,
"grad_norm": 0.32107478380203247,
"learning_rate": 0.00027852857642513836,
"loss": 0.7123,
"step": 60
},
{
"epoch": 0.05029890744176459,
"grad_norm": 0.3145858645439148,
"learning_rate": 0.00027768329583060635,
"loss": 0.7274,
"step": 61
},
{
"epoch": 0.05112347969490827,
"grad_norm": 0.3061201870441437,
"learning_rate": 0.00027682303103518976,
"loss": 0.7098,
"step": 62
},
{
"epoch": 0.05194805194805195,
"grad_norm": 0.31068190932273865,
"learning_rate": 0.00027594788299477655,
"loss": 0.7176,
"step": 63
},
{
"epoch": 0.05277262420119563,
"grad_norm": 0.32404160499572754,
"learning_rate": 0.0002750579544118695,
"loss": 0.7304,
"step": 64
},
{
"epoch": 0.05359719645433931,
"grad_norm": 0.3103507459163666,
"learning_rate": 0.00027415334972353357,
"loss": 0.7112,
"step": 65
},
{
"epoch": 0.05442176870748299,
"grad_norm": 0.31706494092941284,
"learning_rate": 0.0002732341750891397,
"loss": 0.6998,
"step": 66
},
{
"epoch": 0.05524634096062667,
"grad_norm": 0.30976295471191406,
"learning_rate": 0.00027230053837790666,
"loss": 0.671,
"step": 67
},
{
"epoch": 0.056070913213770356,
"grad_norm": 0.3198986351490021,
"learning_rate": 0.0002713525491562421,
"loss": 0.7438,
"step": 68
},
{
"epoch": 0.05689548546691404,
"grad_norm": 0.3304222822189331,
"learning_rate": 0.0002703903186748843,
"loss": 0.814,
"step": 69
},
{
"epoch": 0.05772005772005772,
"grad_norm": 0.3097413182258606,
"learning_rate": 0.00026941395985584653,
"loss": 0.6695,
"step": 70
},
{
"epoch": 0.0585446299732014,
"grad_norm": 0.31618732213974,
"learning_rate": 0.00026842358727916524,
"loss": 0.739,
"step": 71
},
{
"epoch": 0.059369202226345084,
"grad_norm": 0.3159608542919159,
"learning_rate": 0.0002674193171694533,
"loss": 0.7172,
"step": 72
},
{
"epoch": 0.06019377447948877,
"grad_norm": 0.30797278881073,
"learning_rate": 0.0002664012673822609,
"loss": 0.7187,
"step": 73
},
{
"epoch": 0.06101834673263245,
"grad_norm": 0.3251792788505554,
"learning_rate": 0.0002653695573902443,
"loss": 0.796,
"step": 74
},
{
"epoch": 0.06184291898577613,
"grad_norm": 0.3200572431087494,
"learning_rate": 0.0002643243082691454,
"loss": 0.7268,
"step": 75
},
{
"epoch": 0.0626674912389198,
"grad_norm": 0.29941457509994507,
"learning_rate": 0.0002632656426835831,
"loss": 0.6928,
"step": 76
},
{
"epoch": 0.06349206349206349,
"grad_norm": 0.3021852672100067,
"learning_rate": 0.00026219368487265753,
"loss": 0.7257,
"step": 77
},
{
"epoch": 0.06431663574520717,
"grad_norm": 0.30670538544654846,
"learning_rate": 0.00026110856063537083,
"loss": 0.699,
"step": 78
},
{
"epoch": 0.06514120799835085,
"grad_norm": 0.3165689706802368,
"learning_rate": 0.00026001039731586334,
"loss": 0.7149,
"step": 79
},
{
"epoch": 0.06596578025149454,
"grad_norm": 0.3253694474697113,
"learning_rate": 0.0002588993237884696,
"loss": 0.8077,
"step": 80
},
{
"epoch": 0.06679035250463822,
"grad_norm": 0.31187790632247925,
"learning_rate": 0.00025777547044259435,
"loss": 0.7574,
"step": 81
},
{
"epoch": 0.0676149247577819,
"grad_norm": 0.32256990671157837,
"learning_rate": 0.0002566389691674106,
"loss": 0.7692,
"step": 82
},
{
"epoch": 0.06843949701092558,
"grad_norm": 0.3136816918849945,
"learning_rate": 0.00025548995333638197,
"loss": 0.7452,
"step": 83
},
{
"epoch": 0.06926406926406926,
"grad_norm": 0.3113918602466583,
"learning_rate": 0.00025432855779161076,
"loss": 0.711,
"step": 84
},
{
"epoch": 0.07008864151721295,
"grad_norm": 0.30899491906166077,
"learning_rate": 0.00025315491882801347,
"loss": 0.7414,
"step": 85
},
{
"epoch": 0.07091321377035663,
"grad_norm": 0.31038275361061096,
"learning_rate": 0.00025196917417732615,
"loss": 0.69,
"step": 86
},
{
"epoch": 0.07173778602350031,
"grad_norm": 0.32005515694618225,
"learning_rate": 0.0002507714629919409,
"loss": 0.7283,
"step": 87
},
{
"epoch": 0.07256235827664399,
"grad_norm": 0.31575390696525574,
"learning_rate": 0.0002495619258285757,
"loss": 0.6974,
"step": 88
},
{
"epoch": 0.07338693052978768,
"grad_norm": 0.3133029341697693,
"learning_rate": 0.0002483407046317794,
"loss": 0.7083,
"step": 89
},
{
"epoch": 0.07421150278293136,
"grad_norm": 0.32530999183654785,
"learning_rate": 0.00024710794271727413,
"loss": 0.7165,
"step": 90
},
{
"epoch": 0.07503607503607504,
"grad_norm": 0.31330084800720215,
"learning_rate": 0.0002458637847551364,
"loss": 0.6542,
"step": 91
},
{
"epoch": 0.07586064728921872,
"grad_norm": 0.3188161849975586,
"learning_rate": 0.00024460837675281926,
"loss": 0.6734,
"step": 92
},
{
"epoch": 0.0766852195423624,
"grad_norm": 0.3343876600265503,
"learning_rate": 0.00024334186603801807,
"loss": 0.7359,
"step": 93
},
{
"epoch": 0.07750979179550609,
"grad_norm": 0.33376866579055786,
"learning_rate": 0.00024206440124138062,
"loss": 0.6967,
"step": 94
},
{
"epoch": 0.07833436404864977,
"grad_norm": 0.3637959957122803,
"learning_rate": 0.0002407761322790648,
"loss": 0.7499,
"step": 95
},
{
"epoch": 0.07915893630179345,
"grad_norm": 0.335549533367157,
"learning_rate": 0.00023947721033514512,
"loss": 0.7196,
"step": 96
},
{
"epoch": 0.07998350855493713,
"grad_norm": 0.35548627376556396,
"learning_rate": 0.00023816778784387094,
"loss": 0.737,
"step": 97
},
{
"epoch": 0.08080808080808081,
"grad_norm": 0.3780907094478607,
"learning_rate": 0.0002368480184717773,
"loss": 0.7565,
"step": 98
},
{
"epoch": 0.08163265306122448,
"grad_norm": 0.40498775243759155,
"learning_rate": 0.00023551805709965147,
"loss": 0.6692,
"step": 99
},
{
"epoch": 0.08245722531436817,
"grad_norm": 0.8926903605461121,
"learning_rate": 0.00023417805980435736,
"loss": 0.4799,
"step": 100
},
{
"epoch": 0.08245722531436817,
"eval_loss": 0.7042385935783386,
"eval_runtime": 42.7803,
"eval_samples_per_second": 9.257,
"eval_steps_per_second": 9.257,
"step": 100
},
{
"epoch": 0.08328179756751185,
"grad_norm": 0.29144763946533203,
"learning_rate": 0.00023282818384051866,
"loss": 0.6653,
"step": 101
},
{
"epoch": 0.08410636982065553,
"grad_norm": 0.290532648563385,
"learning_rate": 0.00023146858762206489,
"loss": 0.6903,
"step": 102
},
{
"epoch": 0.08493094207379921,
"grad_norm": 0.304470956325531,
"learning_rate": 0.00023009943070364044,
"loss": 0.6943,
"step": 103
},
{
"epoch": 0.0857555143269429,
"grad_norm": 0.31192028522491455,
"learning_rate": 0.0002287208737618801,
"loss": 0.7152,
"step": 104
},
{
"epoch": 0.08658008658008658,
"grad_norm": 0.30607473850250244,
"learning_rate": 0.00022733307857655325,
"loss": 0.7248,
"step": 105
},
{
"epoch": 0.08740465883323026,
"grad_norm": 0.2998904883861542,
"learning_rate": 0.00022593620801157808,
"loss": 0.7291,
"step": 106
},
{
"epoch": 0.08822923108637394,
"grad_norm": 0.3096725344657898,
"learning_rate": 0.00022453042599590882,
"loss": 0.7241,
"step": 107
},
{
"epoch": 0.08905380333951762,
"grad_norm": 0.3047013282775879,
"learning_rate": 0.00022311589750429787,
"loss": 0.7206,
"step": 108
},
{
"epoch": 0.0898783755926613,
"grad_norm": 0.3195563554763794,
"learning_rate": 0.00022169278853793545,
"loss": 0.7217,
"step": 109
},
{
"epoch": 0.09070294784580499,
"grad_norm": 0.32428956031799316,
"learning_rate": 0.00022026126610496852,
"loss": 0.6962,
"step": 110
},
{
"epoch": 0.09152752009894867,
"grad_norm": 0.31494930386543274,
"learning_rate": 0.0002188214982009016,
"loss": 0.6812,
"step": 111
},
{
"epoch": 0.09235209235209235,
"grad_norm": 0.329039603471756,
"learning_rate": 0.00021737365378888187,
"loss": 0.7871,
"step": 112
},
{
"epoch": 0.09317666460523603,
"grad_norm": 0.3266455829143524,
"learning_rate": 0.00021591790277987043,
"loss": 0.7367,
"step": 113
},
{
"epoch": 0.09400123685837972,
"grad_norm": 0.31767553091049194,
"learning_rate": 0.00021445441601270276,
"loss": 0.7702,
"step": 114
},
{
"epoch": 0.0948258091115234,
"grad_norm": 0.32063761353492737,
"learning_rate": 0.00021298336523403968,
"loss": 0.7514,
"step": 115
},
{
"epoch": 0.09565038136466708,
"grad_norm": 0.297237366437912,
"learning_rate": 0.0002115049230782124,
"loss": 0.6745,
"step": 116
},
{
"epoch": 0.09647495361781076,
"grad_norm": 0.30952492356300354,
"learning_rate": 0.00021001926304696296,
"loss": 0.6953,
"step": 117
},
{
"epoch": 0.09729952587095445,
"grad_norm": 0.3084593415260315,
"learning_rate": 0.00020852655948908316,
"loss": 0.7184,
"step": 118
},
{
"epoch": 0.09812409812409813,
"grad_norm": 0.30032604932785034,
"learning_rate": 0.0002070269875799538,
"loss": 0.6477,
"step": 119
},
{
"epoch": 0.09894867037724181,
"grad_norm": 0.3085014224052429,
"learning_rate": 0.00020552072330098716,
"loss": 0.7137,
"step": 120
},
{
"epoch": 0.09977324263038549,
"grad_norm": 0.3050251007080078,
"learning_rate": 0.0002040079434189748,
"loss": 0.6947,
"step": 121
},
{
"epoch": 0.10059781488352917,
"grad_norm": 0.29092761874198914,
"learning_rate": 0.00020248882546534326,
"loss": 0.6166,
"step": 122
},
{
"epoch": 0.10142238713667286,
"grad_norm": 0.32282838225364685,
"learning_rate": 0.00020096354771531976,
"loss": 0.7807,
"step": 123
},
{
"epoch": 0.10224695938981654,
"grad_norm": 0.3140111565589905,
"learning_rate": 0.00019943228916701104,
"loss": 0.7271,
"step": 124
},
{
"epoch": 0.10307153164296022,
"grad_norm": 0.3199913203716278,
"learning_rate": 0.00019789522952039695,
"loss": 0.6998,
"step": 125
},
{
"epoch": 0.1038961038961039,
"grad_norm": 0.3230836093425751,
"learning_rate": 0.0001963525491562421,
"loss": 0.7354,
"step": 126
},
{
"epoch": 0.10472067614924757,
"grad_norm": 0.32371896505355835,
"learning_rate": 0.00019480442911492702,
"loss": 0.7583,
"step": 127
},
{
"epoch": 0.10554524840239125,
"grad_norm": 0.3246193826198578,
"learning_rate": 0.00019325105107520263,
"loss": 0.7296,
"step": 128
},
{
"epoch": 0.10636982065553494,
"grad_norm": 0.3160342872142792,
"learning_rate": 0.00019169259733286913,
"loss": 0.7091,
"step": 129
},
{
"epoch": 0.10719439290867862,
"grad_norm": 0.3068806231021881,
"learning_rate": 0.00019012925077938314,
"loss": 0.7068,
"step": 130
},
{
"epoch": 0.1080189651618223,
"grad_norm": 0.31874704360961914,
"learning_rate": 0.0001885611948803941,
"loss": 0.7188,
"step": 131
},
{
"epoch": 0.10884353741496598,
"grad_norm": 0.30816522240638733,
"learning_rate": 0.0001869886136542143,
"loss": 0.7145,
"step": 132
},
{
"epoch": 0.10966810966810966,
"grad_norm": 0.3057398498058319,
"learning_rate": 0.00018541169165022298,
"loss": 0.6788,
"step": 133
},
{
"epoch": 0.11049268192125335,
"grad_norm": 0.30414751172065735,
"learning_rate": 0.00018383061392720913,
"loss": 0.6881,
"step": 134
},
{
"epoch": 0.11131725417439703,
"grad_norm": 0.31472885608673096,
"learning_rate": 0.0001822455660316536,
"loss": 0.7089,
"step": 135
},
{
"epoch": 0.11214182642754071,
"grad_norm": 0.309356153011322,
"learning_rate": 0.00018065673397595473,
"loss": 0.6732,
"step": 136
},
{
"epoch": 0.1129663986806844,
"grad_norm": 0.30579566955566406,
"learning_rate": 0.00017906430421659876,
"loss": 0.6453,
"step": 137
},
{
"epoch": 0.11379097093382808,
"grad_norm": 0.3306417465209961,
"learning_rate": 0.00017746846363227842,
"loss": 0.7327,
"step": 138
},
{
"epoch": 0.11461554318697176,
"grad_norm": 0.32207512855529785,
"learning_rate": 0.00017586939950196186,
"loss": 0.677,
"step": 139
},
{
"epoch": 0.11544011544011544,
"grad_norm": 0.3170906603336334,
"learning_rate": 0.00017426729948291474,
"loss": 0.6764,
"step": 140
},
{
"epoch": 0.11626468769325912,
"grad_norm": 0.32157909870147705,
"learning_rate": 0.00017266235158867752,
"loss": 0.7016,
"step": 141
},
{
"epoch": 0.1170892599464028,
"grad_norm": 0.3478415012359619,
"learning_rate": 0.00017105474416700164,
"loss": 0.6956,
"step": 142
},
{
"epoch": 0.11791383219954649,
"grad_norm": 0.32598310708999634,
"learning_rate": 0.0001694446658777458,
"loss": 0.6716,
"step": 143
},
{
"epoch": 0.11873840445269017,
"grad_norm": 0.3289110064506531,
"learning_rate": 0.00016783230567073596,
"loss": 0.7107,
"step": 144
},
{
"epoch": 0.11956297670583385,
"grad_norm": 0.34824198484420776,
"learning_rate": 0.00016621785276359127,
"loss": 0.6887,
"step": 145
},
{
"epoch": 0.12038754895897753,
"grad_norm": 0.35514163970947266,
"learning_rate": 0.0001646014966195185,
"loss": 0.7147,
"step": 146
},
{
"epoch": 0.12121212121212122,
"grad_norm": 0.3446742296218872,
"learning_rate": 0.00016298342692507763,
"loss": 0.6324,
"step": 147
},
{
"epoch": 0.1220366934652649,
"grad_norm": 0.3650054633617401,
"learning_rate": 0.00016136383356792156,
"loss": 0.6782,
"step": 148
},
{
"epoch": 0.12286126571840858,
"grad_norm": 0.3771435022354126,
"learning_rate": 0.0001597429066145116,
"loss": 0.6823,
"step": 149
},
{
"epoch": 0.12368583797155226,
"grad_norm": 0.6211467981338501,
"learning_rate": 0.0001581208362878126,
"loss": 0.6651,
"step": 150
},
{
"epoch": 0.12368583797155226,
"eval_loss": 0.6936895847320557,
"eval_runtime": 42.757,
"eval_samples_per_second": 9.262,
"eval_steps_per_second": 9.262,
"step": 150
},
{
"epoch": 0.12451041022469594,
"grad_norm": 0.3050060570240021,
"learning_rate": 0.00015649781294496933,
"loss": 0.5971,
"step": 151
},
{
"epoch": 0.1253349824778396,
"grad_norm": 0.31458163261413574,
"learning_rate": 0.00015487402705496707,
"loss": 0.7042,
"step": 152
},
{
"epoch": 0.1261595547309833,
"grad_norm": 0.30443075299263,
"learning_rate": 0.0001532496691762796,
"loss": 0.6492,
"step": 153
},
{
"epoch": 0.12698412698412698,
"grad_norm": 0.31165623664855957,
"learning_rate": 0.00015162492993450597,
"loss": 0.7077,
"step": 154
},
{
"epoch": 0.12780869923727067,
"grad_norm": 0.3184472322463989,
"learning_rate": 0.00015,
"loss": 0.7271,
"step": 155
},
{
"epoch": 0.12863327149041434,
"grad_norm": 0.31272202730178833,
"learning_rate": 0.00014837507006549403,
"loss": 0.7611,
"step": 156
},
{
"epoch": 0.12945784374355804,
"grad_norm": 0.3030358850955963,
"learning_rate": 0.00014675033082372038,
"loss": 0.7276,
"step": 157
},
{
"epoch": 0.1302824159967017,
"grad_norm": 0.31098514795303345,
"learning_rate": 0.00014512597294503293,
"loss": 0.6906,
"step": 158
},
{
"epoch": 0.1311069882498454,
"grad_norm": 0.3137834370136261,
"learning_rate": 0.00014350218705503067,
"loss": 0.7106,
"step": 159
},
{
"epoch": 0.13193156050298907,
"grad_norm": 0.3150627315044403,
"learning_rate": 0.00014187916371218736,
"loss": 0.7156,
"step": 160
},
{
"epoch": 0.13275613275613277,
"grad_norm": 0.31015968322753906,
"learning_rate": 0.00014025709338548836,
"loss": 0.6631,
"step": 161
},
{
"epoch": 0.13358070500927643,
"grad_norm": 0.29935508966445923,
"learning_rate": 0.00013863616643207844,
"loss": 0.6572,
"step": 162
},
{
"epoch": 0.13440527726242013,
"grad_norm": 0.30627861618995667,
"learning_rate": 0.00013701657307492235,
"loss": 0.6925,
"step": 163
},
{
"epoch": 0.1352298495155638,
"grad_norm": 0.32339316606521606,
"learning_rate": 0.00013539850338048154,
"loss": 0.7109,
"step": 164
},
{
"epoch": 0.1360544217687075,
"grad_norm": 0.31456539034843445,
"learning_rate": 0.00013378214723640876,
"loss": 0.697,
"step": 165
},
{
"epoch": 0.13687899402185116,
"grad_norm": 0.3141114413738251,
"learning_rate": 0.00013216769432926404,
"loss": 0.6922,
"step": 166
},
{
"epoch": 0.13770356627499486,
"grad_norm": 0.32136693596839905,
"learning_rate": 0.00013055533412225422,
"loss": 0.7408,
"step": 167
},
{
"epoch": 0.13852813852813853,
"grad_norm": 0.31578657031059265,
"learning_rate": 0.00012894525583299833,
"loss": 0.7104,
"step": 168
},
{
"epoch": 0.1393527107812822,
"grad_norm": 0.3017723262310028,
"learning_rate": 0.0001273376484113225,
"loss": 0.672,
"step": 169
},
{
"epoch": 0.1401772830344259,
"grad_norm": 0.3208543658256531,
"learning_rate": 0.0001257327005170853,
"loss": 0.7146,
"step": 170
},
{
"epoch": 0.14100185528756956,
"grad_norm": 0.31153324246406555,
"learning_rate": 0.00012413060049803814,
"loss": 0.6276,
"step": 171
},
{
"epoch": 0.14182642754071326,
"grad_norm": 0.3135003447532654,
"learning_rate": 0.00012253153636772156,
"loss": 0.6705,
"step": 172
},
{
"epoch": 0.14265099979385693,
"grad_norm": 0.30405810475349426,
"learning_rate": 0.00012093569578340124,
"loss": 0.6602,
"step": 173
},
{
"epoch": 0.14347557204700062,
"grad_norm": 0.32480090856552124,
"learning_rate": 0.00011934326602404528,
"loss": 0.6755,
"step": 174
},
{
"epoch": 0.1443001443001443,
"grad_norm": 0.3095938563346863,
"learning_rate": 0.00011775443396834638,
"loss": 0.7024,
"step": 175
},
{
"epoch": 0.14512471655328799,
"grad_norm": 0.32258421182632446,
"learning_rate": 0.00011616938607279086,
"loss": 0.7044,
"step": 176
},
{
"epoch": 0.14594928880643165,
"grad_norm": 0.3185218870639801,
"learning_rate": 0.00011458830834977698,
"loss": 0.7224,
"step": 177
},
{
"epoch": 0.14677386105957535,
"grad_norm": 0.31658223271369934,
"learning_rate": 0.0001130113863457857,
"loss": 0.7346,
"step": 178
},
{
"epoch": 0.14759843331271902,
"grad_norm": 0.31286993622779846,
"learning_rate": 0.00011143880511960584,
"loss": 0.7063,
"step": 179
},
{
"epoch": 0.14842300556586271,
"grad_norm": 0.30426645278930664,
"learning_rate": 0.00010987074922061689,
"loss": 0.6781,
"step": 180
},
{
"epoch": 0.14924757781900638,
"grad_norm": 0.30204102396965027,
"learning_rate": 0.00010830740266713087,
"loss": 0.6914,
"step": 181
},
{
"epoch": 0.15007215007215008,
"grad_norm": 0.3117939233779907,
"learning_rate": 0.00010674894892479738,
"loss": 0.714,
"step": 182
},
{
"epoch": 0.15089672232529375,
"grad_norm": 0.30473530292510986,
"learning_rate": 0.00010519557088507298,
"loss": 0.6351,
"step": 183
},
{
"epoch": 0.15172129457843744,
"grad_norm": 0.30228060483932495,
"learning_rate": 0.0001036474508437579,
"loss": 0.6095,
"step": 184
},
{
"epoch": 0.1525458668315811,
"grad_norm": 0.32118985056877136,
"learning_rate": 0.00010210477047960302,
"loss": 0.7311,
"step": 185
},
{
"epoch": 0.1533704390847248,
"grad_norm": 0.31862205266952515,
"learning_rate": 0.00010056771083298893,
"loss": 0.6728,
"step": 186
},
{
"epoch": 0.15419501133786848,
"grad_norm": 0.3058205246925354,
"learning_rate": 9.903645228468024e-05,
"loss": 0.6495,
"step": 187
},
{
"epoch": 0.15501958359101217,
"grad_norm": 0.3158295750617981,
"learning_rate": 9.751117453465673e-05,
"loss": 0.688,
"step": 188
},
{
"epoch": 0.15584415584415584,
"grad_norm": 0.32295459508895874,
"learning_rate": 9.59920565810252e-05,
"loss": 0.7007,
"step": 189
},
{
"epoch": 0.15666872809729954,
"grad_norm": 0.3153195083141327,
"learning_rate": 9.447927669901282e-05,
"loss": 0.6781,
"step": 190
},
{
"epoch": 0.1574933003504432,
"grad_norm": 0.31914830207824707,
"learning_rate": 9.297301242004618e-05,
"loss": 0.6459,
"step": 191
},
{
"epoch": 0.1583178726035869,
"grad_norm": 0.32512032985687256,
"learning_rate": 9.14734405109168e-05,
"loss": 0.708,
"step": 192
},
{
"epoch": 0.15914244485673057,
"grad_norm": 0.3304908573627472,
"learning_rate": 8.998073695303701e-05,
"loss": 0.6703,
"step": 193
},
{
"epoch": 0.15996701710987427,
"grad_norm": 0.33657538890838623,
"learning_rate": 8.849507692178758e-05,
"loss": 0.6989,
"step": 194
},
{
"epoch": 0.16079158936301793,
"grad_norm": 0.33851417899131775,
"learning_rate": 8.70166347659603e-05,
"loss": 0.709,
"step": 195
},
{
"epoch": 0.16161616161616163,
"grad_norm": 0.3366221785545349,
"learning_rate": 8.554558398729725e-05,
"loss": 0.6213,
"step": 196
},
{
"epoch": 0.1624407338693053,
"grad_norm": 0.3471497595310211,
"learning_rate": 8.408209722012956e-05,
"loss": 0.7261,
"step": 197
},
{
"epoch": 0.16326530612244897,
"grad_norm": 0.35795000195503235,
"learning_rate": 8.262634621111818e-05,
"loss": 0.639,
"step": 198
},
{
"epoch": 0.16408987837559266,
"grad_norm": 0.3911006450653076,
"learning_rate": 8.117850179909842e-05,
"loss": 0.6943,
"step": 199
},
{
"epoch": 0.16491445062873633,
"grad_norm": 0.5652897357940674,
"learning_rate": 7.973873389503149e-05,
"loss": 0.6569,
"step": 200
},
{
"epoch": 0.16491445062873633,
"eval_loss": 0.6824482679367065,
"eval_runtime": 42.7512,
"eval_samples_per_second": 9.263,
"eval_steps_per_second": 9.263,
"step": 200
},
{
"epoch": 0.16573902288188003,
"grad_norm": 0.29706528782844543,
"learning_rate": 7.830721146206451e-05,
"loss": 0.5975,
"step": 201
},
{
"epoch": 0.1665635951350237,
"grad_norm": 0.3147106170654297,
"learning_rate": 7.688410249570214e-05,
"loss": 0.7244,
"step": 202
},
{
"epoch": 0.1673881673881674,
"grad_norm": 0.3083764314651489,
"learning_rate": 7.54695740040912e-05,
"loss": 0.6659,
"step": 203
},
{
"epoch": 0.16821273964131106,
"grad_norm": 0.31014275550842285,
"learning_rate": 7.406379198842189e-05,
"loss": 0.6614,
"step": 204
},
{
"epoch": 0.16903731189445476,
"grad_norm": 0.3195944130420685,
"learning_rate": 7.266692142344672e-05,
"loss": 0.7009,
"step": 205
},
{
"epoch": 0.16986188414759842,
"grad_norm": 0.30758917331695557,
"learning_rate": 7.127912623811993e-05,
"loss": 0.6654,
"step": 206
},
{
"epoch": 0.17068645640074212,
"grad_norm": 0.30025753378868103,
"learning_rate": 6.990056929635957e-05,
"loss": 0.6463,
"step": 207
},
{
"epoch": 0.1715110286538858,
"grad_norm": 0.3022189736366272,
"learning_rate": 6.853141237793506e-05,
"loss": 0.6522,
"step": 208
},
{
"epoch": 0.17233560090702948,
"grad_norm": 0.3084527850151062,
"learning_rate": 6.717181615948126e-05,
"loss": 0.7102,
"step": 209
},
{
"epoch": 0.17316017316017315,
"grad_norm": 0.3123379647731781,
"learning_rate": 6.582194019564266e-05,
"loss": 0.6977,
"step": 210
},
{
"epoch": 0.17398474541331685,
"grad_norm": 0.3006177842617035,
"learning_rate": 6.448194290034848e-05,
"loss": 0.6934,
"step": 211
},
{
"epoch": 0.17480931766646052,
"grad_norm": 0.30770057439804077,
"learning_rate": 6.315198152822272e-05,
"loss": 0.6854,
"step": 212
},
{
"epoch": 0.1756338899196042,
"grad_norm": 0.31282415986061096,
"learning_rate": 6.183221215612904e-05,
"loss": 0.7131,
"step": 213
},
{
"epoch": 0.17645846217274788,
"grad_norm": 0.3112223148345947,
"learning_rate": 6.052278966485491e-05,
"loss": 0.7005,
"step": 214
},
{
"epoch": 0.17728303442589158,
"grad_norm": 0.30379047989845276,
"learning_rate": 5.922386772093526e-05,
"loss": 0.6716,
"step": 215
},
{
"epoch": 0.17810760667903525,
"grad_norm": 0.318124920129776,
"learning_rate": 5.793559875861938e-05,
"loss": 0.6676,
"step": 216
},
{
"epoch": 0.17893217893217894,
"grad_norm": 0.32162636518478394,
"learning_rate": 5.6658133961981894e-05,
"loss": 0.716,
"step": 217
},
{
"epoch": 0.1797567511853226,
"grad_norm": 0.2995055615901947,
"learning_rate": 5.5391623247180744e-05,
"loss": 0.6375,
"step": 218
},
{
"epoch": 0.1805813234384663,
"grad_norm": 0.30936968326568604,
"learning_rate": 5.413621524486363e-05,
"loss": 0.6462,
"step": 219
},
{
"epoch": 0.18140589569160998,
"grad_norm": 0.31474071741104126,
"learning_rate": 5.289205728272586e-05,
"loss": 0.7098,
"step": 220
},
{
"epoch": 0.18223046794475367,
"grad_norm": 0.31727197766304016,
"learning_rate": 5.165929536822059e-05,
"loss": 0.7516,
"step": 221
},
{
"epoch": 0.18305504019789734,
"grad_norm": 0.32454827427864075,
"learning_rate": 5.043807417142436e-05,
"loss": 0.7308,
"step": 222
},
{
"epoch": 0.18387961245104104,
"grad_norm": 0.3066427409648895,
"learning_rate": 4.922853700805909e-05,
"loss": 0.6555,
"step": 223
},
{
"epoch": 0.1847041847041847,
"grad_norm": 0.3160225749015808,
"learning_rate": 4.8030825822673814e-05,
"loss": 0.6733,
"step": 224
},
{
"epoch": 0.18552875695732837,
"grad_norm": 0.30562353134155273,
"learning_rate": 4.684508117198648e-05,
"loss": 0.6771,
"step": 225
},
{
"epoch": 0.18635332921047207,
"grad_norm": 0.3080686628818512,
"learning_rate": 4.567144220838923e-05,
"loss": 0.6759,
"step": 226
},
{
"epoch": 0.18717790146361574,
"grad_norm": 0.31016868352890015,
"learning_rate": 4.4510046663617996e-05,
"loss": 0.6937,
"step": 227
},
{
"epoch": 0.18800247371675943,
"grad_norm": 0.30470389127731323,
"learning_rate": 4.336103083258942e-05,
"loss": 0.6954,
"step": 228
},
{
"epoch": 0.1888270459699031,
"grad_norm": 0.3189505338668823,
"learning_rate": 4.2224529557405645e-05,
"loss": 0.7103,
"step": 229
},
{
"epoch": 0.1896516182230468,
"grad_norm": 0.31451600790023804,
"learning_rate": 4.1100676211530404e-05,
"loss": 0.7146,
"step": 230
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.3073336184024811,
"learning_rate": 3.998960268413666e-05,
"loss": 0.6761,
"step": 231
},
{
"epoch": 0.19130076272933416,
"grad_norm": 0.31194430589675903,
"learning_rate": 3.889143936462914e-05,
"loss": 0.6936,
"step": 232
},
{
"epoch": 0.19212533498247783,
"grad_norm": 0.30823662877082825,
"learning_rate": 3.780631512734241e-05,
"loss": 0.6915,
"step": 233
},
{
"epoch": 0.19294990723562153,
"grad_norm": 0.31047096848487854,
"learning_rate": 3.673435731641691e-05,
"loss": 0.6986,
"step": 234
},
{
"epoch": 0.1937744794887652,
"grad_norm": 0.3224363625049591,
"learning_rate": 3.567569173085454e-05,
"loss": 0.7141,
"step": 235
},
{
"epoch": 0.1945990517419089,
"grad_norm": 0.3098570704460144,
"learning_rate": 3.463044260975566e-05,
"loss": 0.6893,
"step": 236
},
{
"epoch": 0.19542362399505256,
"grad_norm": 0.30459511280059814,
"learning_rate": 3.3598732617739036e-05,
"loss": 0.6608,
"step": 237
},
{
"epoch": 0.19624819624819625,
"grad_norm": 0.32793962955474854,
"learning_rate": 3.258068283054666e-05,
"loss": 0.7535,
"step": 238
},
{
"epoch": 0.19707276850133992,
"grad_norm": 0.31355416774749756,
"learning_rate": 3.1576412720834746e-05,
"loss": 0.6545,
"step": 239
},
{
"epoch": 0.19789734075448362,
"grad_norm": 0.3191066086292267,
"learning_rate": 3.058604014415343e-05,
"loss": 0.6362,
"step": 240
},
{
"epoch": 0.1987219130076273,
"grad_norm": 0.3128627836704254,
"learning_rate": 2.960968132511567e-05,
"loss": 0.625,
"step": 241
},
{
"epoch": 0.19954648526077098,
"grad_norm": 0.33767595887184143,
"learning_rate": 2.8647450843757897e-05,
"loss": 0.7637,
"step": 242
},
{
"epoch": 0.20037105751391465,
"grad_norm": 0.32248735427856445,
"learning_rate": 2.7699461622093304e-05,
"loss": 0.6197,
"step": 243
},
{
"epoch": 0.20119562976705835,
"grad_norm": 0.33070605993270874,
"learning_rate": 2.67658249108603e-05,
"loss": 0.6475,
"step": 244
},
{
"epoch": 0.20202020202020202,
"grad_norm": 0.3260677754878998,
"learning_rate": 2.584665027646643e-05,
"loss": 0.6493,
"step": 245
},
{
"epoch": 0.2028447742733457,
"grad_norm": 0.35679200291633606,
"learning_rate": 2.49420455881305e-05,
"loss": 0.6593,
"step": 246
},
{
"epoch": 0.20366934652648938,
"grad_norm": 0.35772812366485596,
"learning_rate": 2.4052117005223455e-05,
"loss": 0.7124,
"step": 247
},
{
"epoch": 0.20449391877963308,
"grad_norm": 0.3646053671836853,
"learning_rate": 2.317696896481024e-05,
"loss": 0.6362,
"step": 248
},
{
"epoch": 0.20531849103277675,
"grad_norm": 0.4201851487159729,
"learning_rate": 2.231670416939364e-05,
"loss": 0.7586,
"step": 249
},
{
"epoch": 0.20614306328592044,
"grad_norm": 0.7209053635597229,
"learning_rate": 2.147142357486164e-05,
"loss": 0.7507,
"step": 250
},
{
"epoch": 0.20614306328592044,
"eval_loss": 0.6745719313621521,
"eval_runtime": 42.7078,
"eval_samples_per_second": 9.272,
"eval_steps_per_second": 9.272,
"step": 250
},
{
"epoch": 0.2069676355390641,
"grad_norm": 0.2964789569377899,
"learning_rate": 2.0641226378639715e-05,
"loss": 0.6769,
"step": 251
},
{
"epoch": 0.2077922077922078,
"grad_norm": 0.2993400990962982,
"learning_rate": 1.9826210008049785e-05,
"loss": 0.6628,
"step": 252
},
{
"epoch": 0.20861678004535147,
"grad_norm": 0.3071148991584778,
"learning_rate": 1.902647010887655e-05,
"loss": 0.6987,
"step": 253
},
{
"epoch": 0.20944135229849514,
"grad_norm": 0.3106604814529419,
"learning_rate": 1.8242100534143062e-05,
"loss": 0.6984,
"step": 254
},
{
"epoch": 0.21026592455163884,
"grad_norm": 0.3057470917701721,
"learning_rate": 1.7473193333096575e-05,
"loss": 0.6721,
"step": 255
},
{
"epoch": 0.2110904968047825,
"grad_norm": 0.319230854511261,
"learning_rate": 1.671983874040631e-05,
"loss": 0.6974,
"step": 256
},
{
"epoch": 0.2119150690579262,
"grad_norm": 0.3045044243335724,
"learning_rate": 1.598212516557394e-05,
"loss": 0.6795,
"step": 257
},
{
"epoch": 0.21273964131106987,
"grad_norm": 0.3158814013004303,
"learning_rate": 1.526013918255836e-05,
"loss": 0.7189,
"step": 258
},
{
"epoch": 0.21356421356421357,
"grad_norm": 0.3174242377281189,
"learning_rate": 1.4553965519615723e-05,
"loss": 0.6901,
"step": 259
},
{
"epoch": 0.21438878581735724,
"grad_norm": 0.3104889392852783,
"learning_rate": 1.3863687049356464e-05,
"loss": 0.6771,
"step": 260
},
{
"epoch": 0.21521335807050093,
"grad_norm": 0.3152613043785095,
"learning_rate": 1.3189384779019535e-05,
"loss": 0.7524,
"step": 261
},
{
"epoch": 0.2160379303236446,
"grad_norm": 0.32586848735809326,
"learning_rate": 1.25311378409661e-05,
"loss": 0.733,
"step": 262
},
{
"epoch": 0.2168625025767883,
"grad_norm": 0.30279600620269775,
"learning_rate": 1.1889023483392879e-05,
"loss": 0.626,
"step": 263
},
{
"epoch": 0.21768707482993196,
"grad_norm": 0.31115543842315674,
"learning_rate": 1.1263117061266675e-05,
"loss": 0.6681,
"step": 264
},
{
"epoch": 0.21851164708307566,
"grad_norm": 0.31493860483169556,
"learning_rate": 1.0653492027481286e-05,
"loss": 0.6506,
"step": 265
},
{
"epoch": 0.21933621933621933,
"grad_norm": 0.3196756839752197,
"learning_rate": 1.0060219924237379e-05,
"loss": 0.712,
"step": 266
},
{
"epoch": 0.22016079158936303,
"grad_norm": 0.3244759738445282,
"learning_rate": 9.48337037464666e-06,
"loss": 0.7272,
"step": 267
},
{
"epoch": 0.2209853638425067,
"grad_norm": 0.30338799953460693,
"learning_rate": 8.923011074561404e-06,
"loss": 0.6724,
"step": 268
},
{
"epoch": 0.2218099360956504,
"grad_norm": 0.31627827882766724,
"learning_rate": 8.379207784630004e-06,
"loss": 0.6856,
"step": 269
},
{
"epoch": 0.22263450834879406,
"grad_norm": 0.31490376591682434,
"learning_rate": 7.852024322579648e-06,
"loss": 0.6818,
"step": 270
},
{
"epoch": 0.22345908060193775,
"grad_norm": 0.3102516829967499,
"learning_rate": 7.34152255572697e-06,
"loss": 0.6574,
"step": 271
},
{
"epoch": 0.22428365285508142,
"grad_norm": 0.3195677101612091,
"learning_rate": 6.847762393717782e-06,
"loss": 0.7313,
"step": 272
},
{
"epoch": 0.22510822510822512,
"grad_norm": 0.3145836591720581,
"learning_rate": 6.370801781496326e-06,
"loss": 0.7007,
"step": 273
},
{
"epoch": 0.2259327973613688,
"grad_norm": 0.3069111704826355,
"learning_rate": 5.910696692505201e-06,
"loss": 0.6735,
"step": 274
},
{
"epoch": 0.22675736961451248,
"grad_norm": 0.31376153230667114,
"learning_rate": 5.467501122116563e-06,
"loss": 0.7041,
"step": 275
},
{
"epoch": 0.22758194186765615,
"grad_norm": 0.30935463309288025,
"learning_rate": 5.0412670812956465e-06,
"loss": 0.6993,
"step": 276
},
{
"epoch": 0.22840651412079985,
"grad_norm": 0.30352145433425903,
"learning_rate": 4.6320445904969475e-06,
"loss": 0.674,
"step": 277
},
{
"epoch": 0.22923108637394352,
"grad_norm": 0.317184716463089,
"learning_rate": 4.239881673794165e-06,
"loss": 0.7051,
"step": 278
},
{
"epoch": 0.2300556586270872,
"grad_norm": 0.3102111220359802,
"learning_rate": 3.864824353244367e-06,
"loss": 0.6934,
"step": 279
},
{
"epoch": 0.23088023088023088,
"grad_norm": 0.31064939498901367,
"learning_rate": 3.506916643487001e-06,
"loss": 0.7003,
"step": 280
},
{
"epoch": 0.23170480313337455,
"grad_norm": 0.3119175434112549,
"learning_rate": 3.166200546578718e-06,
"loss": 0.6886,
"step": 281
},
{
"epoch": 0.23252937538651824,
"grad_norm": 0.31088459491729736,
"learning_rate": 2.8427160470641253e-06,
"loss": 0.6805,
"step": 282
},
{
"epoch": 0.2333539476396619,
"grad_norm": 0.3150649070739746,
"learning_rate": 2.5365011072835117e-06,
"loss": 0.7153,
"step": 283
},
{
"epoch": 0.2341785198928056,
"grad_norm": 0.31142279505729675,
"learning_rate": 2.2475916629177415e-06,
"loss": 0.7158,
"step": 284
},
{
"epoch": 0.23500309214594928,
"grad_norm": 0.3094358444213867,
"learning_rate": 1.9760216187710787e-06,
"loss": 0.6481,
"step": 285
},
{
"epoch": 0.23582766439909297,
"grad_norm": 0.3161930441856384,
"learning_rate": 1.7218228447922867e-06,
"loss": 0.7038,
"step": 286
},
{
"epoch": 0.23665223665223664,
"grad_norm": 0.3040229082107544,
"learning_rate": 1.4850251723345196e-06,
"loss": 0.6384,
"step": 287
},
{
"epoch": 0.23747680890538034,
"grad_norm": 0.30395784974098206,
"learning_rate": 1.2656563906545902e-06,
"loss": 0.6362,
"step": 288
},
{
"epoch": 0.238301381158524,
"grad_norm": 0.30829399824142456,
"learning_rate": 1.0637422436516274e-06,
"loss": 0.6538,
"step": 289
},
{
"epoch": 0.2391259534116677,
"grad_norm": 0.32518744468688965,
"learning_rate": 8.793064268460604e-07,
"loss": 0.7038,
"step": 290
},
{
"epoch": 0.23995052566481137,
"grad_norm": 0.3239896893501282,
"learning_rate": 7.123705845987093e-07,
"loss": 0.6706,
"step": 291
},
{
"epoch": 0.24077509791795507,
"grad_norm": 0.31476303935050964,
"learning_rate": 5.629543075708176e-07,
"loss": 0.6519,
"step": 292
},
{
"epoch": 0.24159967017109873,
"grad_norm": 0.3170252740383148,
"learning_rate": 4.310751304249738e-07,
"loss": 0.6642,
"step": 293
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.31761467456817627,
"learning_rate": 3.167485297673411e-07,
"loss": 0.6443,
"step": 294
},
{
"epoch": 0.2432488146773861,
"grad_norm": 0.32578980922698975,
"learning_rate": 2.1998792233142714e-07,
"loss": 0.7081,
"step": 295
},
{
"epoch": 0.2440733869305298,
"grad_norm": 0.32927650213241577,
"learning_rate": 1.4080466340349316e-07,
"loss": 0.6772,
"step": 296
},
{
"epoch": 0.24489795918367346,
"grad_norm": 0.3357813358306885,
"learning_rate": 7.92080454900701e-08,
"loss": 0.6857,
"step": 297
},
{
"epoch": 0.24572253143681716,
"grad_norm": 0.3505016565322876,
"learning_rate": 3.5205297227380855e-08,
"loss": 0.7116,
"step": 298
},
{
"epoch": 0.24654710368996083,
"grad_norm": 0.37650617957115173,
"learning_rate": 8.801582533035644e-09,
"loss": 0.6173,
"step": 299
},
{
"epoch": 0.24737167594310452,
"grad_norm": 0.5433653593063354,
"learning_rate": 0.0,
"loss": 0.6145,
"step": 300
},
{
"epoch": 0.24737167594310452,
"eval_loss": 0.6729200482368469,
"eval_runtime": 42.7292,
"eval_samples_per_second": 9.268,
"eval_steps_per_second": 9.268,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 300,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.406657706577101e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}