{ "best_metric": 0.6729200482368469, "best_model_checkpoint": "miner_id_24/checkpoint-300", "epoch": 0.24737167594310452, "eval_steps": 50, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008245722531436817, "grad_norm": 0.4181750416755676, "learning_rate": 2.9999999999999997e-05, "loss": 0.7232, "step": 1 }, { "epoch": 0.0008245722531436817, "eval_loss": 0.8246814608573914, "eval_runtime": 42.8717, "eval_samples_per_second": 9.237, "eval_steps_per_second": 9.237, "step": 1 }, { "epoch": 0.0016491445062873633, "grad_norm": 0.40615200996398926, "learning_rate": 5.9999999999999995e-05, "loss": 0.8142, "step": 2 }, { "epoch": 0.0024737167594310453, "grad_norm": 0.37523695826530457, "learning_rate": 8.999999999999999e-05, "loss": 0.7982, "step": 3 }, { "epoch": 0.0032982890125747267, "grad_norm": 0.36036789417266846, "learning_rate": 0.00011999999999999999, "loss": 0.836, "step": 4 }, { "epoch": 0.0041228612657184084, "grad_norm": 0.394962877035141, "learning_rate": 0.00015, "loss": 0.8198, "step": 5 }, { "epoch": 0.004947433518862091, "grad_norm": 0.416413277387619, "learning_rate": 0.00017999999999999998, "loss": 0.7857, "step": 6 }, { "epoch": 0.005772005772005772, "grad_norm": 0.5225532650947571, "learning_rate": 0.00020999999999999998, "loss": 0.7836, "step": 7 }, { "epoch": 0.006596578025149453, "grad_norm": 0.39612719416618347, "learning_rate": 0.00023999999999999998, "loss": 0.7467, "step": 8 }, { "epoch": 0.0074211502782931356, "grad_norm": 0.3856079578399658, "learning_rate": 0.00027, "loss": 0.7699, "step": 9 }, { "epoch": 0.008245722531436817, "grad_norm": 0.3804507851600647, "learning_rate": 0.0003, "loss": 0.7913, "step": 10 }, { "epoch": 0.009070294784580499, "grad_norm": 0.3713577389717102, "learning_rate": 0.0002999911984174669, "loss": 0.7259, "step": 11 }, { "epoch": 0.009894867037724181, "grad_norm": 0.3443286418914795, "learning_rate": 0.0002999647947027726, "loss": 0.7223, "step": 12 }, { "epoch": 0.010719439290867862, "grad_norm": 0.32728496193885803, "learning_rate": 0.0002999207919545099, "loss": 0.7663, "step": 13 }, { "epoch": 0.011544011544011544, "grad_norm": 0.3246591091156006, "learning_rate": 0.0002998591953365965, "loss": 0.749, "step": 14 }, { "epoch": 0.012368583797155226, "grad_norm": 0.32717230916023254, "learning_rate": 0.00029978001207766854, "loss": 0.7442, "step": 15 }, { "epoch": 0.013193156050298907, "grad_norm": 0.3290279805660248, "learning_rate": 0.00029968325147023263, "loss": 0.7572, "step": 16 }, { "epoch": 0.014017728303442589, "grad_norm": 0.35251685976982117, "learning_rate": 0.000299568924869575, "loss": 0.7737, "step": 17 }, { "epoch": 0.014842300556586271, "grad_norm": 0.3338168263435364, "learning_rate": 0.00029943704569242917, "loss": 0.7588, "step": 18 }, { "epoch": 0.01566687280972995, "grad_norm": 0.32795682549476624, "learning_rate": 0.0002992876294154013, "loss": 0.7875, "step": 19 }, { "epoch": 0.016491445062873634, "grad_norm": 0.3186565339565277, "learning_rate": 0.00029912069357315393, "loss": 0.7336, "step": 20 }, { "epoch": 0.017316017316017316, "grad_norm": 0.3066052794456482, "learning_rate": 0.00029893625775634835, "loss": 0.7219, "step": 21 }, { "epoch": 0.018140589569160998, "grad_norm": 0.31656959652900696, "learning_rate": 0.0002987343436093454, "loss": 0.7583, "step": 22 }, { "epoch": 0.01896516182230468, "grad_norm": 0.3124566674232483, "learning_rate": 0.00029851497482766547, "loss": 0.7673, "step": 23 }, { "epoch": 0.019789734075448363, "grad_norm": 0.30470794439315796, "learning_rate": 0.00029827817715520773, "loss": 0.7122, "step": 24 }, { "epoch": 0.02061430632859204, "grad_norm": 0.3141001760959625, "learning_rate": 0.0002980239783812289, "loss": 0.8052, "step": 25 }, { "epoch": 0.021438878581735724, "grad_norm": 0.32485461235046387, "learning_rate": 0.0002977524083370822, "loss": 0.7482, "step": 26 }, { "epoch": 0.022263450834879406, "grad_norm": 0.31203949451446533, "learning_rate": 0.00029746349889271645, "loss": 0.6784, "step": 27 }, { "epoch": 0.023088023088023088, "grad_norm": 0.32462912797927856, "learning_rate": 0.0002971572839529358, "loss": 0.767, "step": 28 }, { "epoch": 0.02391259534116677, "grad_norm": 0.3174619972705841, "learning_rate": 0.00029683379945342125, "loss": 0.7283, "step": 29 }, { "epoch": 0.024737167594310452, "grad_norm": 0.33072593808174133, "learning_rate": 0.000296493083356513, "loss": 0.8054, "step": 30 }, { "epoch": 0.025561739847454135, "grad_norm": 0.31485095620155334, "learning_rate": 0.00029613517564675565, "loss": 0.7499, "step": 31 }, { "epoch": 0.026386312100597813, "grad_norm": 0.3060151934623718, "learning_rate": 0.0002957601183262058, "loss": 0.7342, "step": 32 }, { "epoch": 0.027210884353741496, "grad_norm": 0.31382423639297485, "learning_rate": 0.000295367955409503, "loss": 0.7529, "step": 33 }, { "epoch": 0.028035456606885178, "grad_norm": 0.3076959550380707, "learning_rate": 0.00029495873291870436, "loss": 0.6956, "step": 34 }, { "epoch": 0.02886002886002886, "grad_norm": 0.31915611028671265, "learning_rate": 0.0002945324988778834, "loss": 0.7885, "step": 35 }, { "epoch": 0.029684601113172542, "grad_norm": 0.3209708034992218, "learning_rate": 0.00029408930330749477, "loss": 0.7536, "step": 36 }, { "epoch": 0.030509173366316224, "grad_norm": 0.33205971121788025, "learning_rate": 0.0002936291982185036, "loss": 0.7633, "step": 37 }, { "epoch": 0.0313337456194599, "grad_norm": 0.3191162049770355, "learning_rate": 0.00029315223760628217, "loss": 0.7437, "step": 38 }, { "epoch": 0.032158317872603585, "grad_norm": 0.34463781118392944, "learning_rate": 0.00029265847744427303, "loss": 0.7586, "step": 39 }, { "epoch": 0.03298289012574727, "grad_norm": 0.3288670778274536, "learning_rate": 0.00029214797567742035, "loss": 0.7812, "step": 40 }, { "epoch": 0.03380746237889095, "grad_norm": 0.3351598381996155, "learning_rate": 0.00029162079221537, "loss": 0.785, "step": 41 }, { "epoch": 0.03463203463203463, "grad_norm": 0.3358539938926697, "learning_rate": 0.0002910769889254386, "loss": 0.7145, "step": 42 }, { "epoch": 0.035456606885178314, "grad_norm": 0.3250412344932556, "learning_rate": 0.0002905166296253533, "loss": 0.7333, "step": 43 }, { "epoch": 0.036281179138321996, "grad_norm": 0.324008584022522, "learning_rate": 0.0002899397800757626, "loss": 0.7508, "step": 44 }, { "epoch": 0.03710575139146568, "grad_norm": 0.3390953242778778, "learning_rate": 0.0002893465079725187, "loss": 0.6994, "step": 45 }, { "epoch": 0.03793032364460936, "grad_norm": 0.3244946002960205, "learning_rate": 0.0002887368829387333, "loss": 0.6945, "step": 46 }, { "epoch": 0.03875489589775304, "grad_norm": 0.34154826402664185, "learning_rate": 0.0002881109765166071, "loss": 0.7437, "step": 47 }, { "epoch": 0.039579468150896725, "grad_norm": 0.36960309743881226, "learning_rate": 0.00028746886215903387, "loss": 0.7878, "step": 48 }, { "epoch": 0.04040404040404041, "grad_norm": 0.39132601022720337, "learning_rate": 0.00028681061522098047, "loss": 0.731, "step": 49 }, { "epoch": 0.04122861265718408, "grad_norm": 0.68941730260849, "learning_rate": 0.0002861363129506435, "loss": 0.7756, "step": 50 }, { "epoch": 0.04122861265718408, "eval_loss": 0.7312861084938049, "eval_runtime": 42.6476, "eval_samples_per_second": 9.285, "eval_steps_per_second": 9.285, "step": 50 }, { "epoch": 0.042053184910327765, "grad_norm": 0.3118657171726227, "learning_rate": 0.0002854460344803842, "loss": 0.6581, "step": 51 }, { "epoch": 0.04287775716347145, "grad_norm": 0.33690398931503296, "learning_rate": 0.00028473986081744163, "loss": 0.7433, "step": 52 }, { "epoch": 0.04370232941661513, "grad_norm": 0.33627963066101074, "learning_rate": 0.000284017874834426, "loss": 0.73, "step": 53 }, { "epoch": 0.04452690166975881, "grad_norm": 0.3204888105392456, "learning_rate": 0.0002832801612595937, "loss": 0.7398, "step": 54 }, { "epoch": 0.045351473922902494, "grad_norm": 0.3057969808578491, "learning_rate": 0.0002825268066669034, "loss": 0.6966, "step": 55 }, { "epoch": 0.046176046176046176, "grad_norm": 0.30388593673706055, "learning_rate": 0.00028175789946585693, "loss": 0.672, "step": 56 }, { "epoch": 0.04700061842918986, "grad_norm": 0.31205031275749207, "learning_rate": 0.0002809735298911234, "loss": 0.7422, "step": 57 }, { "epoch": 0.04782519068233354, "grad_norm": 0.317719042301178, "learning_rate": 0.00028017378999195015, "loss": 0.7359, "step": 58 }, { "epoch": 0.04864976293547722, "grad_norm": 0.3247784674167633, "learning_rate": 0.0002793587736213603, "loss": 0.688, "step": 59 }, { "epoch": 0.049474335188620905, "grad_norm": 0.32107478380203247, "learning_rate": 0.00027852857642513836, "loss": 0.7123, "step": 60 }, { "epoch": 0.05029890744176459, "grad_norm": 0.3145858645439148, "learning_rate": 0.00027768329583060635, "loss": 0.7274, "step": 61 }, { "epoch": 0.05112347969490827, "grad_norm": 0.3061201870441437, "learning_rate": 0.00027682303103518976, "loss": 0.7098, "step": 62 }, { "epoch": 0.05194805194805195, "grad_norm": 0.31068190932273865, "learning_rate": 0.00027594788299477655, "loss": 0.7176, "step": 63 }, { "epoch": 0.05277262420119563, "grad_norm": 0.32404160499572754, "learning_rate": 0.0002750579544118695, "loss": 0.7304, "step": 64 }, { "epoch": 0.05359719645433931, "grad_norm": 0.3103507459163666, "learning_rate": 0.00027415334972353357, "loss": 0.7112, "step": 65 }, { "epoch": 0.05442176870748299, "grad_norm": 0.31706494092941284, "learning_rate": 0.0002732341750891397, "loss": 0.6998, "step": 66 }, { "epoch": 0.05524634096062667, "grad_norm": 0.30976295471191406, "learning_rate": 0.00027230053837790666, "loss": 0.671, "step": 67 }, { "epoch": 0.056070913213770356, "grad_norm": 0.3198986351490021, "learning_rate": 0.0002713525491562421, "loss": 0.7438, "step": 68 }, { "epoch": 0.05689548546691404, "grad_norm": 0.3304222822189331, "learning_rate": 0.0002703903186748843, "loss": 0.814, "step": 69 }, { "epoch": 0.05772005772005772, "grad_norm": 0.3097413182258606, "learning_rate": 0.00026941395985584653, "loss": 0.6695, "step": 70 }, { "epoch": 0.0585446299732014, "grad_norm": 0.31618732213974, "learning_rate": 0.00026842358727916524, "loss": 0.739, "step": 71 }, { "epoch": 0.059369202226345084, "grad_norm": 0.3159608542919159, "learning_rate": 0.0002674193171694533, "loss": 0.7172, "step": 72 }, { "epoch": 0.06019377447948877, "grad_norm": 0.30797278881073, "learning_rate": 0.0002664012673822609, "loss": 0.7187, "step": 73 }, { "epoch": 0.06101834673263245, "grad_norm": 0.3251792788505554, "learning_rate": 0.0002653695573902443, "loss": 0.796, "step": 74 }, { "epoch": 0.06184291898577613, "grad_norm": 0.3200572431087494, "learning_rate": 0.0002643243082691454, "loss": 0.7268, "step": 75 }, { "epoch": 0.0626674912389198, "grad_norm": 0.29941457509994507, "learning_rate": 0.0002632656426835831, "loss": 0.6928, "step": 76 }, { "epoch": 0.06349206349206349, "grad_norm": 0.3021852672100067, "learning_rate": 0.00026219368487265753, "loss": 0.7257, "step": 77 }, { "epoch": 0.06431663574520717, "grad_norm": 0.30670538544654846, "learning_rate": 0.00026110856063537083, "loss": 0.699, "step": 78 }, { "epoch": 0.06514120799835085, "grad_norm": 0.3165689706802368, "learning_rate": 0.00026001039731586334, "loss": 0.7149, "step": 79 }, { "epoch": 0.06596578025149454, "grad_norm": 0.3253694474697113, "learning_rate": 0.0002588993237884696, "loss": 0.8077, "step": 80 }, { "epoch": 0.06679035250463822, "grad_norm": 0.31187790632247925, "learning_rate": 0.00025777547044259435, "loss": 0.7574, "step": 81 }, { "epoch": 0.0676149247577819, "grad_norm": 0.32256990671157837, "learning_rate": 0.0002566389691674106, "loss": 0.7692, "step": 82 }, { "epoch": 0.06843949701092558, "grad_norm": 0.3136816918849945, "learning_rate": 0.00025548995333638197, "loss": 0.7452, "step": 83 }, { "epoch": 0.06926406926406926, "grad_norm": 0.3113918602466583, "learning_rate": 0.00025432855779161076, "loss": 0.711, "step": 84 }, { "epoch": 0.07008864151721295, "grad_norm": 0.30899491906166077, "learning_rate": 0.00025315491882801347, "loss": 0.7414, "step": 85 }, { "epoch": 0.07091321377035663, "grad_norm": 0.31038275361061096, "learning_rate": 0.00025196917417732615, "loss": 0.69, "step": 86 }, { "epoch": 0.07173778602350031, "grad_norm": 0.32005515694618225, "learning_rate": 0.0002507714629919409, "loss": 0.7283, "step": 87 }, { "epoch": 0.07256235827664399, "grad_norm": 0.31575390696525574, "learning_rate": 0.0002495619258285757, "loss": 0.6974, "step": 88 }, { "epoch": 0.07338693052978768, "grad_norm": 0.3133029341697693, "learning_rate": 0.0002483407046317794, "loss": 0.7083, "step": 89 }, { "epoch": 0.07421150278293136, "grad_norm": 0.32530999183654785, "learning_rate": 0.00024710794271727413, "loss": 0.7165, "step": 90 }, { "epoch": 0.07503607503607504, "grad_norm": 0.31330084800720215, "learning_rate": 0.0002458637847551364, "loss": 0.6542, "step": 91 }, { "epoch": 0.07586064728921872, "grad_norm": 0.3188161849975586, "learning_rate": 0.00024460837675281926, "loss": 0.6734, "step": 92 }, { "epoch": 0.0766852195423624, "grad_norm": 0.3343876600265503, "learning_rate": 0.00024334186603801807, "loss": 0.7359, "step": 93 }, { "epoch": 0.07750979179550609, "grad_norm": 0.33376866579055786, "learning_rate": 0.00024206440124138062, "loss": 0.6967, "step": 94 }, { "epoch": 0.07833436404864977, "grad_norm": 0.3637959957122803, "learning_rate": 0.0002407761322790648, "loss": 0.7499, "step": 95 }, { "epoch": 0.07915893630179345, "grad_norm": 0.335549533367157, "learning_rate": 0.00023947721033514512, "loss": 0.7196, "step": 96 }, { "epoch": 0.07998350855493713, "grad_norm": 0.35548627376556396, "learning_rate": 0.00023816778784387094, "loss": 0.737, "step": 97 }, { "epoch": 0.08080808080808081, "grad_norm": 0.3780907094478607, "learning_rate": 0.0002368480184717773, "loss": 0.7565, "step": 98 }, { "epoch": 0.08163265306122448, "grad_norm": 0.40498775243759155, "learning_rate": 0.00023551805709965147, "loss": 0.6692, "step": 99 }, { "epoch": 0.08245722531436817, "grad_norm": 0.8926903605461121, "learning_rate": 0.00023417805980435736, "loss": 0.4799, "step": 100 }, { "epoch": 0.08245722531436817, "eval_loss": 0.7042385935783386, "eval_runtime": 42.7803, "eval_samples_per_second": 9.257, "eval_steps_per_second": 9.257, "step": 100 }, { "epoch": 0.08328179756751185, "grad_norm": 0.29144763946533203, "learning_rate": 0.00023282818384051866, "loss": 0.6653, "step": 101 }, { "epoch": 0.08410636982065553, "grad_norm": 0.290532648563385, "learning_rate": 0.00023146858762206489, "loss": 0.6903, "step": 102 }, { "epoch": 0.08493094207379921, "grad_norm": 0.304470956325531, "learning_rate": 0.00023009943070364044, "loss": 0.6943, "step": 103 }, { "epoch": 0.0857555143269429, "grad_norm": 0.31192028522491455, "learning_rate": 0.0002287208737618801, "loss": 0.7152, "step": 104 }, { "epoch": 0.08658008658008658, "grad_norm": 0.30607473850250244, "learning_rate": 0.00022733307857655325, "loss": 0.7248, "step": 105 }, { "epoch": 0.08740465883323026, "grad_norm": 0.2998904883861542, "learning_rate": 0.00022593620801157808, "loss": 0.7291, "step": 106 }, { "epoch": 0.08822923108637394, "grad_norm": 0.3096725344657898, "learning_rate": 0.00022453042599590882, "loss": 0.7241, "step": 107 }, { "epoch": 0.08905380333951762, "grad_norm": 0.3047013282775879, "learning_rate": 0.00022311589750429787, "loss": 0.7206, "step": 108 }, { "epoch": 0.0898783755926613, "grad_norm": 0.3195563554763794, "learning_rate": 0.00022169278853793545, "loss": 0.7217, "step": 109 }, { "epoch": 0.09070294784580499, "grad_norm": 0.32428956031799316, "learning_rate": 0.00022026126610496852, "loss": 0.6962, "step": 110 }, { "epoch": 0.09152752009894867, "grad_norm": 0.31494930386543274, "learning_rate": 0.0002188214982009016, "loss": 0.6812, "step": 111 }, { "epoch": 0.09235209235209235, "grad_norm": 0.329039603471756, "learning_rate": 0.00021737365378888187, "loss": 0.7871, "step": 112 }, { "epoch": 0.09317666460523603, "grad_norm": 0.3266455829143524, "learning_rate": 0.00021591790277987043, "loss": 0.7367, "step": 113 }, { "epoch": 0.09400123685837972, "grad_norm": 0.31767553091049194, "learning_rate": 0.00021445441601270276, "loss": 0.7702, "step": 114 }, { "epoch": 0.0948258091115234, "grad_norm": 0.32063761353492737, "learning_rate": 0.00021298336523403968, "loss": 0.7514, "step": 115 }, { "epoch": 0.09565038136466708, "grad_norm": 0.297237366437912, "learning_rate": 0.0002115049230782124, "loss": 0.6745, "step": 116 }, { "epoch": 0.09647495361781076, "grad_norm": 0.30952492356300354, "learning_rate": 0.00021001926304696296, "loss": 0.6953, "step": 117 }, { "epoch": 0.09729952587095445, "grad_norm": 0.3084593415260315, "learning_rate": 0.00020852655948908316, "loss": 0.7184, "step": 118 }, { "epoch": 0.09812409812409813, "grad_norm": 0.30032604932785034, "learning_rate": 0.0002070269875799538, "loss": 0.6477, "step": 119 }, { "epoch": 0.09894867037724181, "grad_norm": 0.3085014224052429, "learning_rate": 0.00020552072330098716, "loss": 0.7137, "step": 120 }, { "epoch": 0.09977324263038549, "grad_norm": 0.3050251007080078, "learning_rate": 0.0002040079434189748, "loss": 0.6947, "step": 121 }, { "epoch": 0.10059781488352917, "grad_norm": 0.29092761874198914, "learning_rate": 0.00020248882546534326, "loss": 0.6166, "step": 122 }, { "epoch": 0.10142238713667286, "grad_norm": 0.32282838225364685, "learning_rate": 0.00020096354771531976, "loss": 0.7807, "step": 123 }, { "epoch": 0.10224695938981654, "grad_norm": 0.3140111565589905, "learning_rate": 0.00019943228916701104, "loss": 0.7271, "step": 124 }, { "epoch": 0.10307153164296022, "grad_norm": 0.3199913203716278, "learning_rate": 0.00019789522952039695, "loss": 0.6998, "step": 125 }, { "epoch": 0.1038961038961039, "grad_norm": 0.3230836093425751, "learning_rate": 0.0001963525491562421, "loss": 0.7354, "step": 126 }, { "epoch": 0.10472067614924757, "grad_norm": 0.32371896505355835, "learning_rate": 0.00019480442911492702, "loss": 0.7583, "step": 127 }, { "epoch": 0.10554524840239125, "grad_norm": 0.3246193826198578, "learning_rate": 0.00019325105107520263, "loss": 0.7296, "step": 128 }, { "epoch": 0.10636982065553494, "grad_norm": 0.3160342872142792, "learning_rate": 0.00019169259733286913, "loss": 0.7091, "step": 129 }, { "epoch": 0.10719439290867862, "grad_norm": 0.3068806231021881, "learning_rate": 0.00019012925077938314, "loss": 0.7068, "step": 130 }, { "epoch": 0.1080189651618223, "grad_norm": 0.31874704360961914, "learning_rate": 0.0001885611948803941, "loss": 0.7188, "step": 131 }, { "epoch": 0.10884353741496598, "grad_norm": 0.30816522240638733, "learning_rate": 0.0001869886136542143, "loss": 0.7145, "step": 132 }, { "epoch": 0.10966810966810966, "grad_norm": 0.3057398498058319, "learning_rate": 0.00018541169165022298, "loss": 0.6788, "step": 133 }, { "epoch": 0.11049268192125335, "grad_norm": 0.30414751172065735, "learning_rate": 0.00018383061392720913, "loss": 0.6881, "step": 134 }, { "epoch": 0.11131725417439703, "grad_norm": 0.31472885608673096, "learning_rate": 0.0001822455660316536, "loss": 0.7089, "step": 135 }, { "epoch": 0.11214182642754071, "grad_norm": 0.309356153011322, "learning_rate": 0.00018065673397595473, "loss": 0.6732, "step": 136 }, { "epoch": 0.1129663986806844, "grad_norm": 0.30579566955566406, "learning_rate": 0.00017906430421659876, "loss": 0.6453, "step": 137 }, { "epoch": 0.11379097093382808, "grad_norm": 0.3306417465209961, "learning_rate": 0.00017746846363227842, "loss": 0.7327, "step": 138 }, { "epoch": 0.11461554318697176, "grad_norm": 0.32207512855529785, "learning_rate": 0.00017586939950196186, "loss": 0.677, "step": 139 }, { "epoch": 0.11544011544011544, "grad_norm": 0.3170906603336334, "learning_rate": 0.00017426729948291474, "loss": 0.6764, "step": 140 }, { "epoch": 0.11626468769325912, "grad_norm": 0.32157909870147705, "learning_rate": 0.00017266235158867752, "loss": 0.7016, "step": 141 }, { "epoch": 0.1170892599464028, "grad_norm": 0.3478415012359619, "learning_rate": 0.00017105474416700164, "loss": 0.6956, "step": 142 }, { "epoch": 0.11791383219954649, "grad_norm": 0.32598310708999634, "learning_rate": 0.0001694446658777458, "loss": 0.6716, "step": 143 }, { "epoch": 0.11873840445269017, "grad_norm": 0.3289110064506531, "learning_rate": 0.00016783230567073596, "loss": 0.7107, "step": 144 }, { "epoch": 0.11956297670583385, "grad_norm": 0.34824198484420776, "learning_rate": 0.00016621785276359127, "loss": 0.6887, "step": 145 }, { "epoch": 0.12038754895897753, "grad_norm": 0.35514163970947266, "learning_rate": 0.0001646014966195185, "loss": 0.7147, "step": 146 }, { "epoch": 0.12121212121212122, "grad_norm": 0.3446742296218872, "learning_rate": 0.00016298342692507763, "loss": 0.6324, "step": 147 }, { "epoch": 0.1220366934652649, "grad_norm": 0.3650054633617401, "learning_rate": 0.00016136383356792156, "loss": 0.6782, "step": 148 }, { "epoch": 0.12286126571840858, "grad_norm": 0.3771435022354126, "learning_rate": 0.0001597429066145116, "loss": 0.6823, "step": 149 }, { "epoch": 0.12368583797155226, "grad_norm": 0.6211467981338501, "learning_rate": 0.0001581208362878126, "loss": 0.6651, "step": 150 }, { "epoch": 0.12368583797155226, "eval_loss": 0.6936895847320557, "eval_runtime": 42.757, "eval_samples_per_second": 9.262, "eval_steps_per_second": 9.262, "step": 150 }, { "epoch": 0.12451041022469594, "grad_norm": 0.3050060570240021, "learning_rate": 0.00015649781294496933, "loss": 0.5971, "step": 151 }, { "epoch": 0.1253349824778396, "grad_norm": 0.31458163261413574, "learning_rate": 0.00015487402705496707, "loss": 0.7042, "step": 152 }, { "epoch": 0.1261595547309833, "grad_norm": 0.30443075299263, "learning_rate": 0.0001532496691762796, "loss": 0.6492, "step": 153 }, { "epoch": 0.12698412698412698, "grad_norm": 0.31165623664855957, "learning_rate": 0.00015162492993450597, "loss": 0.7077, "step": 154 }, { "epoch": 0.12780869923727067, "grad_norm": 0.3184472322463989, "learning_rate": 0.00015, "loss": 0.7271, "step": 155 }, { "epoch": 0.12863327149041434, "grad_norm": 0.31272202730178833, "learning_rate": 0.00014837507006549403, "loss": 0.7611, "step": 156 }, { "epoch": 0.12945784374355804, "grad_norm": 0.3030358850955963, "learning_rate": 0.00014675033082372038, "loss": 0.7276, "step": 157 }, { "epoch": 0.1302824159967017, "grad_norm": 0.31098514795303345, "learning_rate": 0.00014512597294503293, "loss": 0.6906, "step": 158 }, { "epoch": 0.1311069882498454, "grad_norm": 0.3137834370136261, "learning_rate": 0.00014350218705503067, "loss": 0.7106, "step": 159 }, { "epoch": 0.13193156050298907, "grad_norm": 0.3150627315044403, "learning_rate": 0.00014187916371218736, "loss": 0.7156, "step": 160 }, { "epoch": 0.13275613275613277, "grad_norm": 0.31015968322753906, "learning_rate": 0.00014025709338548836, "loss": 0.6631, "step": 161 }, { "epoch": 0.13358070500927643, "grad_norm": 0.29935508966445923, "learning_rate": 0.00013863616643207844, "loss": 0.6572, "step": 162 }, { "epoch": 0.13440527726242013, "grad_norm": 0.30627861618995667, "learning_rate": 0.00013701657307492235, "loss": 0.6925, "step": 163 }, { "epoch": 0.1352298495155638, "grad_norm": 0.32339316606521606, "learning_rate": 0.00013539850338048154, "loss": 0.7109, "step": 164 }, { "epoch": 0.1360544217687075, "grad_norm": 0.31456539034843445, "learning_rate": 0.00013378214723640876, "loss": 0.697, "step": 165 }, { "epoch": 0.13687899402185116, "grad_norm": 0.3141114413738251, "learning_rate": 0.00013216769432926404, "loss": 0.6922, "step": 166 }, { "epoch": 0.13770356627499486, "grad_norm": 0.32136693596839905, "learning_rate": 0.00013055533412225422, "loss": 0.7408, "step": 167 }, { "epoch": 0.13852813852813853, "grad_norm": 0.31578657031059265, "learning_rate": 0.00012894525583299833, "loss": 0.7104, "step": 168 }, { "epoch": 0.1393527107812822, "grad_norm": 0.3017723262310028, "learning_rate": 0.0001273376484113225, "loss": 0.672, "step": 169 }, { "epoch": 0.1401772830344259, "grad_norm": 0.3208543658256531, "learning_rate": 0.0001257327005170853, "loss": 0.7146, "step": 170 }, { "epoch": 0.14100185528756956, "grad_norm": 0.31153324246406555, "learning_rate": 0.00012413060049803814, "loss": 0.6276, "step": 171 }, { "epoch": 0.14182642754071326, "grad_norm": 0.3135003447532654, "learning_rate": 0.00012253153636772156, "loss": 0.6705, "step": 172 }, { "epoch": 0.14265099979385693, "grad_norm": 0.30405810475349426, "learning_rate": 0.00012093569578340124, "loss": 0.6602, "step": 173 }, { "epoch": 0.14347557204700062, "grad_norm": 0.32480090856552124, "learning_rate": 0.00011934326602404528, "loss": 0.6755, "step": 174 }, { "epoch": 0.1443001443001443, "grad_norm": 0.3095938563346863, "learning_rate": 0.00011775443396834638, "loss": 0.7024, "step": 175 }, { "epoch": 0.14512471655328799, "grad_norm": 0.32258421182632446, "learning_rate": 0.00011616938607279086, "loss": 0.7044, "step": 176 }, { "epoch": 0.14594928880643165, "grad_norm": 0.3185218870639801, "learning_rate": 0.00011458830834977698, "loss": 0.7224, "step": 177 }, { "epoch": 0.14677386105957535, "grad_norm": 0.31658223271369934, "learning_rate": 0.0001130113863457857, "loss": 0.7346, "step": 178 }, { "epoch": 0.14759843331271902, "grad_norm": 0.31286993622779846, "learning_rate": 0.00011143880511960584, "loss": 0.7063, "step": 179 }, { "epoch": 0.14842300556586271, "grad_norm": 0.30426645278930664, "learning_rate": 0.00010987074922061689, "loss": 0.6781, "step": 180 }, { "epoch": 0.14924757781900638, "grad_norm": 0.30204102396965027, "learning_rate": 0.00010830740266713087, "loss": 0.6914, "step": 181 }, { "epoch": 0.15007215007215008, "grad_norm": 0.3117939233779907, "learning_rate": 0.00010674894892479738, "loss": 0.714, "step": 182 }, { "epoch": 0.15089672232529375, "grad_norm": 0.30473530292510986, "learning_rate": 0.00010519557088507298, "loss": 0.6351, "step": 183 }, { "epoch": 0.15172129457843744, "grad_norm": 0.30228060483932495, "learning_rate": 0.0001036474508437579, "loss": 0.6095, "step": 184 }, { "epoch": 0.1525458668315811, "grad_norm": 0.32118985056877136, "learning_rate": 0.00010210477047960302, "loss": 0.7311, "step": 185 }, { "epoch": 0.1533704390847248, "grad_norm": 0.31862205266952515, "learning_rate": 0.00010056771083298893, "loss": 0.6728, "step": 186 }, { "epoch": 0.15419501133786848, "grad_norm": 0.3058205246925354, "learning_rate": 9.903645228468024e-05, "loss": 0.6495, "step": 187 }, { "epoch": 0.15501958359101217, "grad_norm": 0.3158295750617981, "learning_rate": 9.751117453465673e-05, "loss": 0.688, "step": 188 }, { "epoch": 0.15584415584415584, "grad_norm": 0.32295459508895874, "learning_rate": 9.59920565810252e-05, "loss": 0.7007, "step": 189 }, { "epoch": 0.15666872809729954, "grad_norm": 0.3153195083141327, "learning_rate": 9.447927669901282e-05, "loss": 0.6781, "step": 190 }, { "epoch": 0.1574933003504432, "grad_norm": 0.31914830207824707, "learning_rate": 9.297301242004618e-05, "loss": 0.6459, "step": 191 }, { "epoch": 0.1583178726035869, "grad_norm": 0.32512032985687256, "learning_rate": 9.14734405109168e-05, "loss": 0.708, "step": 192 }, { "epoch": 0.15914244485673057, "grad_norm": 0.3304908573627472, "learning_rate": 8.998073695303701e-05, "loss": 0.6703, "step": 193 }, { "epoch": 0.15996701710987427, "grad_norm": 0.33657538890838623, "learning_rate": 8.849507692178758e-05, "loss": 0.6989, "step": 194 }, { "epoch": 0.16079158936301793, "grad_norm": 0.33851417899131775, "learning_rate": 8.70166347659603e-05, "loss": 0.709, "step": 195 }, { "epoch": 0.16161616161616163, "grad_norm": 0.3366221785545349, "learning_rate": 8.554558398729725e-05, "loss": 0.6213, "step": 196 }, { "epoch": 0.1624407338693053, "grad_norm": 0.3471497595310211, "learning_rate": 8.408209722012956e-05, "loss": 0.7261, "step": 197 }, { "epoch": 0.16326530612244897, "grad_norm": 0.35795000195503235, "learning_rate": 8.262634621111818e-05, "loss": 0.639, "step": 198 }, { "epoch": 0.16408987837559266, "grad_norm": 0.3911006450653076, "learning_rate": 8.117850179909842e-05, "loss": 0.6943, "step": 199 }, { "epoch": 0.16491445062873633, "grad_norm": 0.5652897357940674, "learning_rate": 7.973873389503149e-05, "loss": 0.6569, "step": 200 }, { "epoch": 0.16491445062873633, "eval_loss": 0.6824482679367065, "eval_runtime": 42.7512, "eval_samples_per_second": 9.263, "eval_steps_per_second": 9.263, "step": 200 }, { "epoch": 0.16573902288188003, "grad_norm": 0.29706528782844543, "learning_rate": 7.830721146206451e-05, "loss": 0.5975, "step": 201 }, { "epoch": 0.1665635951350237, "grad_norm": 0.3147106170654297, "learning_rate": 7.688410249570214e-05, "loss": 0.7244, "step": 202 }, { "epoch": 0.1673881673881674, "grad_norm": 0.3083764314651489, "learning_rate": 7.54695740040912e-05, "loss": 0.6659, "step": 203 }, { "epoch": 0.16821273964131106, "grad_norm": 0.31014275550842285, "learning_rate": 7.406379198842189e-05, "loss": 0.6614, "step": 204 }, { "epoch": 0.16903731189445476, "grad_norm": 0.3195944130420685, "learning_rate": 7.266692142344672e-05, "loss": 0.7009, "step": 205 }, { "epoch": 0.16986188414759842, "grad_norm": 0.30758917331695557, "learning_rate": 7.127912623811993e-05, "loss": 0.6654, "step": 206 }, { "epoch": 0.17068645640074212, "grad_norm": 0.30025753378868103, "learning_rate": 6.990056929635957e-05, "loss": 0.6463, "step": 207 }, { "epoch": 0.1715110286538858, "grad_norm": 0.3022189736366272, "learning_rate": 6.853141237793506e-05, "loss": 0.6522, "step": 208 }, { "epoch": 0.17233560090702948, "grad_norm": 0.3084527850151062, "learning_rate": 6.717181615948126e-05, "loss": 0.7102, "step": 209 }, { "epoch": 0.17316017316017315, "grad_norm": 0.3123379647731781, "learning_rate": 6.582194019564266e-05, "loss": 0.6977, "step": 210 }, { "epoch": 0.17398474541331685, "grad_norm": 0.3006177842617035, "learning_rate": 6.448194290034848e-05, "loss": 0.6934, "step": 211 }, { "epoch": 0.17480931766646052, "grad_norm": 0.30770057439804077, "learning_rate": 6.315198152822272e-05, "loss": 0.6854, "step": 212 }, { "epoch": 0.1756338899196042, "grad_norm": 0.31282415986061096, "learning_rate": 6.183221215612904e-05, "loss": 0.7131, "step": 213 }, { "epoch": 0.17645846217274788, "grad_norm": 0.3112223148345947, "learning_rate": 6.052278966485491e-05, "loss": 0.7005, "step": 214 }, { "epoch": 0.17728303442589158, "grad_norm": 0.30379047989845276, "learning_rate": 5.922386772093526e-05, "loss": 0.6716, "step": 215 }, { "epoch": 0.17810760667903525, "grad_norm": 0.318124920129776, "learning_rate": 5.793559875861938e-05, "loss": 0.6676, "step": 216 }, { "epoch": 0.17893217893217894, "grad_norm": 0.32162636518478394, "learning_rate": 5.6658133961981894e-05, "loss": 0.716, "step": 217 }, { "epoch": 0.1797567511853226, "grad_norm": 0.2995055615901947, "learning_rate": 5.5391623247180744e-05, "loss": 0.6375, "step": 218 }, { "epoch": 0.1805813234384663, "grad_norm": 0.30936968326568604, "learning_rate": 5.413621524486363e-05, "loss": 0.6462, "step": 219 }, { "epoch": 0.18140589569160998, "grad_norm": 0.31474071741104126, "learning_rate": 5.289205728272586e-05, "loss": 0.7098, "step": 220 }, { "epoch": 0.18223046794475367, "grad_norm": 0.31727197766304016, "learning_rate": 5.165929536822059e-05, "loss": 0.7516, "step": 221 }, { "epoch": 0.18305504019789734, "grad_norm": 0.32454827427864075, "learning_rate": 5.043807417142436e-05, "loss": 0.7308, "step": 222 }, { "epoch": 0.18387961245104104, "grad_norm": 0.3066427409648895, "learning_rate": 4.922853700805909e-05, "loss": 0.6555, "step": 223 }, { "epoch": 0.1847041847041847, "grad_norm": 0.3160225749015808, "learning_rate": 4.8030825822673814e-05, "loss": 0.6733, "step": 224 }, { "epoch": 0.18552875695732837, "grad_norm": 0.30562353134155273, "learning_rate": 4.684508117198648e-05, "loss": 0.6771, "step": 225 }, { "epoch": 0.18635332921047207, "grad_norm": 0.3080686628818512, "learning_rate": 4.567144220838923e-05, "loss": 0.6759, "step": 226 }, { "epoch": 0.18717790146361574, "grad_norm": 0.31016868352890015, "learning_rate": 4.4510046663617996e-05, "loss": 0.6937, "step": 227 }, { "epoch": 0.18800247371675943, "grad_norm": 0.30470389127731323, "learning_rate": 4.336103083258942e-05, "loss": 0.6954, "step": 228 }, { "epoch": 0.1888270459699031, "grad_norm": 0.3189505338668823, "learning_rate": 4.2224529557405645e-05, "loss": 0.7103, "step": 229 }, { "epoch": 0.1896516182230468, "grad_norm": 0.31451600790023804, "learning_rate": 4.1100676211530404e-05, "loss": 0.7146, "step": 230 }, { "epoch": 0.19047619047619047, "grad_norm": 0.3073336184024811, "learning_rate": 3.998960268413666e-05, "loss": 0.6761, "step": 231 }, { "epoch": 0.19130076272933416, "grad_norm": 0.31194430589675903, "learning_rate": 3.889143936462914e-05, "loss": 0.6936, "step": 232 }, { "epoch": 0.19212533498247783, "grad_norm": 0.30823662877082825, "learning_rate": 3.780631512734241e-05, "loss": 0.6915, "step": 233 }, { "epoch": 0.19294990723562153, "grad_norm": 0.31047096848487854, "learning_rate": 3.673435731641691e-05, "loss": 0.6986, "step": 234 }, { "epoch": 0.1937744794887652, "grad_norm": 0.3224363625049591, "learning_rate": 3.567569173085454e-05, "loss": 0.7141, "step": 235 }, { "epoch": 0.1945990517419089, "grad_norm": 0.3098570704460144, "learning_rate": 3.463044260975566e-05, "loss": 0.6893, "step": 236 }, { "epoch": 0.19542362399505256, "grad_norm": 0.30459511280059814, "learning_rate": 3.3598732617739036e-05, "loss": 0.6608, "step": 237 }, { "epoch": 0.19624819624819625, "grad_norm": 0.32793962955474854, "learning_rate": 3.258068283054666e-05, "loss": 0.7535, "step": 238 }, { "epoch": 0.19707276850133992, "grad_norm": 0.31355416774749756, "learning_rate": 3.1576412720834746e-05, "loss": 0.6545, "step": 239 }, { "epoch": 0.19789734075448362, "grad_norm": 0.3191066086292267, "learning_rate": 3.058604014415343e-05, "loss": 0.6362, "step": 240 }, { "epoch": 0.1987219130076273, "grad_norm": 0.3128627836704254, "learning_rate": 2.960968132511567e-05, "loss": 0.625, "step": 241 }, { "epoch": 0.19954648526077098, "grad_norm": 0.33767595887184143, "learning_rate": 2.8647450843757897e-05, "loss": 0.7637, "step": 242 }, { "epoch": 0.20037105751391465, "grad_norm": 0.32248735427856445, "learning_rate": 2.7699461622093304e-05, "loss": 0.6197, "step": 243 }, { "epoch": 0.20119562976705835, "grad_norm": 0.33070605993270874, "learning_rate": 2.67658249108603e-05, "loss": 0.6475, "step": 244 }, { "epoch": 0.20202020202020202, "grad_norm": 0.3260677754878998, "learning_rate": 2.584665027646643e-05, "loss": 0.6493, "step": 245 }, { "epoch": 0.2028447742733457, "grad_norm": 0.35679200291633606, "learning_rate": 2.49420455881305e-05, "loss": 0.6593, "step": 246 }, { "epoch": 0.20366934652648938, "grad_norm": 0.35772812366485596, "learning_rate": 2.4052117005223455e-05, "loss": 0.7124, "step": 247 }, { "epoch": 0.20449391877963308, "grad_norm": 0.3646053671836853, "learning_rate": 2.317696896481024e-05, "loss": 0.6362, "step": 248 }, { "epoch": 0.20531849103277675, "grad_norm": 0.4201851487159729, "learning_rate": 2.231670416939364e-05, "loss": 0.7586, "step": 249 }, { "epoch": 0.20614306328592044, "grad_norm": 0.7209053635597229, "learning_rate": 2.147142357486164e-05, "loss": 0.7507, "step": 250 }, { "epoch": 0.20614306328592044, "eval_loss": 0.6745719313621521, "eval_runtime": 42.7078, "eval_samples_per_second": 9.272, "eval_steps_per_second": 9.272, "step": 250 }, { "epoch": 0.2069676355390641, "grad_norm": 0.2964789569377899, "learning_rate": 2.0641226378639715e-05, "loss": 0.6769, "step": 251 }, { "epoch": 0.2077922077922078, "grad_norm": 0.2993400990962982, "learning_rate": 1.9826210008049785e-05, "loss": 0.6628, "step": 252 }, { "epoch": 0.20861678004535147, "grad_norm": 0.3071148991584778, "learning_rate": 1.902647010887655e-05, "loss": 0.6987, "step": 253 }, { "epoch": 0.20944135229849514, "grad_norm": 0.3106604814529419, "learning_rate": 1.8242100534143062e-05, "loss": 0.6984, "step": 254 }, { "epoch": 0.21026592455163884, "grad_norm": 0.3057470917701721, "learning_rate": 1.7473193333096575e-05, "loss": 0.6721, "step": 255 }, { "epoch": 0.2110904968047825, "grad_norm": 0.319230854511261, "learning_rate": 1.671983874040631e-05, "loss": 0.6974, "step": 256 }, { "epoch": 0.2119150690579262, "grad_norm": 0.3045044243335724, "learning_rate": 1.598212516557394e-05, "loss": 0.6795, "step": 257 }, { "epoch": 0.21273964131106987, "grad_norm": 0.3158814013004303, "learning_rate": 1.526013918255836e-05, "loss": 0.7189, "step": 258 }, { "epoch": 0.21356421356421357, "grad_norm": 0.3174242377281189, "learning_rate": 1.4553965519615723e-05, "loss": 0.6901, "step": 259 }, { "epoch": 0.21438878581735724, "grad_norm": 0.3104889392852783, "learning_rate": 1.3863687049356464e-05, "loss": 0.6771, "step": 260 }, { "epoch": 0.21521335807050093, "grad_norm": 0.3152613043785095, "learning_rate": 1.3189384779019535e-05, "loss": 0.7524, "step": 261 }, { "epoch": 0.2160379303236446, "grad_norm": 0.32586848735809326, "learning_rate": 1.25311378409661e-05, "loss": 0.733, "step": 262 }, { "epoch": 0.2168625025767883, "grad_norm": 0.30279600620269775, "learning_rate": 1.1889023483392879e-05, "loss": 0.626, "step": 263 }, { "epoch": 0.21768707482993196, "grad_norm": 0.31115543842315674, "learning_rate": 1.1263117061266675e-05, "loss": 0.6681, "step": 264 }, { "epoch": 0.21851164708307566, "grad_norm": 0.31493860483169556, "learning_rate": 1.0653492027481286e-05, "loss": 0.6506, "step": 265 }, { "epoch": 0.21933621933621933, "grad_norm": 0.3196756839752197, "learning_rate": 1.0060219924237379e-05, "loss": 0.712, "step": 266 }, { "epoch": 0.22016079158936303, "grad_norm": 0.3244759738445282, "learning_rate": 9.48337037464666e-06, "loss": 0.7272, "step": 267 }, { "epoch": 0.2209853638425067, "grad_norm": 0.30338799953460693, "learning_rate": 8.923011074561404e-06, "loss": 0.6724, "step": 268 }, { "epoch": 0.2218099360956504, "grad_norm": 0.31627827882766724, "learning_rate": 8.379207784630004e-06, "loss": 0.6856, "step": 269 }, { "epoch": 0.22263450834879406, "grad_norm": 0.31490376591682434, "learning_rate": 7.852024322579648e-06, "loss": 0.6818, "step": 270 }, { "epoch": 0.22345908060193775, "grad_norm": 0.3102516829967499, "learning_rate": 7.34152255572697e-06, "loss": 0.6574, "step": 271 }, { "epoch": 0.22428365285508142, "grad_norm": 0.3195677101612091, "learning_rate": 6.847762393717782e-06, "loss": 0.7313, "step": 272 }, { "epoch": 0.22510822510822512, "grad_norm": 0.3145836591720581, "learning_rate": 6.370801781496326e-06, "loss": 0.7007, "step": 273 }, { "epoch": 0.2259327973613688, "grad_norm": 0.3069111704826355, "learning_rate": 5.910696692505201e-06, "loss": 0.6735, "step": 274 }, { "epoch": 0.22675736961451248, "grad_norm": 0.31376153230667114, "learning_rate": 5.467501122116563e-06, "loss": 0.7041, "step": 275 }, { "epoch": 0.22758194186765615, "grad_norm": 0.30935463309288025, "learning_rate": 5.0412670812956465e-06, "loss": 0.6993, "step": 276 }, { "epoch": 0.22840651412079985, "grad_norm": 0.30352145433425903, "learning_rate": 4.6320445904969475e-06, "loss": 0.674, "step": 277 }, { "epoch": 0.22923108637394352, "grad_norm": 0.317184716463089, "learning_rate": 4.239881673794165e-06, "loss": 0.7051, "step": 278 }, { "epoch": 0.2300556586270872, "grad_norm": 0.3102111220359802, "learning_rate": 3.864824353244367e-06, "loss": 0.6934, "step": 279 }, { "epoch": 0.23088023088023088, "grad_norm": 0.31064939498901367, "learning_rate": 3.506916643487001e-06, "loss": 0.7003, "step": 280 }, { "epoch": 0.23170480313337455, "grad_norm": 0.3119175434112549, "learning_rate": 3.166200546578718e-06, "loss": 0.6886, "step": 281 }, { "epoch": 0.23252937538651824, "grad_norm": 0.31088459491729736, "learning_rate": 2.8427160470641253e-06, "loss": 0.6805, "step": 282 }, { "epoch": 0.2333539476396619, "grad_norm": 0.3150649070739746, "learning_rate": 2.5365011072835117e-06, "loss": 0.7153, "step": 283 }, { "epoch": 0.2341785198928056, "grad_norm": 0.31142279505729675, "learning_rate": 2.2475916629177415e-06, "loss": 0.7158, "step": 284 }, { "epoch": 0.23500309214594928, "grad_norm": 0.3094358444213867, "learning_rate": 1.9760216187710787e-06, "loss": 0.6481, "step": 285 }, { "epoch": 0.23582766439909297, "grad_norm": 0.3161930441856384, "learning_rate": 1.7218228447922867e-06, "loss": 0.7038, "step": 286 }, { "epoch": 0.23665223665223664, "grad_norm": 0.3040229082107544, "learning_rate": 1.4850251723345196e-06, "loss": 0.6384, "step": 287 }, { "epoch": 0.23747680890538034, "grad_norm": 0.30395784974098206, "learning_rate": 1.2656563906545902e-06, "loss": 0.6362, "step": 288 }, { "epoch": 0.238301381158524, "grad_norm": 0.30829399824142456, "learning_rate": 1.0637422436516274e-06, "loss": 0.6538, "step": 289 }, { "epoch": 0.2391259534116677, "grad_norm": 0.32518744468688965, "learning_rate": 8.793064268460604e-07, "loss": 0.7038, "step": 290 }, { "epoch": 0.23995052566481137, "grad_norm": 0.3239896893501282, "learning_rate": 7.123705845987093e-07, "loss": 0.6706, "step": 291 }, { "epoch": 0.24077509791795507, "grad_norm": 0.31476303935050964, "learning_rate": 5.629543075708176e-07, "loss": 0.6519, "step": 292 }, { "epoch": 0.24159967017109873, "grad_norm": 0.3170252740383148, "learning_rate": 4.310751304249738e-07, "loss": 0.6642, "step": 293 }, { "epoch": 0.24242424242424243, "grad_norm": 0.31761467456817627, "learning_rate": 3.167485297673411e-07, "loss": 0.6443, "step": 294 }, { "epoch": 0.2432488146773861, "grad_norm": 0.32578980922698975, "learning_rate": 2.1998792233142714e-07, "loss": 0.7081, "step": 295 }, { "epoch": 0.2440733869305298, "grad_norm": 0.32927650213241577, "learning_rate": 1.4080466340349316e-07, "loss": 0.6772, "step": 296 }, { "epoch": 0.24489795918367346, "grad_norm": 0.3357813358306885, "learning_rate": 7.92080454900701e-08, "loss": 0.6857, "step": 297 }, { "epoch": 0.24572253143681716, "grad_norm": 0.3505016565322876, "learning_rate": 3.5205297227380855e-08, "loss": 0.7116, "step": 298 }, { "epoch": 0.24654710368996083, "grad_norm": 0.37650617957115173, "learning_rate": 8.801582533035644e-09, "loss": 0.6173, "step": 299 }, { "epoch": 0.24737167594310452, "grad_norm": 0.5433653593063354, "learning_rate": 0.0, "loss": 0.6145, "step": 300 }, { "epoch": 0.24737167594310452, "eval_loss": 0.6729200482368469, "eval_runtime": 42.7292, "eval_samples_per_second": 9.268, "eval_steps_per_second": 9.268, "step": 300 } ], "logging_steps": 1, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.406657706577101e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }