{
  "best_metric": 1.6927062273025513,
  "best_model_checkpoint": "miner_id_24/checkpoint-400",
  "epoch": 0.15939033198017583,
  "eval_steps": 100,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00039847582995043956,
      "grad_norm": 0.8249958157539368,
      "learning_rate": 2e-05,
      "loss": 3.7705,
      "step": 1
    },
    {
      "epoch": 0.00039847582995043956,
      "eval_loss": 3.994760274887085,
      "eval_runtime": 599.7212,
      "eval_samples_per_second": 14.097,
      "eval_steps_per_second": 3.525,
      "step": 1
    },
    {
      "epoch": 0.0007969516599008791,
      "grad_norm": 0.9214300513267517,
      "learning_rate": 4e-05,
      "loss": 3.9395,
      "step": 2
    },
    {
      "epoch": 0.0011954274898513188,
      "grad_norm": 0.7693865299224854,
      "learning_rate": 6e-05,
      "loss": 3.7306,
      "step": 3
    },
    {
      "epoch": 0.0015939033198017582,
      "grad_norm": 0.8011671304702759,
      "learning_rate": 8e-05,
      "loss": 3.3512,
      "step": 4
    },
    {
      "epoch": 0.001992379149752198,
      "grad_norm": 0.8984593749046326,
      "learning_rate": 0.0001,
      "loss": 3.6375,
      "step": 5
    },
    {
      "epoch": 0.0023908549797026376,
      "grad_norm": 1.0371809005737305,
      "learning_rate": 0.00012,
      "loss": 3.8592,
      "step": 6
    },
    {
      "epoch": 0.002789330809653077,
      "grad_norm": 1.1792114973068237,
      "learning_rate": 0.00014,
      "loss": 3.8321,
      "step": 7
    },
    {
      "epoch": 0.0031878066396035165,
      "grad_norm": 1.3046059608459473,
      "learning_rate": 0.00016,
      "loss": 3.3336,
      "step": 8
    },
    {
      "epoch": 0.003586282469553956,
      "grad_norm": 1.4238650798797607,
      "learning_rate": 0.00018,
      "loss": 2.9386,
      "step": 9
    },
    {
      "epoch": 0.003984758299504396,
      "grad_norm": 1.1028273105621338,
      "learning_rate": 0.0002,
      "loss": 2.7003,
      "step": 10
    },
    {
      "epoch": 0.0043832341294548355,
      "grad_norm": 1.1997171640396118,
      "learning_rate": 0.00019999741592564903,
      "loss": 2.4654,
      "step": 11
    },
    {
      "epoch": 0.004781709959405275,
      "grad_norm": 1.5336440801620483,
      "learning_rate": 0.00019998966383614488,
      "loss": 2.4423,
      "step": 12
    },
    {
      "epoch": 0.005180185789355715,
      "grad_norm": 1.8468828201293945,
      "learning_rate": 0.00019997674413212708,
      "loss": 2.4003,
      "step": 13
    },
    {
      "epoch": 0.005578661619306154,
      "grad_norm": 1.7681652307510376,
      "learning_rate": 0.00019995865748130516,
      "loss": 2.3207,
      "step": 14
    },
    {
      "epoch": 0.005977137449256593,
      "grad_norm": 1.0762815475463867,
      "learning_rate": 0.0001999354048184241,
      "loss": 2.2285,
      "step": 15
    },
    {
      "epoch": 0.006375613279207033,
      "grad_norm": 0.8622324466705322,
      "learning_rate": 0.00019990698734521613,
      "loss": 1.9898,
      "step": 16
    },
    {
      "epoch": 0.006774089109157473,
      "grad_norm": 0.9870002865791321,
      "learning_rate": 0.0001998734065303385,
      "loss": 1.923,
      "step": 17
    },
    {
      "epoch": 0.007172564939107912,
      "grad_norm": 1.0574088096618652,
      "learning_rate": 0.00019983466410929764,
      "loss": 2.08,
      "step": 18
    },
    {
      "epoch": 0.007571040769058352,
      "grad_norm": 0.8685367107391357,
      "learning_rate": 0.0001997907620843595,
      "loss": 1.9477,
      "step": 19
    },
    {
      "epoch": 0.007969516599008792,
      "grad_norm": 0.7225976586341858,
      "learning_rate": 0.00019974170272444604,
      "loss": 1.9725,
      "step": 20
    },
    {
      "epoch": 0.00836799242895923,
      "grad_norm": 0.7834375500679016,
      "learning_rate": 0.00019968748856501788,
      "loss": 1.9877,
      "step": 21
    },
    {
      "epoch": 0.008766468258909671,
      "grad_norm": 0.7260671257972717,
      "learning_rate": 0.00019962812240794343,
      "loss": 1.9946,
      "step": 22
    },
    {
      "epoch": 0.00916494408886011,
      "grad_norm": 0.8711982369422913,
      "learning_rate": 0.000199563607321354,
      "loss": 1.8275,
      "step": 23
    },
    {
      "epoch": 0.00956341991881055,
      "grad_norm": 0.5631409287452698,
      "learning_rate": 0.0001994939466394851,
      "loss": 1.7041,
      "step": 24
    },
    {
      "epoch": 0.009961895748760989,
      "grad_norm": 0.5470976829528809,
      "learning_rate": 0.00019941914396250446,
      "loss": 1.7129,
      "step": 25
    },
    {
      "epoch": 0.01036037157871143,
      "grad_norm": 0.5513713359832764,
      "learning_rate": 0.00019933920315632557,
      "loss": 1.8914,
      "step": 26
    },
    {
      "epoch": 0.010758847408661868,
      "grad_norm": 0.5356898903846741,
      "learning_rate": 0.00019925412835240826,
      "loss": 1.9072,
      "step": 27
    },
    {
      "epoch": 0.011157323238612307,
      "grad_norm": 0.5035513043403625,
      "learning_rate": 0.0001991639239475448,
      "loss": 1.9054,
      "step": 28
    },
    {
      "epoch": 0.011555799068562748,
      "grad_norm": 0.5602848529815674,
      "learning_rate": 0.00019906859460363307,
      "loss": 1.8175,
      "step": 29
    },
    {
      "epoch": 0.011954274898513187,
      "grad_norm": 0.5118929743766785,
      "learning_rate": 0.00019896814524743528,
      "loss": 1.7964,
      "step": 30
    },
    {
      "epoch": 0.012352750728463627,
      "grad_norm": 0.45721983909606934,
      "learning_rate": 0.0001988625810703235,
      "loss": 1.8377,
      "step": 31
    },
    {
      "epoch": 0.012751226558414066,
      "grad_norm": 0.44319745898246765,
      "learning_rate": 0.0001987519075280114,
      "loss": 1.8383,
      "step": 32
    },
    {
      "epoch": 0.013149702388364506,
      "grad_norm": 0.49430009722709656,
      "learning_rate": 0.00019863613034027224,
      "loss": 1.7603,
      "step": 33
    },
    {
      "epoch": 0.013548178218314945,
      "grad_norm": 0.4535483419895172,
      "learning_rate": 0.00019851525549064323,
      "loss": 1.6921,
      "step": 34
    },
    {
      "epoch": 0.013946654048265384,
      "grad_norm": 0.48142871260643005,
      "learning_rate": 0.00019838928922611632,
      "loss": 1.8217,
      "step": 35
    },
    {
      "epoch": 0.014345129878215825,
      "grad_norm": 0.6872768402099609,
      "learning_rate": 0.00019825823805681543,
      "loss": 1.8212,
      "step": 36
    },
    {
      "epoch": 0.014743605708166263,
      "grad_norm": 0.4731607437133789,
      "learning_rate": 0.0001981221087556598,
      "loss": 1.6508,
      "step": 37
    },
    {
      "epoch": 0.015142081538116704,
      "grad_norm": 0.42979615926742554,
      "learning_rate": 0.00019798090835801418,
      "loss": 1.4883,
      "step": 38
    },
    {
      "epoch": 0.015540557368067143,
      "grad_norm": 0.5217059254646301,
      "learning_rate": 0.00019783464416132506,
      "loss": 1.6702,
      "step": 39
    },
    {
      "epoch": 0.015939033198017583,
      "grad_norm": 0.5023307204246521,
      "learning_rate": 0.00019768332372474366,
      "loss": 1.7604,
      "step": 40
    },
    {
      "epoch": 0.016337509027968024,
      "grad_norm": 0.5037193894386292,
      "learning_rate": 0.00019752695486873517,
      "loss": 1.7254,
      "step": 41
    },
    {
      "epoch": 0.01673598485791846,
      "grad_norm": 0.4970403015613556,
      "learning_rate": 0.00019736554567467452,
      "loss": 1.726,
      "step": 42
    },
    {
      "epoch": 0.0171344606878689,
      "grad_norm": 0.4856554865837097,
      "learning_rate": 0.00019719910448442893,
      "loss": 1.5681,
      "step": 43
    },
    {
      "epoch": 0.017532936517819342,
      "grad_norm": 0.4940697252750397,
      "learning_rate": 0.00019702763989992662,
      "loss": 1.58,
      "step": 44
    },
    {
      "epoch": 0.01793141234776978,
      "grad_norm": 0.46332672238349915,
      "learning_rate": 0.00019685116078271223,
      "loss": 1.4937,
      "step": 45
    },
    {
      "epoch": 0.01832988817772022,
      "grad_norm": 0.4996075928211212,
      "learning_rate": 0.00019666967625348906,
      "loss": 1.8633,
      "step": 46
    },
    {
      "epoch": 0.01872836400767066,
      "grad_norm": 0.4581417143344879,
      "learning_rate": 0.00019648319569164736,
      "loss": 1.6708,
      "step": 47
    },
    {
      "epoch": 0.0191268398376211,
      "grad_norm": 0.49805447459220886,
      "learning_rate": 0.00019629172873477995,
      "loss": 1.6956,
      "step": 48
    },
    {
      "epoch": 0.019525315667571538,
      "grad_norm": 0.5399609208106995,
      "learning_rate": 0.0001960952852781838,
      "loss": 1.8536,
      "step": 49
    },
    {
      "epoch": 0.019923791497521978,
      "grad_norm": 0.5043647289276123,
      "learning_rate": 0.0001958938754743489,
      "loss": 1.9073,
      "step": 50
    },
    {
      "epoch": 0.02032226732747242,
      "grad_norm": 0.5041136741638184,
      "learning_rate": 0.0001956875097324334,
      "loss": 1.8891,
      "step": 51
    },
    {
      "epoch": 0.02072074315742286,
      "grad_norm": 0.48425522446632385,
      "learning_rate": 0.00019547619871772574,
      "loss": 1.7636,
      "step": 52
    },
    {
      "epoch": 0.021119218987373296,
      "grad_norm": 0.501112163066864,
      "learning_rate": 0.00019525995335109334,
      "loss": 1.6945,
      "step": 53
    },
    {
      "epoch": 0.021517694817323737,
      "grad_norm": 0.4796481132507324,
      "learning_rate": 0.0001950387848084183,
      "loss": 1.7757,
      "step": 54
    },
    {
      "epoch": 0.021916170647274177,
      "grad_norm": 0.5773142576217651,
      "learning_rate": 0.00019481270452001987,
      "loss": 1.7415,
      "step": 55
    },
    {
      "epoch": 0.022314646477224614,
      "grad_norm": 0.4900719225406647,
      "learning_rate": 0.00019458172417006347,
      "loss": 1.6834,
      "step": 56
    },
    {
      "epoch": 0.022713122307175055,
      "grad_norm": 0.4695626497268677,
      "learning_rate": 0.00019434585569595708,
      "loss": 1.7537,
      "step": 57
    },
    {
      "epoch": 0.023111598137125496,
      "grad_norm": 0.49179011583328247,
      "learning_rate": 0.00019410511128773418,
      "loss": 1.685,
      "step": 58
    },
    {
      "epoch": 0.023510073967075936,
      "grad_norm": 0.5011521577835083,
      "learning_rate": 0.0001938595033874238,
      "loss": 1.6088,
      "step": 59
    },
    {
      "epoch": 0.023908549797026373,
      "grad_norm": 0.47553250193595886,
      "learning_rate": 0.0001936090446884074,
      "loss": 1.6502,
      "step": 60
    },
    {
      "epoch": 0.024307025626976814,
      "grad_norm": 0.4945161044597626,
      "learning_rate": 0.00019335374813476302,
      "loss": 1.7799,
      "step": 61
    },
    {
      "epoch": 0.024705501456927254,
      "grad_norm": 0.5294601917266846,
      "learning_rate": 0.00019309362692059617,
      "loss": 1.6993,
      "step": 62
    },
    {
      "epoch": 0.02510397728687769,
      "grad_norm": 0.5426062941551208,
      "learning_rate": 0.00019282869448935798,
      "loss": 1.915,
      "step": 63
    },
    {
      "epoch": 0.025502453116828132,
      "grad_norm": 0.49963027238845825,
      "learning_rate": 0.00019255896453315052,
      "loss": 1.7709,
      "step": 64
    },
    {
      "epoch": 0.025900928946778572,
      "grad_norm": 0.5086182355880737,
      "learning_rate": 0.000192284450992019,
      "loss": 1.6436,
      "step": 65
    },
    {
      "epoch": 0.026299404776729013,
      "grad_norm": 0.4630321264266968,
      "learning_rate": 0.0001920051680532314,
      "loss": 1.6038,
      "step": 66
    },
    {
      "epoch": 0.02669788060667945,
      "grad_norm": 0.4538319408893585,
      "learning_rate": 0.00019172113015054532,
      "loss": 1.5576,
      "step": 67
    },
    {
      "epoch": 0.02709635643662989,
      "grad_norm": 0.511547863483429,
      "learning_rate": 0.0001914323519634619,
      "loss": 1.6114,
      "step": 68
    },
    {
      "epoch": 0.02749483226658033,
      "grad_norm": 0.5467154383659363,
      "learning_rate": 0.00019113884841646736,
      "loss": 1.8275,
      "step": 69
    },
    {
      "epoch": 0.027893308096530768,
      "grad_norm": 0.44893792271614075,
      "learning_rate": 0.00019084063467826137,
      "loss": 1.6006,
      "step": 70
    },
    {
      "epoch": 0.02829178392648121,
      "grad_norm": 0.4380510747432709,
      "learning_rate": 0.00019053772616097337,
      "loss": 1.5283,
      "step": 71
    },
    {
      "epoch": 0.02869025975643165,
      "grad_norm": 0.4855054020881653,
      "learning_rate": 0.000190230138519366,
      "loss": 1.949,
      "step": 72
    },
    {
      "epoch": 0.02908873558638209,
      "grad_norm": 0.509174108505249,
      "learning_rate": 0.000189917887650026,
      "loss": 1.9279,
      "step": 73
    },
    {
      "epoch": 0.029487211416332527,
      "grad_norm": 0.4690839946269989,
      "learning_rate": 0.00018960098969054255,
      "loss": 1.8505,
      "step": 74
    },
    {
      "epoch": 0.029885687246282967,
      "grad_norm": 0.5497538447380066,
      "learning_rate": 0.00018927946101867347,
      "loss": 1.9063,
      "step": 75
    },
    {
      "epoch": 0.030284163076233408,
      "grad_norm": 0.5328514575958252,
      "learning_rate": 0.0001889533182514986,
      "loss": 1.8781,
      "step": 76
    },
    {
      "epoch": 0.03068263890618385,
      "grad_norm": 0.42769238352775574,
      "learning_rate": 0.0001886225782445612,
      "loss": 1.5811,
      "step": 77
    },
    {
      "epoch": 0.031081114736134285,
      "grad_norm": 0.6662131547927856,
      "learning_rate": 0.00018828725809099655,
      "loss": 1.8193,
      "step": 78
    },
    {
      "epoch": 0.03147959056608472,
      "grad_norm": 0.4854792058467865,
      "learning_rate": 0.0001879473751206489,
      "loss": 1.6416,
      "step": 79
    },
    {
      "epoch": 0.031878066396035167,
      "grad_norm": 0.5450350046157837,
      "learning_rate": 0.00018760294689917553,
      "loss": 1.9132,
      "step": 80
    },
    {
      "epoch": 0.032276542225985604,
      "grad_norm": 0.5068654417991638,
      "learning_rate": 0.00018725399122713912,
      "loss": 1.7539,
      "step": 81
    },
    {
      "epoch": 0.03267501805593605,
      "grad_norm": 0.500575065612793,
      "learning_rate": 0.00018690052613908772,
      "loss": 1.7219,
      "step": 82
    },
    {
      "epoch": 0.033073493885886485,
      "grad_norm": 0.481229692697525,
      "learning_rate": 0.0001865425699026226,
      "loss": 1.6467,
      "step": 83
    },
    {
      "epoch": 0.03347196971583692,
      "grad_norm": 0.5118649005889893,
      "learning_rate": 0.00018618014101745442,
      "loss": 1.8061,
      "step": 84
    },
    {
      "epoch": 0.033870445545787366,
      "grad_norm": 0.5075271725654602,
      "learning_rate": 0.0001858132582144469,
      "loss": 1.7179,
      "step": 85
    },
    {
      "epoch": 0.0342689213757378,
      "grad_norm": 0.455424427986145,
      "learning_rate": 0.00018544194045464886,
      "loss": 1.5479,
      "step": 86
    },
    {
      "epoch": 0.03466739720568824,
      "grad_norm": 0.5024828910827637,
      "learning_rate": 0.00018506620692831428,
      "loss": 1.726,
      "step": 87
    },
    {
      "epoch": 0.035065873035638684,
      "grad_norm": 0.5479703545570374,
      "learning_rate": 0.0001846860770539105,
      "loss": 1.7982,
      "step": 88
    },
    {
      "epoch": 0.03546434886558912,
      "grad_norm": 0.4849150478839874,
      "learning_rate": 0.00018430157047711474,
      "loss": 1.6496,
      "step": 89
    },
    {
      "epoch": 0.03586282469553956,
      "grad_norm": 0.5472909212112427,
      "learning_rate": 0.00018391270706979862,
      "loss": 1.8333,
      "step": 90
    },
    {
      "epoch": 0.03626130052549,
      "grad_norm": 0.4992446005344391,
      "learning_rate": 0.00018351950692900126,
      "loss": 1.761,
      "step": 91
    },
    {
      "epoch": 0.03665977635544044,
      "grad_norm": 0.4649079144001007,
      "learning_rate": 0.00018312199037589068,
      "loss": 1.5256,
      "step": 92
    },
    {
      "epoch": 0.03705825218539088,
      "grad_norm": 0.5369842648506165,
      "learning_rate": 0.00018272017795471345,
      "loss": 1.78,
      "step": 93
    },
    {
      "epoch": 0.03745672801534132,
      "grad_norm": 0.4625012278556824,
      "learning_rate": 0.000182314090431733,
      "loss": 1.5817,
      "step": 94
    },
    {
      "epoch": 0.03785520384529176,
      "grad_norm": 0.5325396060943604,
      "learning_rate": 0.00018190374879415632,
      "loss": 1.698,
      "step": 95
    },
    {
      "epoch": 0.0382536796752422,
      "grad_norm": 0.4603791832923889,
      "learning_rate": 0.00018148917424904953,
      "loss": 1.5092,
      "step": 96
    },
    {
      "epoch": 0.03865215550519264,
      "grad_norm": 0.4525664746761322,
      "learning_rate": 0.0001810703882222415,
      "loss": 1.4083,
      "step": 97
    },
    {
      "epoch": 0.039050631335143075,
      "grad_norm": 0.5617400407791138,
      "learning_rate": 0.00018064741235721687,
      "loss": 1.6626,
      "step": 98
    },
    {
      "epoch": 0.03944910716509352,
      "grad_norm": 0.5286575555801392,
      "learning_rate": 0.00018022026851399737,
      "loss": 1.7368,
      "step": 99
    },
    {
      "epoch": 0.039847582995043956,
      "grad_norm": 0.5342997908592224,
      "learning_rate": 0.0001797889787680119,
      "loss": 1.8665,
      "step": 100
    },
    {
      "epoch": 0.039847582995043956,
      "eval_loss": 1.7500040531158447,
      "eval_runtime": 602.8533,
      "eval_samples_per_second": 14.023,
      "eval_steps_per_second": 3.507,
      "step": 100
    },
    {
      "epoch": 0.04024605882499439,
      "grad_norm": 0.4774135649204254,
      "learning_rate": 0.00017935356540895597,
      "loss": 1.6952,
      "step": 101
    },
    {
      "epoch": 0.04064453465494484,
      "grad_norm": 0.6040295362472534,
      "learning_rate": 0.00017891405093963938,
      "loss": 1.8068,
      "step": 102
    },
    {
      "epoch": 0.041043010484895275,
      "grad_norm": 0.5168588757514954,
      "learning_rate": 0.00017847045807482345,
      "loss": 1.7252,
      "step": 103
    },
    {
      "epoch": 0.04144148631484572,
      "grad_norm": 0.4739170968532562,
      "learning_rate": 0.00017802280974004716,
      "loss": 1.7146,
      "step": 104
    },
    {
      "epoch": 0.041839962144796156,
      "grad_norm": 0.5354509353637695,
      "learning_rate": 0.000177571129070442,
      "loss": 1.868,
      "step": 105
    },
    {
      "epoch": 0.04223843797474659,
      "grad_norm": 0.46404990553855896,
      "learning_rate": 0.00017711543940953668,
      "loss": 1.698,
      "step": 106
    },
    {
      "epoch": 0.04263691380469704,
      "grad_norm": 0.45692166686058044,
      "learning_rate": 0.00017665576430805053,
      "loss": 1.649,
      "step": 107
    },
    {
      "epoch": 0.043035389634647474,
      "grad_norm": 0.4770927131175995,
      "learning_rate": 0.0001761921275226763,
      "loss": 1.8552,
      "step": 108
    },
    {
      "epoch": 0.04343386546459791,
      "grad_norm": 0.4677036702632904,
      "learning_rate": 0.00017572455301485249,
      "loss": 1.6415,
      "step": 109
    },
    {
      "epoch": 0.043832341294548355,
      "grad_norm": 0.49166563153266907,
      "learning_rate": 0.00017525306494952498,
      "loss": 1.5864,
      "step": 110
    },
    {
      "epoch": 0.04423081712449879,
      "grad_norm": 0.551313579082489,
      "learning_rate": 0.0001747776876938981,
      "loss": 1.8246,
      "step": 111
    },
    {
      "epoch": 0.04462929295444923,
      "grad_norm": 0.5011689066886902,
      "learning_rate": 0.00017429844581617532,
      "loss": 1.6335,
      "step": 112
    },
    {
      "epoch": 0.04502776878439967,
      "grad_norm": 0.510704755783081,
      "learning_rate": 0.00017381536408428948,
      "loss": 1.7084,
      "step": 113
    },
    {
      "epoch": 0.04542624461435011,
      "grad_norm": 0.4545946419239044,
      "learning_rate": 0.00017332846746462288,
      "loss": 1.5209,
      "step": 114
    },
    {
      "epoch": 0.04582472044430055,
      "grad_norm": 0.4840458631515503,
      "learning_rate": 0.0001728377811207168,
      "loss": 1.7235,
      "step": 115
    },
    {
      "epoch": 0.04622319627425099,
      "grad_norm": 0.4970642626285553,
      "learning_rate": 0.00017234333041197126,
      "loss": 1.6567,
      "step": 116
    },
    {
      "epoch": 0.04662167210420143,
      "grad_norm": 0.5095712542533875,
      "learning_rate": 0.00017184514089233405,
      "loss": 1.7205,
      "step": 117
    },
    {
      "epoch": 0.04702014793415187,
      "grad_norm": 0.5349841713905334,
      "learning_rate": 0.00017134323830898037,
      "loss": 1.7285,
      "step": 118
    },
    {
      "epoch": 0.04741862376410231,
      "grad_norm": 0.5475637316703796,
      "learning_rate": 0.00017083764860098205,
      "loss": 1.6601,
      "step": 119
    },
    {
      "epoch": 0.047817099594052746,
      "grad_norm": 0.5360651016235352,
      "learning_rate": 0.0001703283978979671,
      "loss": 1.6062,
      "step": 120
    },
    {
      "epoch": 0.04821557542400319,
      "grad_norm": 0.49149248003959656,
      "learning_rate": 0.00016981551251876904,
      "loss": 1.7332,
      "step": 121
    },
    {
      "epoch": 0.04861405125395363,
      "grad_norm": 0.46091562509536743,
      "learning_rate": 0.00016929901897006698,
      "loss": 1.5933,
      "step": 122
    },
    {
      "epoch": 0.049012527083904064,
      "grad_norm": 0.5389519333839417,
      "learning_rate": 0.0001687789439450156,
      "loss": 1.9277,
      "step": 123
    },
    {
      "epoch": 0.04941100291385451,
      "grad_norm": 0.4960940182209015,
      "learning_rate": 0.00016825531432186543,
      "loss": 1.7398,
      "step": 124
    },
    {
      "epoch": 0.049809478743804945,
      "grad_norm": 0.5458736419677734,
      "learning_rate": 0.00016772815716257412,
      "loss": 1.7824,
      "step": 125
    },
    {
      "epoch": 0.05020795457375538,
      "grad_norm": 0.5217113494873047,
      "learning_rate": 0.00016719749971140754,
      "loss": 1.8169,
      "step": 126
    },
    {
      "epoch": 0.05060643040370583,
      "grad_norm": 0.49912309646606445,
      "learning_rate": 0.0001666633693935319,
      "loss": 1.7099,
      "step": 127
    },
    {
      "epoch": 0.051004906233656264,
      "grad_norm": 0.519611120223999,
      "learning_rate": 0.00016612579381359622,
      "loss": 1.5635,
      "step": 128
    },
    {
      "epoch": 0.05140338206360671,
      "grad_norm": 0.5147966146469116,
      "learning_rate": 0.00016558480075430594,
      "loss": 1.6643,
      "step": 129
    },
    {
      "epoch": 0.051801857893557145,
      "grad_norm": 0.539840817451477,
      "learning_rate": 0.00016504041817498678,
      "loss": 1.796,
      "step": 130
    },
    {
      "epoch": 0.05220033372350758,
      "grad_norm": 0.5166546106338501,
      "learning_rate": 0.00016449267421013994,
      "loss": 1.5948,
      "step": 131
    },
    {
      "epoch": 0.052598809553458026,
      "grad_norm": 0.5418827533721924,
      "learning_rate": 0.00016394159716798807,
      "loss": 1.8935,
      "step": 132
    },
    {
      "epoch": 0.05299728538340846,
      "grad_norm": 0.5098159313201904,
      "learning_rate": 0.00016338721552901212,
      "loss": 1.6439,
      "step": 133
    },
    {
      "epoch": 0.0533957612133589,
      "grad_norm": 0.5153954029083252,
      "learning_rate": 0.0001628295579444796,
      "loss": 1.7484,
      "step": 134
    },
    {
      "epoch": 0.053794237043309344,
      "grad_norm": 0.5499337315559387,
      "learning_rate": 0.0001622686532349637,
      "loss": 1.665,
      "step": 135
    },
    {
      "epoch": 0.05419271287325978,
      "grad_norm": 0.5195444226264954,
      "learning_rate": 0.00016170453038885394,
      "loss": 1.6813,
      "step": 136
    },
    {
      "epoch": 0.05459118870321022,
      "grad_norm": 0.5251967310905457,
      "learning_rate": 0.0001611372185608578,
      "loss": 1.7325,
      "step": 137
    },
    {
      "epoch": 0.05498966453316066,
      "grad_norm": 0.5916984677314758,
      "learning_rate": 0.0001605667470704942,
      "loss": 1.8083,
      "step": 138
    },
    {
      "epoch": 0.0553881403631111,
      "grad_norm": 0.5287485718727112,
      "learning_rate": 0.0001599931454005781,
      "loss": 1.8145,
      "step": 139
    },
    {
      "epoch": 0.055786616193061536,
      "grad_norm": 0.5018830299377441,
      "learning_rate": 0.00015941644319569665,
      "loss": 1.6038,
      "step": 140
    },
    {
      "epoch": 0.05618509202301198,
      "grad_norm": 0.4870772063732147,
      "learning_rate": 0.00015883667026067745,
      "loss": 1.6,
      "step": 141
    },
    {
      "epoch": 0.05658356785296242,
      "grad_norm": 0.4964452087879181,
      "learning_rate": 0.00015825385655904788,
      "loss": 1.7953,
      "step": 142
    },
    {
      "epoch": 0.05698204368291286,
      "grad_norm": 0.4588499367237091,
      "learning_rate": 0.00015766803221148673,
      "loss": 1.5708,
      "step": 143
    },
    {
      "epoch": 0.0573805195128633,
      "grad_norm": 0.521144449710846,
      "learning_rate": 0.00015707922749426737,
      "loss": 1.7794,
      "step": 144
    },
    {
      "epoch": 0.057778995342813735,
      "grad_norm": 0.5591409206390381,
      "learning_rate": 0.00015648747283769317,
      "loss": 1.9377,
      "step": 145
    },
    {
      "epoch": 0.05817747117276418,
      "grad_norm": 0.43348705768585205,
      "learning_rate": 0.00015589279882452476,
      "loss": 1.396,
      "step": 146
    },
    {
      "epoch": 0.058575947002714616,
      "grad_norm": 0.46277427673339844,
      "learning_rate": 0.0001552952361883994,
      "loss": 1.6397,
      "step": 147
    },
    {
      "epoch": 0.058974422832665054,
      "grad_norm": 0.49379056692123413,
      "learning_rate": 0.00015469481581224272,
      "loss": 1.7281,
      "step": 148
    },
    {
      "epoch": 0.0593728986626155,
      "grad_norm": 0.49816733598709106,
      "learning_rate": 0.00015409156872667258,
      "loss": 1.6383,
      "step": 149
    },
    {
      "epoch": 0.059771374492565935,
      "grad_norm": 0.44194599986076355,
      "learning_rate": 0.0001534855261083954,
      "loss": 1.552,
      "step": 150
    },
    {
      "epoch": 0.06016985032251637,
      "grad_norm": 0.4125038683414459,
      "learning_rate": 0.00015287671927859494,
      "loss": 1.4642,
      "step": 151
    },
    {
      "epoch": 0.060568326152466816,
      "grad_norm": 0.4800686836242676,
      "learning_rate": 0.00015226517970131343,
      "loss": 1.5824,
      "step": 152
    },
    {
      "epoch": 0.06096680198241725,
      "grad_norm": 0.5546180605888367,
      "learning_rate": 0.00015165093898182562,
      "loss": 1.8809,
      "step": 153
    },
    {
      "epoch": 0.0613652778123677,
      "grad_norm": 0.45037057995796204,
      "learning_rate": 0.00015103402886500525,
      "loss": 1.4952,
      "step": 154
    },
    {
      "epoch": 0.061763753642318134,
      "grad_norm": 0.531578004360199,
      "learning_rate": 0.00015041448123368455,
      "loss": 1.7191,
      "step": 155
    },
    {
      "epoch": 0.06216222947226857,
      "grad_norm": 0.48476216197013855,
      "learning_rate": 0.00014979232810700637,
      "loss": 1.7732,
      "step": 156
    },
    {
      "epoch": 0.06256070530221901,
      "grad_norm": 0.48990944027900696,
      "learning_rate": 0.0001491676016387694,
      "loss": 1.6629,
      "step": 157
    },
    {
      "epoch": 0.06295918113216945,
      "grad_norm": 0.5288809537887573,
      "learning_rate": 0.00014854033411576659,
      "loss": 1.6989,
      "step": 158
    },
    {
      "epoch": 0.0633576569621199,
      "grad_norm": 0.520767331123352,
      "learning_rate": 0.00014791055795611624,
      "loss": 1.6098,
      "step": 159
    },
    {
      "epoch": 0.06375613279207033,
      "grad_norm": 0.49702882766723633,
      "learning_rate": 0.00014727830570758678,
      "loss": 1.5923,
      "step": 160
    },
    {
      "epoch": 0.06415460862202077,
      "grad_norm": 0.555203378200531,
      "learning_rate": 0.0001466436100459146,
      "loss": 1.8835,
      "step": 161
    },
    {
      "epoch": 0.06455308445197121,
      "grad_norm": 0.49902498722076416,
      "learning_rate": 0.00014600650377311522,
      "loss": 1.6217,
      "step": 162
    },
    {
      "epoch": 0.06495156028192164,
      "grad_norm": 0.4641701877117157,
      "learning_rate": 0.0001453670198157883,
      "loss": 1.6337,
      "step": 163
    },
    {
      "epoch": 0.0653500361118721,
      "grad_norm": 0.5301873087882996,
      "learning_rate": 0.00014472519122341566,
      "loss": 1.7073,
      "step": 164
    },
    {
      "epoch": 0.06574851194182253,
      "grad_norm": 0.5231285095214844,
      "learning_rate": 0.00014408105116665336,
      "loss": 1.8093,
      "step": 165
    },
    {
      "epoch": 0.06614698777177297,
      "grad_norm": 0.45359745621681213,
      "learning_rate": 0.00014343463293561734,
      "loss": 1.6956,
      "step": 166
    },
    {
      "epoch": 0.0665454636017234,
      "grad_norm": 0.44360700249671936,
      "learning_rate": 0.00014278596993816308,
      "loss": 1.6548,
      "step": 167
    },
    {
      "epoch": 0.06694393943167384,
      "grad_norm": 0.4991564452648163,
      "learning_rate": 0.00014213509569815884,
      "loss": 1.7701,
      "step": 168
    },
    {
      "epoch": 0.06734241526162428,
      "grad_norm": 0.4743082821369171,
      "learning_rate": 0.00014148204385375321,
      "loss": 1.7401,
      "step": 169
    },
    {
      "epoch": 0.06774089109157473,
      "grad_norm": 0.5044605731964111,
      "learning_rate": 0.0001408268481556366,
      "loss": 1.6009,
      "step": 170
    },
    {
      "epoch": 0.06813936692152517,
      "grad_norm": 0.49447423219680786,
      "learning_rate": 0.00014016954246529696,
      "loss": 1.6208,
      "step": 171
    },
    {
      "epoch": 0.0685378427514756,
      "grad_norm": 0.46173223853111267,
      "learning_rate": 0.0001395101607532698,
      "loss": 1.6046,
      "step": 172
    },
    {
      "epoch": 0.06893631858142604,
      "grad_norm": 0.5149829983711243,
      "learning_rate": 0.00013884873709738257,
      "loss": 1.6744,
      "step": 173
    },
    {
      "epoch": 0.06933479441137648,
      "grad_norm": 0.4951649010181427,
      "learning_rate": 0.00013818530568099327,
      "loss": 1.7894,
      "step": 174
    },
    {
      "epoch": 0.06973327024132693,
      "grad_norm": 0.4963831603527069,
      "learning_rate": 0.00013751990079122412,
      "loss": 1.7119,
      "step": 175
    },
    {
      "epoch": 0.07013174607127737,
      "grad_norm": 0.48155760765075684,
      "learning_rate": 0.00013685255681718922,
      "loss": 1.6867,
      "step": 176
    },
    {
      "epoch": 0.0705302219012278,
      "grad_norm": 0.4654506742954254,
      "learning_rate": 0.0001361833082482175,
      "loss": 1.535,
      "step": 177
    },
    {
      "epoch": 0.07092869773117824,
      "grad_norm": 0.4506324827671051,
      "learning_rate": 0.0001355121896720703,
      "loss": 1.5751,
      "step": 178
    },
    {
      "epoch": 0.07132717356112868,
      "grad_norm": 0.5030463933944702,
      "learning_rate": 0.00013483923577315348,
      "loss": 1.7494,
      "step": 179
    },
    {
      "epoch": 0.07172564939107912,
      "grad_norm": 0.4498404264450073,
      "learning_rate": 0.00013416448133072526,
      "loss": 1.5368,
      "step": 180
    },
    {
      "epoch": 0.07212412522102957,
      "grad_norm": 0.5255778431892395,
      "learning_rate": 0.00013348796121709862,
      "loss": 1.6465,
      "step": 181
    },
    {
      "epoch": 0.07252260105098,
      "grad_norm": 0.566211998462677,
      "learning_rate": 0.00013280971039583906,
      "loss": 1.6151,
      "step": 182
    },
    {
      "epoch": 0.07292107688093044,
      "grad_norm": 0.5025095343589783,
      "learning_rate": 0.0001321297639199575,
      "loss": 1.6099,
      "step": 183
    },
    {
      "epoch": 0.07331955271088088,
      "grad_norm": 0.5001662969589233,
      "learning_rate": 0.000131448156930099,
      "loss": 1.5435,
      "step": 184
    },
    {
      "epoch": 0.07371802854083132,
      "grad_norm": 0.5104225277900696,
      "learning_rate": 0.0001307649246527263,
      "loss": 1.7083,
      "step": 185
    },
    {
      "epoch": 0.07411650437078177,
      "grad_norm": 0.4568442404270172,
      "learning_rate": 0.0001300801023982995,
      "loss": 1.555,
      "step": 186
    },
    {
      "epoch": 0.0745149802007322,
      "grad_norm": 0.5155336856842041,
      "learning_rate": 0.00012939372555945112,
      "loss": 1.6897,
      "step": 187
    },
    {
      "epoch": 0.07491345603068264,
      "grad_norm": 0.5319402813911438,
      "learning_rate": 0.0001287058296091567,
      "loss": 1.7909,
      "step": 188
    },
    {
      "epoch": 0.07531193186063308,
      "grad_norm": 0.5743489861488342,
      "learning_rate": 0.00012801645009890195,
      "loss": 1.9705,
      "step": 189
    },
    {
      "epoch": 0.07571040769058351,
      "grad_norm": 0.4790689945220947,
      "learning_rate": 0.0001273256226568451,
      "loss": 1.4495,
      "step": 190
    },
    {
      "epoch": 0.07610888352053395,
      "grad_norm": 0.47618716955184937,
      "learning_rate": 0.00012663338298597563,
      "loss": 1.7269,
      "step": 191
    },
    {
      "epoch": 0.0765073593504844,
      "grad_norm": 0.429544597864151,
      "learning_rate": 0.00012593976686226904,
      "loss": 1.488,
      "step": 192
    },
    {
      "epoch": 0.07690583518043484,
      "grad_norm": 0.5185227394104004,
      "learning_rate": 0.0001252448101328381,
      "loss": 1.6479,
      "step": 193
    },
    {
      "epoch": 0.07730431101038528,
      "grad_norm": 0.44675058126449585,
      "learning_rate": 0.00012454854871407994,
      "loss": 1.5254,
      "step": 194
    },
    {
      "epoch": 0.07770278684033571,
      "grad_norm": 0.5390235185623169,
      "learning_rate": 0.00012385101858982005,
      "loss": 1.4862,
      "step": 195
    },
    {
      "epoch": 0.07810126267028615,
      "grad_norm": 0.4766441583633423,
      "learning_rate": 0.00012315225580945252,
      "loss": 1.6441,
      "step": 196
    },
    {
      "epoch": 0.0784997385002366,
      "grad_norm": 0.46869808435440063,
      "learning_rate": 0.0001224522964860769,
      "loss": 1.7038,
      "step": 197
    },
    {
      "epoch": 0.07889821433018704,
      "grad_norm": 0.42946845293045044,
      "learning_rate": 0.00012175117679463187,
      "loss": 1.5641,
      "step": 198
    },
    {
      "epoch": 0.07929669016013748,
      "grad_norm": 0.4762909412384033,
      "learning_rate": 0.00012104893297002567,
      "loss": 1.6807,
      "step": 199
    },
    {
      "epoch": 0.07969516599008791,
      "grad_norm": 0.4462391138076782,
      "learning_rate": 0.0001203456013052634,
      "loss": 1.5805,
      "step": 200
    },
    {
      "epoch": 0.07969516599008791,
      "eval_loss": 1.7198740243911743,
      "eval_runtime": 603.1346,
      "eval_samples_per_second": 14.017,
      "eval_steps_per_second": 3.505,
      "step": 200
    },
    {
      "epoch": 0.08009364182003835,
      "grad_norm": 0.5120438933372498,
      "learning_rate": 0.00011964121814957137,
      "loss": 1.7626,
      "step": 201
    },
    {
      "epoch": 0.08049211764998879,
      "grad_norm": 0.5081213712692261,
      "learning_rate": 0.00011893581990651848,
      "loss": 1.7664,
      "step": 202
    },
    {
      "epoch": 0.08089059347993924,
      "grad_norm": 0.45303136110305786,
      "learning_rate": 0.00011822944303213486,
      "loss": 1.4845,
      "step": 203
    },
    {
      "epoch": 0.08128906930988967,
      "grad_norm": 0.4521328806877136,
      "learning_rate": 0.00011752212403302784,
      "loss": 1.4534,
      "step": 204
    },
    {
      "epoch": 0.08168754513984011,
      "grad_norm": 0.4870966970920563,
      "learning_rate": 0.00011681389946449504,
      "loss": 1.568,
      "step": 205
    },
    {
      "epoch": 0.08208602096979055,
      "grad_norm": 0.5694287419319153,
      "learning_rate": 0.00011610480592863531,
      "loss": 1.9597,
      "step": 206
    },
    {
      "epoch": 0.08248449679974099,
      "grad_norm": 0.5484179258346558,
      "learning_rate": 0.00011539488007245702,
      "loss": 1.8557,
      "step": 207
    },
    {
      "epoch": 0.08288297262969144,
      "grad_norm": 0.44166100025177,
      "learning_rate": 0.00011468415858598411,
      "loss": 1.5371,
      "step": 208
    },
    {
      "epoch": 0.08328144845964187,
      "grad_norm": 0.5365661978721619,
      "learning_rate": 0.00011397267820035986,
      "loss": 1.7778,
      "step": 209
    },
    {
      "epoch": 0.08367992428959231,
      "grad_norm": 0.45911017060279846,
      "learning_rate": 0.00011326047568594851,
      "loss": 1.5729,
      "step": 210
    },
    {
      "epoch": 0.08407840011954275,
      "grad_norm": 0.5417599678039551,
      "learning_rate": 0.00011254758785043515,
      "loss": 1.8296,
      "step": 211
    },
    {
      "epoch": 0.08447687594949319,
      "grad_norm": 0.48594942688941956,
      "learning_rate": 0.0001118340515369232,
      "loss": 1.7837,
      "step": 212
    },
    {
      "epoch": 0.08487535177944362,
      "grad_norm": 0.4888298511505127,
      "learning_rate": 0.00011111990362203033,
      "loss": 1.6575,
      "step": 213
    },
    {
      "epoch": 0.08527382760939407,
      "grad_norm": 0.5313907265663147,
      "learning_rate": 0.00011040518101398276,
      "loss": 1.7803,
      "step": 214
    },
    {
      "epoch": 0.08567230343934451,
      "grad_norm": 0.5065906643867493,
      "learning_rate": 0.00010968992065070769,
      "loss": 1.6539,
      "step": 215
    },
    {
      "epoch": 0.08607077926929495,
      "grad_norm": 0.46294957399368286,
      "learning_rate": 0.00010897415949792427,
      "loss": 1.6412,
      "step": 216
    },
    {
      "epoch": 0.08646925509924538,
      "grad_norm": 0.5068647861480713,
      "learning_rate": 0.00010825793454723325,
      "loss": 1.7173,
      "step": 217
    },
    {
      "epoch": 0.08686773092919582,
      "grad_norm": 0.45219966769218445,
      "learning_rate": 0.0001075412828142051,
      "loss": 1.4531,
      "step": 218
    },
    {
      "epoch": 0.08726620675914626,
      "grad_norm": 0.48035022616386414,
      "learning_rate": 0.0001068242413364671,
      "loss": 1.6187,
      "step": 219
    },
    {
      "epoch": 0.08766468258909671,
      "grad_norm": 0.5463985204696655,
      "learning_rate": 0.00010610684717178905,
      "loss": 1.749,
      "step": 220
    },
    {
      "epoch": 0.08806315841904715,
      "grad_norm": 0.4818764626979828,
      "learning_rate": 0.00010538913739616816,
      "loss": 1.4508,
      "step": 221
    },
    {
      "epoch": 0.08846163424899758,
      "grad_norm": 0.5018213987350464,
      "learning_rate": 0.00010467114910191289,
      "loss": 1.6853,
      "step": 222
    },
    {
      "epoch": 0.08886011007894802,
      "grad_norm": 0.5122075080871582,
      "learning_rate": 0.00010395291939572593,
      "loss": 1.6991,
      "step": 223
    },
    {
      "epoch": 0.08925858590889846,
      "grad_norm": 0.48191148042678833,
      "learning_rate": 0.00010323448539678653,
      "loss": 1.6428,
      "step": 224
    },
    {
      "epoch": 0.08965706173884891,
      "grad_norm": 0.4748276472091675,
      "learning_rate": 0.00010251588423483205,
      "loss": 1.7059,
      "step": 225
    },
    {
      "epoch": 0.09005553756879935,
      "grad_norm": 0.5150067806243896,
      "learning_rate": 0.0001017971530482392,
      "loss": 1.7409,
      "step": 226
    },
    {
      "epoch": 0.09045401339874978,
      "grad_norm": 0.5893855094909668,
      "learning_rate": 0.00010107832898210439,
      "loss": 1.7183,
      "step": 227
    },
    {
      "epoch": 0.09085248922870022,
      "grad_norm": 0.5195055603981018,
      "learning_rate": 0.00010035944918632429,
      "loss": 1.8396,
      "step": 228
    },
    {
      "epoch": 0.09125096505865066,
      "grad_norm": 0.5996953845024109,
      "learning_rate": 9.96405508136757e-05,
      "loss": 1.9944,
      "step": 229
    },
    {
      "epoch": 0.0916494408886011,
      "grad_norm": 0.5057780146598816,
      "learning_rate": 9.892167101789564e-05,
      "loss": 1.6473,
      "step": 230
    },
    {
      "epoch": 0.09204791671855155,
      "grad_norm": 0.46774283051490784,
      "learning_rate": 9.820284695176082e-05,
      "loss": 1.5973,
      "step": 231
    },
    {
      "epoch": 0.09244639254850198,
      "grad_norm": 0.46982142329216003,
      "learning_rate": 9.748411576516794e-05,
      "loss": 1.6464,
      "step": 232
    },
    {
      "epoch": 0.09284486837845242,
      "grad_norm": 0.4873621165752411,
      "learning_rate": 9.676551460321349e-05,
      "loss": 1.6629,
      "step": 233
    },
    {
      "epoch": 0.09324334420840286,
      "grad_norm": 0.4866909682750702,
      "learning_rate": 9.60470806042741e-05,
      "loss": 1.6262,
      "step": 234
    },
    {
      "epoch": 0.0936418200383533,
      "grad_norm": 0.5320809483528137,
      "learning_rate": 9.532885089808713e-05,
      "loss": 1.7158,
      "step": 235
    },
    {
      "epoch": 0.09404029586830374,
      "grad_norm": 0.47346270084381104,
      "learning_rate": 9.461086260383187e-05,
      "loss": 1.6044,
      "step": 236
    },
    {
      "epoch": 0.09443877169825418,
      "grad_norm": 0.5696609616279602,
      "learning_rate": 9.389315282821097e-05,
      "loss": 1.7883,
      "step": 237
    },
    {
      "epoch": 0.09483724752820462,
      "grad_norm": 0.4926949441432953,
      "learning_rate": 9.317575866353292e-05,
      "loss": 1.7306,
      "step": 238
    },
    {
      "epoch": 0.09523572335815506,
      "grad_norm": 0.5241943001747131,
      "learning_rate": 9.245871718579491e-05,
      "loss": 1.732,
      "step": 239
    },
    {
      "epoch": 0.09563419918810549,
      "grad_norm": 0.5425236225128174,
      "learning_rate": 9.174206545276677e-05,
      "loss": 1.6209,
      "step": 240
    },
    {
      "epoch": 0.09603267501805593,
      "grad_norm": 0.5216458439826965,
      "learning_rate": 9.102584050207578e-05,
      "loss": 1.74,
      "step": 241
    },
    {
      "epoch": 0.09643115084800638,
      "grad_norm": 0.5082316994667053,
      "learning_rate": 9.031007934929236e-05,
      "loss": 1.6836,
      "step": 242
    },
    {
      "epoch": 0.09682962667795682,
      "grad_norm": 0.48965132236480713,
      "learning_rate": 8.959481898601728e-05,
      "loss": 1.7055,
      "step": 243
    },
    {
      "epoch": 0.09722810250790725,
      "grad_norm": 0.514946699142456,
      "learning_rate": 8.888009637796968e-05,
      "loss": 1.684,
      "step": 244
    },
    {
      "epoch": 0.09762657833785769,
      "grad_norm": 0.551802396774292,
      "learning_rate": 8.81659484630768e-05,
      "loss": 1.8566,
      "step": 245
    },
    {
      "epoch": 0.09802505416780813,
      "grad_norm": 0.4790934920310974,
      "learning_rate": 8.745241214956483e-05,
      "loss": 1.6461,
      "step": 246
    },
    {
      "epoch": 0.09842352999775858,
      "grad_norm": 0.5450412631034851,
      "learning_rate": 8.673952431405148e-05,
      "loss": 1.7215,
      "step": 247
    },
    {
      "epoch": 0.09882200582770902,
      "grad_norm": 0.5299497842788696,
      "learning_rate": 8.602732179964017e-05,
      "loss": 1.7454,
      "step": 248
    },
    {
      "epoch": 0.09922048165765945,
      "grad_norm": 0.5010784268379211,
      "learning_rate": 8.531584141401591e-05,
      "loss": 1.6028,
      "step": 249
    },
    {
      "epoch": 0.09961895748760989,
      "grad_norm": 0.4926188886165619,
      "learning_rate": 8.4605119927543e-05,
      "loss": 1.6837,
      "step": 250
    },
    {
      "epoch": 0.10001743331756033,
      "grad_norm": 0.5703017115592957,
      "learning_rate": 8.38951940713647e-05,
      "loss": 1.8639,
      "step": 251
    },
    {
      "epoch": 0.10041590914751077,
      "grad_norm": 0.5429261326789856,
      "learning_rate": 8.318610053550497e-05,
      "loss": 1.7258,
      "step": 252
    },
    {
      "epoch": 0.10081438497746122,
      "grad_norm": 0.48338782787323,
      "learning_rate": 8.247787596697218e-05,
      "loss": 1.5873,
      "step": 253
    },
    {
      "epoch": 0.10121286080741165,
      "grad_norm": 0.506877601146698,
      "learning_rate": 8.177055696786516e-05,
      "loss": 1.6736,
      "step": 254
    },
    {
      "epoch": 0.10161133663736209,
      "grad_norm": 0.537820041179657,
      "learning_rate": 8.106418009348157e-05,
      "loss": 1.9075,
      "step": 255
    },
    {
      "epoch": 0.10200981246731253,
      "grad_norm": 0.4729152023792267,
      "learning_rate": 8.035878185042868e-05,
      "loss": 1.5359,
      "step": 256
    },
    {
      "epoch": 0.10240828829726296,
      "grad_norm": 0.4413747191429138,
      "learning_rate": 7.965439869473664e-05,
      "loss": 1.6245,
      "step": 257
    },
    {
      "epoch": 0.10280676412721342,
      "grad_norm": 0.5398510694503784,
      "learning_rate": 7.895106702997437e-05,
      "loss": 1.6318,
      "step": 258
    },
    {
      "epoch": 0.10320523995716385,
      "grad_norm": 0.5172785520553589,
      "learning_rate": 7.824882320536814e-05,
      "loss": 1.6601,
      "step": 259
    },
    {
      "epoch": 0.10360371578711429,
      "grad_norm": 0.4824993908405304,
      "learning_rate": 7.754770351392311e-05,
      "loss": 1.5672,
      "step": 260
    },
    {
      "epoch": 0.10400219161706473,
      "grad_norm": 0.4745709300041199,
      "learning_rate": 7.684774419054747e-05,
      "loss": 1.7128,
      "step": 261
    },
    {
      "epoch": 0.10440066744701516,
      "grad_norm": 0.5071855783462524,
      "learning_rate": 7.614898141017996e-05,
      "loss": 1.7368,
      "step": 262
    },
    {
      "epoch": 0.1047991432769656,
      "grad_norm": 0.5377690196037292,
      "learning_rate": 7.54514512859201e-05,
      "loss": 1.8659,
      "step": 263
    },
    {
      "epoch": 0.10519761910691605,
      "grad_norm": 0.4762866199016571,
      "learning_rate": 7.475518986716194e-05,
      "loss": 1.6012,
      "step": 264
    },
    {
      "epoch": 0.10559609493686649,
      "grad_norm": 0.46296924352645874,
      "learning_rate": 7.406023313773097e-05,
      "loss": 1.5484,
      "step": 265
    },
    {
      "epoch": 0.10599457076681693,
      "grad_norm": 0.47845426201820374,
      "learning_rate": 7.336661701402439e-05,
      "loss": 1.6248,
      "step": 266
    },
    {
      "epoch": 0.10639304659676736,
      "grad_norm": 0.48351001739501953,
      "learning_rate": 7.267437734315492e-05,
      "loss": 1.5549,
      "step": 267
    },
    {
      "epoch": 0.1067915224267178,
      "grad_norm": 0.48554375767707825,
      "learning_rate": 7.198354990109805e-05,
      "loss": 1.5708,
      "step": 268
    },
    {
      "epoch": 0.10718999825666825,
      "grad_norm": 0.47755077481269836,
      "learning_rate": 7.129417039084333e-05,
      "loss": 1.5864,
      "step": 269
    },
    {
      "epoch": 0.10758847408661869,
      "grad_norm": 0.4970269799232483,
      "learning_rate": 7.060627444054893e-05,
      "loss": 1.6373,
      "step": 270
    },
    {
      "epoch": 0.10798694991656912,
      "grad_norm": 0.47547978162765503,
      "learning_rate": 6.99198976017005e-05,
      "loss": 1.7433,
      "step": 271
    },
    {
      "epoch": 0.10838542574651956,
      "grad_norm": 0.5408848524093628,
      "learning_rate": 6.923507534727373e-05,
      "loss": 1.77,
      "step": 272
    },
    {
      "epoch": 0.10878390157647,
      "grad_norm": 0.49777430295944214,
      "learning_rate": 6.855184306990106e-05,
      "loss": 1.6071,
      "step": 273
    },
    {
      "epoch": 0.10918237740642044,
      "grad_norm": 0.4691534638404846,
      "learning_rate": 6.78702360800425e-05,
      "loss": 1.5913,
      "step": 274
    },
    {
      "epoch": 0.10958085323637089,
      "grad_norm": 0.5284269452095032,
      "learning_rate": 6.719028960416098e-05,
      "loss": 1.8038,
      "step": 275
    },
    {
      "epoch": 0.10997932906632132,
      "grad_norm": 0.49061042070388794,
      "learning_rate": 6.651203878290139e-05,
      "loss": 1.5991,
      "step": 276
    },
    {
      "epoch": 0.11037780489627176,
      "grad_norm": 0.5676330327987671,
      "learning_rate": 6.583551866927475e-05,
      "loss": 1.8924,
      "step": 277
    },
    {
      "epoch": 0.1107762807262222,
      "grad_norm": 0.5392544865608215,
      "learning_rate": 6.516076422684654e-05,
      "loss": 1.7611,
      "step": 278
    },
    {
      "epoch": 0.11117475655617264,
      "grad_norm": 0.5719506740570068,
      "learning_rate": 6.448781032792972e-05,
      "loss": 1.756,
      "step": 279
    },
    {
      "epoch": 0.11157323238612307,
      "grad_norm": 0.4809233248233795,
      "learning_rate": 6.381669175178248e-05,
      "loss": 1.641,
      "step": 280
    },
    {
      "epoch": 0.11197170821607352,
      "grad_norm": 0.48434188961982727,
      "learning_rate": 6.31474431828108e-05,
      "loss": 1.579,
      "step": 281
    },
    {
      "epoch": 0.11237018404602396,
      "grad_norm": 0.5024405717849731,
      "learning_rate": 6.248009920877592e-05,
      "loss": 1.6653,
      "step": 282
    },
    {
      "epoch": 0.1127686598759744,
      "grad_norm": 0.441279798746109,
      "learning_rate": 6.181469431900672e-05,
      "loss": 1.5105,
      "step": 283
    },
    {
      "epoch": 0.11316713570592483,
      "grad_norm": 0.5233234763145447,
      "learning_rate": 6.115126290261745e-05,
      "loss": 1.7695,
      "step": 284
    },
    {
      "epoch": 0.11356561153587527,
      "grad_norm": 0.5281261801719666,
      "learning_rate": 6.048983924673022e-05,
      "loss": 1.76,
      "step": 285
    },
    {
      "epoch": 0.11396408736582572,
      "grad_norm": 0.534590482711792,
      "learning_rate": 5.983045753470308e-05,
      "loss": 1.8155,
      "step": 286
    },
    {
      "epoch": 0.11436256319577616,
      "grad_norm": 0.5247072577476501,
      "learning_rate": 5.917315184436345e-05,
      "loss": 1.6073,
      "step": 287
    },
    {
      "epoch": 0.1147610390257266,
      "grad_norm": 0.4829355776309967,
      "learning_rate": 5.851795614624682e-05,
      "loss": 1.5224,
      "step": 288
    },
    {
      "epoch": 0.11515951485567703,
      "grad_norm": 0.516015887260437,
      "learning_rate": 5.786490430184115e-05,
      "loss": 1.6813,
      "step": 289
    },
    {
      "epoch": 0.11555799068562747,
      "grad_norm": 0.48894891142845154,
      "learning_rate": 5.72140300618369e-05,
      "loss": 1.7965,
      "step": 290
    },
    {
      "epoch": 0.11595646651557791,
      "grad_norm": 0.49149996042251587,
      "learning_rate": 5.656536706438267e-05,
      "loss": 1.6388,
      "step": 291
    },
    {
      "epoch": 0.11635494234552836,
      "grad_norm": 0.4835774898529053,
      "learning_rate": 5.591894883334667e-05,
      "loss": 1.6856,
      "step": 292
    },
    {
      "epoch": 0.1167534181754788,
      "grad_norm": 0.5278857946395874,
      "learning_rate": 5.5274808776584367e-05,
      "loss": 1.6883,
      "step": 293
    },
    {
      "epoch": 0.11715189400542923,
      "grad_norm": 0.4995588958263397,
      "learning_rate": 5.463298018421171e-05,
      "loss": 1.519,
      "step": 294
    },
    {
      "epoch": 0.11755036983537967,
      "grad_norm": 0.5236543416976929,
      "learning_rate": 5.399349622688479e-05,
      "loss": 1.7372,
      "step": 295
    },
    {
      "epoch": 0.11794884566533011,
      "grad_norm": 0.45699524879455566,
      "learning_rate": 5.335638995408545e-05,
      "loss": 1.6082,
      "step": 296
    },
    {
      "epoch": 0.11834732149528056,
      "grad_norm": 0.5191316604614258,
      "learning_rate": 5.272169429241325e-05,
      "loss": 1.7123,
      "step": 297
    },
    {
      "epoch": 0.118745797325231,
      "grad_norm": 0.42880895733833313,
      "learning_rate": 5.208944204388377e-05,
      "loss": 1.4809,
      "step": 298
    },
    {
      "epoch": 0.11914427315518143,
      "grad_norm": 0.5574065446853638,
      "learning_rate": 5.145966588423341e-05,
      "loss": 1.8128,
      "step": 299
    },
    {
      "epoch": 0.11954274898513187,
      "grad_norm": 0.47847244143486023,
      "learning_rate": 5.0832398361230596e-05,
      "loss": 1.5699,
      "step": 300
    },
    {
      "epoch": 0.11954274898513187,
      "eval_loss": 1.700640082359314,
      "eval_runtime": 603.0633,
      "eval_samples_per_second": 14.018,
      "eval_steps_per_second": 3.505,
      "step": 300
    },
    {
      "epoch": 0.1199412248150823,
      "grad_norm": 0.5043081045150757,
      "learning_rate": 5.020767189299369e-05,
      "loss": 1.6154,
      "step": 301
    },
    {
      "epoch": 0.12033970064503274,
      "grad_norm": 0.5781189203262329,
      "learning_rate": 4.9585518766315496e-05,
      "loss": 1.7757,
      "step": 302
    },
    {
      "epoch": 0.1207381764749832,
      "grad_norm": 0.5455333590507507,
      "learning_rate": 4.896597113499479e-05,
      "loss": 1.5694,
      "step": 303
    },
    {
      "epoch": 0.12113665230493363,
      "grad_norm": 0.5005789399147034,
      "learning_rate": 4.834906101817438e-05,
      "loss": 1.716,
      "step": 304
    },
    {
      "epoch": 0.12153512813488407,
      "grad_norm": 0.5123348832130432,
      "learning_rate": 4.773482029868657e-05,
      "loss": 1.7465,
      "step": 305
    },
    {
      "epoch": 0.1219336039648345,
      "grad_norm": 0.5470724105834961,
      "learning_rate": 4.712328072140505e-05,
      "loss": 1.7012,
      "step": 306
    },
    {
      "epoch": 0.12233207979478494,
      "grad_norm": 0.47325006127357483,
      "learning_rate": 4.651447389160458e-05,
      "loss": 1.6349,
      "step": 307
    },
    {
      "epoch": 0.1227305556247354,
      "grad_norm": 0.4630093276500702,
      "learning_rate": 4.5908431273327436e-05,
      "loss": 1.5126,
      "step": 308
    },
    {
      "epoch": 0.12312903145468583,
      "grad_norm": 0.5252417922019958,
      "learning_rate": 4.530518418775733e-05,
      "loss": 1.7035,
      "step": 309
    },
    {
      "epoch": 0.12352750728463627,
      "grad_norm": 0.5543851256370544,
      "learning_rate": 4.470476381160065e-05,
      "loss": 1.9395,
      "step": 310
    },
    {
      "epoch": 0.1239259831145867,
      "grad_norm": 0.4824533462524414,
      "learning_rate": 4.4107201175475275e-05,
      "loss": 1.5935,
      "step": 311
    },
    {
      "epoch": 0.12432445894453714,
      "grad_norm": 0.5243806838989258,
      "learning_rate": 4.351252716230685e-05,
      "loss": 1.7277,
      "step": 312
    },
    {
      "epoch": 0.12472293477448758,
      "grad_norm": 0.501171886920929,
      "learning_rate": 4.292077250573266e-05,
      "loss": 1.594,
      "step": 313
    },
    {
      "epoch": 0.12512141060443802,
      "grad_norm": 0.47692808508872986,
      "learning_rate": 4.2331967788513295e-05,
      "loss": 1.4772,
      "step": 314
    },
    {
      "epoch": 0.12551988643438847,
      "grad_norm": 0.5134193897247314,
      "learning_rate": 4.174614344095213e-05,
      "loss": 1.643,
      "step": 315
    },
    {
      "epoch": 0.1259183622643389,
      "grad_norm": 0.5090487003326416,
      "learning_rate": 4.116332973932256e-05,
      "loss": 1.6696,
      "step": 316
    },
    {
      "epoch": 0.12631683809428934,
      "grad_norm": 0.5596434473991394,
      "learning_rate": 4.058355680430337e-05,
      "loss": 1.4942,
      "step": 317
    },
    {
      "epoch": 0.1267153139242398,
      "grad_norm": 0.5060478448867798,
      "learning_rate": 4.0006854599421926e-05,
      "loss": 1.7277,
      "step": 318
    },
    {
      "epoch": 0.12711378975419022,
      "grad_norm": 0.5043231248855591,
      "learning_rate": 3.943325292950579e-05,
      "loss": 1.653,
      "step": 319
    },
    {
      "epoch": 0.12751226558414067,
      "grad_norm": 0.49735555052757263,
      "learning_rate": 3.886278143914219e-05,
      "loss": 1.6637,
      "step": 320
    },
    {
      "epoch": 0.1279107414140911,
      "grad_norm": 0.5129334926605225,
      "learning_rate": 3.829546961114607e-05,
      "loss": 1.7365,
      "step": 321
    },
    {
      "epoch": 0.12830921724404154,
      "grad_norm": 0.5209783911705017,
      "learning_rate": 3.773134676503629e-05,
      "loss": 1.7903,
      "step": 322
    },
    {
      "epoch": 0.128707693073992,
      "grad_norm": 0.5349249243736267,
      "learning_rate": 3.7170442055520415e-05,
      "loss": 1.7308,
      "step": 323
    },
    {
      "epoch": 0.12910616890394241,
      "grad_norm": 0.5156399011611938,
      "learning_rate": 3.661278447098789e-05,
      "loss": 1.5822,
      "step": 324
    },
    {
      "epoch": 0.12950464473389287,
      "grad_norm": 0.5426967144012451,
      "learning_rate": 3.605840283201195e-05,
      "loss": 1.6398,
      "step": 325
    },
    {
      "epoch": 0.1299031205638433,
      "grad_norm": 0.5003894567489624,
      "learning_rate": 3.550732578986006e-05,
      "loss": 1.745,
      "step": 326
    },
    {
      "epoch": 0.13030159639379374,
      "grad_norm": 0.5355332493782043,
      "learning_rate": 3.495958182501325e-05,
      "loss": 1.6628,
      "step": 327
    },
    {
      "epoch": 0.1307000722237442,
      "grad_norm": 0.5347525477409363,
      "learning_rate": 3.441519924569408e-05,
      "loss": 1.7257,
      "step": 328
    },
    {
      "epoch": 0.1310985480536946,
      "grad_norm": 0.5936457514762878,
      "learning_rate": 3.387420618640379e-05,
      "loss": 1.7605,
      "step": 329
    },
    {
      "epoch": 0.13149702388364506,
      "grad_norm": 0.4799719750881195,
      "learning_rate": 3.3336630606468134e-05,
      "loss": 1.5599,
      "step": 330
    },
    {
      "epoch": 0.1318954997135955,
      "grad_norm": 0.534975528717041,
      "learning_rate": 3.280250028859248e-05,
      "loss": 1.741,
      "step": 331
    },
    {
      "epoch": 0.13229397554354594,
      "grad_norm": 0.5082612037658691,
      "learning_rate": 3.227184283742591e-05,
      "loss": 1.6139,
      "step": 332
    },
    {
      "epoch": 0.1326924513734964,
      "grad_norm": 0.5057691335678101,
      "learning_rate": 3.174468567813461e-05,
      "loss": 1.7092,
      "step": 333
    },
    {
      "epoch": 0.1330909272034468,
      "grad_norm": 0.4517490863800049,
      "learning_rate": 3.122105605498442e-05,
      "loss": 1.5781,
      "step": 334
    },
    {
      "epoch": 0.13348940303339726,
      "grad_norm": 0.4567318558692932,
      "learning_rate": 3.070098102993302e-05,
      "loss": 1.5871,
      "step": 335
    },
    {
      "epoch": 0.1338878788633477,
      "grad_norm": 0.5688018202781677,
      "learning_rate": 3.018448748123097e-05,
      "loss": 1.8721,
      "step": 336
    },
    {
      "epoch": 0.13428635469329814,
      "grad_norm": 0.5304664969444275,
      "learning_rate": 2.9671602102032926e-05,
      "loss": 1.7813,
      "step": 337
    },
    {
      "epoch": 0.13468483052324856,
      "grad_norm": 0.4832111597061157,
      "learning_rate": 2.9162351399017963e-05,
      "loss": 1.6609,
      "step": 338
    },
    {
      "epoch": 0.135083306353199,
      "grad_norm": 0.5615909695625305,
      "learning_rate": 2.8656761691019673e-05,
      "loss": 1.7713,
      "step": 339
    },
    {
      "epoch": 0.13548178218314946,
      "grad_norm": 0.4976556599140167,
      "learning_rate": 2.8154859107665987e-05,
      "loss": 1.6278,
      "step": 340
    },
    {
      "epoch": 0.13588025801309989,
      "grad_norm": 0.48554837703704834,
      "learning_rate": 2.7656669588028762e-05,
      "loss": 1.6038,
      "step": 341
    },
    {
      "epoch": 0.13627873384305034,
      "grad_norm": 0.50529545545578,
      "learning_rate": 2.7162218879283176e-05,
      "loss": 1.515,
      "step": 342
    },
    {
      "epoch": 0.13667720967300076,
      "grad_norm": 0.5264208912849426,
      "learning_rate": 2.667153253537713e-05,
      "loss": 1.7788,
      "step": 343
    },
    {
      "epoch": 0.1370756855029512,
      "grad_norm": 0.5166341066360474,
      "learning_rate": 2.618463591571052e-05,
      "loss": 1.7624,
      "step": 344
    },
    {
      "epoch": 0.13747416133290166,
      "grad_norm": 0.4670686721801758,
      "learning_rate": 2.570155418382473e-05,
      "loss": 1.5216,
      "step": 345
    },
    {
      "epoch": 0.13787263716285209,
      "grad_norm": 0.5010607242584229,
      "learning_rate": 2.5222312306101925e-05,
      "loss": 1.6297,
      "step": 346
    },
    {
      "epoch": 0.13827111299280254,
      "grad_norm": 0.44925105571746826,
      "learning_rate": 2.474693505047504e-05,
      "loss": 1.4302,
      "step": 347
    },
    {
      "epoch": 0.13866958882275296,
      "grad_norm": 0.44039008021354675,
      "learning_rate": 2.427544698514753e-05,
      "loss": 1.4163,
      "step": 348
    },
    {
      "epoch": 0.1390680646527034,
      "grad_norm": 0.5106916427612305,
      "learning_rate": 2.3807872477323733e-05,
      "loss": 1.5566,
      "step": 349
    },
    {
      "epoch": 0.13946654048265386,
      "grad_norm": 0.5118552446365356,
      "learning_rate": 2.334423569194948e-05,
      "loss": 1.6767,
      "step": 350
    },
    {
      "epoch": 0.13986501631260428,
      "grad_norm": 0.5088701248168945,
      "learning_rate": 2.288456059046331e-05,
      "loss": 1.6389,
      "step": 351
    },
    {
      "epoch": 0.14026349214255474,
      "grad_norm": 0.5484685301780701,
      "learning_rate": 2.242887092955801e-05,
      "loss": 1.6978,
      "step": 352
    },
    {
      "epoch": 0.14066196797250516,
      "grad_norm": 0.5057936906814575,
      "learning_rate": 2.1977190259952883e-05,
      "loss": 1.7238,
      "step": 353
    },
    {
      "epoch": 0.1410604438024556,
      "grad_norm": 0.4977273643016815,
      "learning_rate": 2.1529541925176555e-05,
      "loss": 1.589,
      "step": 354
    },
    {
      "epoch": 0.14145891963240606,
      "grad_norm": 0.556425929069519,
      "learning_rate": 2.1085949060360654e-05,
      "loss": 1.784,
      "step": 355
    },
    {
      "epoch": 0.14185739546235648,
      "grad_norm": 0.48254185914993286,
      "learning_rate": 2.064643459104405e-05,
      "loss": 1.5242,
      "step": 356
    },
    {
      "epoch": 0.14225587129230693,
      "grad_norm": 0.5121050477027893,
      "learning_rate": 2.0211021231988102e-05,
      "loss": 1.6523,
      "step": 357
    },
    {
      "epoch": 0.14265434712225736,
      "grad_norm": 0.5412747263908386,
      "learning_rate": 1.977973148600266e-05,
      "loss": 1.7816,
      "step": 358
    },
    {
      "epoch": 0.1430528229522078,
      "grad_norm": 0.5190417170524597,
      "learning_rate": 1.935258764278314e-05,
      "loss": 1.7215,
      "step": 359
    },
    {
      "epoch": 0.14345129878215823,
      "grad_norm": 0.5047377347946167,
      "learning_rate": 1.8929611777758526e-05,
      "loss": 1.6061,
      "step": 360
    },
    {
      "epoch": 0.14384977461210868,
      "grad_norm": 0.5179762840270996,
      "learning_rate": 1.851082575095051e-05,
      "loss": 1.6054,
      "step": 361
    },
    {
      "epoch": 0.14424825044205913,
      "grad_norm": 0.533320963382721,
      "learning_rate": 1.8096251205843684e-05,
      "loss": 1.6372,
      "step": 362
    },
    {
      "epoch": 0.14464672627200956,
|
"grad_norm": 0.4938521981239319, |
|
"learning_rate": 1.7685909568267033e-05, |
|
"loss": 1.7578, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.14504520210196, |
|
"grad_norm": 0.5236971974372864, |
|
"learning_rate": 1.7279822045286576e-05, |
|
"loss": 1.7821, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.14544367793191043, |
|
"grad_norm": 0.548584520816803, |
|
"learning_rate": 1.6878009624109313e-05, |
|
"loss": 1.7914, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.14584215376186088, |
|
"grad_norm": 0.5472350716590881, |
|
"learning_rate": 1.648049307099874e-05, |
|
"loss": 1.5642, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.14624062959181133, |
|
"grad_norm": 0.5605772733688354, |
|
"learning_rate": 1.6087292930201394e-05, |
|
"loss": 1.7474, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.14663910542176176, |
|
"grad_norm": 0.47889217734336853, |
|
"learning_rate": 1.569842952288527e-05, |
|
"loss": 1.5833, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.1470375812517122, |
|
"grad_norm": 0.5094882845878601, |
|
"learning_rate": 1.5313922946089486e-05, |
|
"loss": 1.702, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.14743605708166263, |
|
"grad_norm": 0.48179590702056885, |
|
"learning_rate": 1.4933793071685732e-05, |
|
"loss": 1.5017, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.14783453291161308, |
|
"grad_norm": 0.5010417699813843, |
|
"learning_rate": 1.4558059545351143e-05, |
|
"loss": 1.5445, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.14823300874156353, |
|
"grad_norm": 0.43870919942855835, |
|
"learning_rate": 1.4186741785553115e-05, |
|
"loss": 1.5428, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.14863148457151396, |
|
"grad_norm": 0.4662686288356781, |
|
"learning_rate": 1.3819858982545598e-05, |
|
"loss": 1.4941, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.1490299604014644, |
|
"grad_norm": 0.5589818358421326, |
|
"learning_rate": 1.3457430097377421e-05, |
|
"loss": 1.7253, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.14942843623141483, |
|
"grad_norm": 0.5113766193389893, |
|
"learning_rate": 1.3099473860912326e-05, |
|
"loss": 1.5904, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.14982691206136528, |
|
"grad_norm": 0.5116039514541626, |
|
"learning_rate": 1.2746008772860884e-05, |
|
"loss": 1.6287, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.1502253878913157, |
|
"grad_norm": 0.5835950970649719, |
|
"learning_rate": 1.2397053100824463e-05, |
|
"loss": 1.4924, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.15062386372126615, |
|
"grad_norm": 0.5390010476112366, |
|
"learning_rate": 1.2052624879351104e-05, |
|
"loss": 1.5715, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.1510223395512166, |
|
"grad_norm": 0.5362573266029358, |
|
"learning_rate": 1.1712741909003444e-05, |
|
"loss": 1.7802, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.15142081538116703, |
|
"grad_norm": 0.49706023931503296, |
|
"learning_rate": 1.1377421755438832e-05, |
|
"loss": 1.7128, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.15181929121111748, |
|
"grad_norm": 0.5436707735061646, |
|
"learning_rate": 1.1046681748501408e-05, |
|
"loss": 1.7381, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.1522177670410679, |
|
"grad_norm": 0.51026850938797, |
|
"learning_rate": 1.0720538981326556e-05, |
|
"loss": 1.6813, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.15261624287101835, |
|
"grad_norm": 0.5093562006950378, |
|
"learning_rate": 1.0399010309457457e-05, |
|
"loss": 1.6643, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.1530147187009688, |
|
"grad_norm": 0.5157011151313782, |
|
"learning_rate": 1.0082112349974016e-05, |
|
"loss": 1.7036, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.15341319453091923, |
|
"grad_norm": 0.46138855814933777, |
|
"learning_rate": 9.76986148063398e-06, |
|
"loss": 1.3899, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.15381167036086968, |
|
"grad_norm": 0.48433151841163635, |
|
"learning_rate": 9.462273839026624e-06, |
|
"loss": 1.5909, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.1542101461908201, |
|
"grad_norm": 0.5252178907394409, |
|
"learning_rate": 9.159365321738655e-06, |
|
"loss": 1.7776, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.15460862202077055, |
|
"grad_norm": 0.5377585291862488, |
|
"learning_rate": 8.861151583532656e-06, |
|
"loss": 1.7147, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.155007097850721, |
|
"grad_norm": 0.5604181885719299, |
|
"learning_rate": 8.56764803653809e-06, |
|
"loss": 1.8522, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.15540557368067143, |
|
"grad_norm": 0.5364322662353516, |
|
"learning_rate": 8.278869849454718e-06, |
|
"loss": 1.6899, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.15580404951062188, |
|
"grad_norm": 0.4611172676086426, |
|
"learning_rate": 7.994831946768622e-06, |
|
"loss": 1.5186, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.1562025253405723, |
|
"grad_norm": 0.5083454251289368, |
|
"learning_rate": 7.715549007981027e-06, |
|
"loss": 1.6604, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.15660100117052275, |
|
"grad_norm": 0.5075681209564209, |
|
"learning_rate": 7.441035466849489e-06, |
|
"loss": 1.5289, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.1569994770004732, |
|
"grad_norm": 0.5153861045837402, |
|
"learning_rate": 7.171305510642023e-06, |
|
"loss": 1.6241, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.15739795283042363, |
|
"grad_norm": 0.5271784663200378, |
|
"learning_rate": 6.906373079403849e-06, |
|
"loss": 1.7079, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.15779642866037408, |
|
"grad_norm": 0.49165791273117065, |
|
"learning_rate": 6.646251865236997e-06, |
|
"loss": 1.7268, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.1581949044903245, |
|
"grad_norm": 0.5324728488922119, |
|
"learning_rate": 6.390955311592617e-06, |
|
"loss": 1.6794, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.15859338032027495, |
|
"grad_norm": 0.5144858956336975, |
|
"learning_rate": 6.140496612576241e-06, |
|
"loss": 1.7149, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.15899185615022537, |
|
"grad_norm": 0.5114040374755859, |
|
"learning_rate": 5.8948887122658335e-06, |
|
"loss": 1.561, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.15939033198017583, |
|
"grad_norm": 0.5673094391822815, |
|
"learning_rate": 5.65414430404293e-06, |
|
"loss": 1.873, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.15939033198017583, |
|
"eval_loss": 1.6927062273025513, |
|
"eval_runtime": 603.1583, |
|
"eval_samples_per_second": 14.016, |
|
"eval_steps_per_second": 3.505, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 447, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 2, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1184619093229568e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |