diff --git "a/checkpoint-734/trainer_state.json" "b/checkpoint-734/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-734/trainer_state.json" @@ -0,0 +1,5223 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9809394145677333, + "eval_steps": 92, + "global_step": 734, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.17789718508720398, + "learning_rate": 2e-05, + "loss": 2.3216, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 2.256117105484009, + "eval_runtime": 52.5552, + "eval_samples_per_second": 95.081, + "eval_steps_per_second": 23.785, + "step": 1 + }, + { + "epoch": 0.01, + "grad_norm": 0.1491609364748001, + "learning_rate": 4e-05, + "loss": 2.2689, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 0.16114787757396698, + "learning_rate": 6e-05, + "loss": 2.1799, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 0.17484058439731598, + "learning_rate": 8e-05, + "loss": 2.2857, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 0.16328556835651398, + "learning_rate": 0.0001, + "loss": 2.1805, + "step": 5 + }, + { + "epoch": 0.02, + "grad_norm": 0.2006424516439438, + "learning_rate": 0.00012, + "loss": 2.3171, + "step": 6 + }, + { + "epoch": 0.02, + "grad_norm": 0.15052957832813263, + "learning_rate": 0.00014, + "loss": 2.2048, + "step": 7 + }, + { + "epoch": 0.02, + "grad_norm": 0.1444414258003235, + "learning_rate": 0.00016, + "loss": 2.181, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 0.174555703997612, + "learning_rate": 0.00018, + "loss": 2.0824, + "step": 9 + }, + { + "epoch": 0.03, + "grad_norm": 0.20063771307468414, + "learning_rate": 0.0002, + "loss": 2.1274, + "step": 10 + }, + { + "epoch": 0.03, + "grad_norm": 0.19133329391479492, + "learning_rate": 0.00019999958540892524, + "loss": 2.2436, + "step": 11 + }, + { + "epoch": 0.03, + "grad_norm": 0.19068902730941772, + "learning_rate": 0.00019999834163913867, + "loss": 2.0865, + "step": 12 + }, + { + "epoch": 0.04, + "grad_norm": 0.12937726080417633, + "learning_rate": 0.00019999626870095333, + "loss": 1.9834, + "step": 13 + }, + { + "epoch": 0.04, + "grad_norm": 0.12377699464559555, + "learning_rate": 0.0001999933666115578, + "loss": 2.0084, + "step": 14 + }, + { + "epoch": 0.04, + "grad_norm": 0.11349006742238998, + "learning_rate": 0.00019998963539501557, + "loss": 2.0472, + "step": 15 + }, + { + "epoch": 0.04, + "grad_norm": 0.10255803167819977, + "learning_rate": 0.00019998507508226524, + "loss": 2.0017, + "step": 16 + }, + { + "epoch": 0.05, + "grad_norm": 0.1226801723241806, + "learning_rate": 0.00019997968571112013, + "loss": 2.0241, + "step": 17 + }, + { + "epoch": 0.05, + "grad_norm": 0.09784600138664246, + "learning_rate": 0.00019997346732626795, + "loss": 1.9264, + "step": 18 + }, + { + "epoch": 0.05, + "grad_norm": 0.0987139344215393, + "learning_rate": 0.00019996641997927043, + "loss": 2.0021, + "step": 19 + }, + { + "epoch": 0.05, + "grad_norm": 0.1008625328540802, + "learning_rate": 0.0001999585437285629, + "loss": 1.9401, + "step": 20 + }, + { + "epoch": 0.06, + "grad_norm": 0.11675934493541718, + "learning_rate": 0.00019994983863945388, + "loss": 2.0533, + "step": 21 + }, + { + "epoch": 0.06, + "grad_norm": 0.1078399121761322, + "learning_rate": 0.0001999403047841243, + "loss": 2.067, + "step": 22 + }, + { + "epoch": 0.06, + "grad_norm": 0.10346037149429321, + "learning_rate": 0.00019992994224162728, + "loss": 1.9767, + "step": 23 + }, + { + "epoch": 0.07, + "grad_norm": 0.11017748713493347, + "learning_rate": 0.0001999187510978872, + "loss": 1.9501, + "step": 24 + }, + { + "epoch": 0.07, + "grad_norm": 0.11317062377929688, + "learning_rate": 0.00019990673144569892, + "loss": 1.9425, + "step": 25 + }, + { + "epoch": 0.07, + "grad_norm": 0.10313500463962555, + "learning_rate": 0.0001998938833847273, + "loss": 1.9594, + "step": 26 + }, + { + "epoch": 0.07, + "grad_norm": 0.1095932349562645, + "learning_rate": 0.00019988020702150618, + "loss": 1.951, + "step": 27 + }, + { + "epoch": 0.08, + "grad_norm": 0.11742599308490753, + "learning_rate": 0.00019986570246943754, + "loss": 2.0445, + "step": 28 + }, + { + "epoch": 0.08, + "grad_norm": 0.14208655059337616, + "learning_rate": 0.00019985036984879052, + "loss": 1.9932, + "step": 29 + }, + { + "epoch": 0.08, + "grad_norm": 0.10696502774953842, + "learning_rate": 0.00019983420928670044, + "loss": 1.9076, + "step": 30 + }, + { + "epoch": 0.08, + "grad_norm": 0.10935875028371811, + "learning_rate": 0.00019981722091716783, + "loss": 1.8718, + "step": 31 + }, + { + "epoch": 0.09, + "grad_norm": 0.11814530938863754, + "learning_rate": 0.00019979940488105722, + "loss": 1.8591, + "step": 32 + }, + { + "epoch": 0.09, + "grad_norm": 0.12232942134141922, + "learning_rate": 0.00019978076132609599, + "loss": 1.9379, + "step": 33 + }, + { + "epoch": 0.09, + "grad_norm": 0.11652842909097672, + "learning_rate": 0.00019976129040687318, + "loss": 1.9731, + "step": 34 + }, + { + "epoch": 0.1, + "grad_norm": 0.1147683709859848, + "learning_rate": 0.0001997409922848381, + "loss": 1.8994, + "step": 35 + }, + { + "epoch": 0.1, + "grad_norm": 0.1180400475859642, + "learning_rate": 0.00019971986712829932, + "loss": 1.8678, + "step": 36 + }, + { + "epoch": 0.1, + "grad_norm": 0.13028429448604584, + "learning_rate": 0.0001996979151124227, + "loss": 1.9853, + "step": 37 + }, + { + "epoch": 0.1, + "grad_norm": 0.11380694806575775, + "learning_rate": 0.00019967513641923056, + "loss": 1.8388, + "step": 38 + }, + { + "epoch": 0.11, + "grad_norm": 0.13089893758296967, + "learning_rate": 0.00019965153123759966, + "loss": 1.8802, + "step": 39 + }, + { + "epoch": 0.11, + "grad_norm": 0.12069398909807205, + "learning_rate": 0.00019962709976326, + "loss": 1.9847, + "step": 40 + }, + { + "epoch": 0.11, + "grad_norm": 0.12704235315322876, + "learning_rate": 0.00019960184219879303, + "loss": 2.0103, + "step": 41 + }, + { + "epoch": 0.11, + "grad_norm": 0.15785923600196838, + "learning_rate": 0.00019957575875362993, + "loss": 1.9266, + "step": 42 + }, + { + "epoch": 0.12, + "grad_norm": 0.137678325176239, + "learning_rate": 0.00019954884964404996, + "loss": 1.827, + "step": 43 + }, + { + "epoch": 0.12, + "grad_norm": 0.12916652858257294, + "learning_rate": 0.0001995211150931787, + "loss": 1.8881, + "step": 44 + }, + { + "epoch": 0.12, + "grad_norm": 0.14117199182510376, + "learning_rate": 0.00019949255533098604, + "loss": 1.9003, + "step": 45 + }, + { + "epoch": 0.13, + "grad_norm": 0.122989721596241, + "learning_rate": 0.00019946317059428448, + "loss": 1.7859, + "step": 46 + }, + { + "epoch": 0.13, + "grad_norm": 0.14118267595767975, + "learning_rate": 0.00019943296112672696, + "loss": 1.9206, + "step": 47 + }, + { + "epoch": 0.13, + "grad_norm": 0.14580337703227997, + "learning_rate": 0.00019940192717880502, + "loss": 2.0004, + "step": 48 + }, + { + "epoch": 0.13, + "grad_norm": 0.1319318562746048, + "learning_rate": 0.00019937006900784665, + "loss": 1.9288, + "step": 49 + }, + { + "epoch": 0.14, + "grad_norm": 0.13234923779964447, + "learning_rate": 0.00019933738687801403, + "loss": 1.8757, + "step": 50 + }, + { + "epoch": 0.14, + "grad_norm": 0.13480357825756073, + "learning_rate": 0.00019930388106030166, + "loss": 1.773, + "step": 51 + }, + { + "epoch": 0.14, + "grad_norm": 0.15075266361236572, + "learning_rate": 0.00019926955183253374, + "loss": 1.9003, + "step": 52 + }, + { + "epoch": 0.14, + "grad_norm": 0.2747078537940979, + "learning_rate": 0.00019923439947936204, + "loss": 1.8076, + "step": 53 + }, + { + "epoch": 0.15, + "grad_norm": 0.14992934465408325, + "learning_rate": 0.00019919842429226372, + "loss": 1.9598, + "step": 54 + }, + { + "epoch": 0.15, + "grad_norm": 0.15926502645015717, + "learning_rate": 0.0001991616265695385, + "loss": 1.8815, + "step": 55 + }, + { + "epoch": 0.15, + "grad_norm": 0.18423113226890564, + "learning_rate": 0.00019912400661630658, + "loss": 2.0042, + "step": 56 + }, + { + "epoch": 0.16, + "grad_norm": 0.15437527000904083, + "learning_rate": 0.00019908556474450593, + "loss": 1.8524, + "step": 57 + }, + { + "epoch": 0.16, + "grad_norm": 0.15860721468925476, + "learning_rate": 0.0001990463012728896, + "loss": 1.9301, + "step": 58 + }, + { + "epoch": 0.16, + "grad_norm": 0.1519562005996704, + "learning_rate": 0.00019900621652702336, + "loss": 1.8769, + "step": 59 + }, + { + "epoch": 0.16, + "grad_norm": 0.13514740765094757, + "learning_rate": 0.00019896531083928273, + "loss": 1.7548, + "step": 60 + }, + { + "epoch": 0.17, + "grad_norm": 0.15928471088409424, + "learning_rate": 0.00019892358454885042, + "loss": 1.8667, + "step": 61 + }, + { + "epoch": 0.17, + "grad_norm": 0.15786142647266388, + "learning_rate": 0.00019888103800171333, + "loss": 1.8641, + "step": 62 + }, + { + "epoch": 0.17, + "grad_norm": 0.1386164277791977, + "learning_rate": 0.00019883767155065986, + "loss": 1.8872, + "step": 63 + }, + { + "epoch": 0.17, + "grad_norm": 0.14870542287826538, + "learning_rate": 0.00019879348555527687, + "loss": 1.8602, + "step": 64 + }, + { + "epoch": 0.18, + "grad_norm": 0.14343470335006714, + "learning_rate": 0.00019874848038194673, + "loss": 1.8312, + "step": 65 + }, + { + "epoch": 0.18, + "grad_norm": 0.15904268622398376, + "learning_rate": 0.00019870265640384435, + "loss": 1.9288, + "step": 66 + }, + { + "epoch": 0.18, + "grad_norm": 0.1546896994113922, + "learning_rate": 0.00019865601400093395, + "loss": 1.9167, + "step": 67 + }, + { + "epoch": 0.19, + "grad_norm": 0.15036776661872864, + "learning_rate": 0.000198608553559966, + "loss": 1.8157, + "step": 68 + }, + { + "epoch": 0.19, + "grad_norm": 0.155603289604187, + "learning_rate": 0.000198560275474474, + "loss": 1.8193, + "step": 69 + }, + { + "epoch": 0.19, + "grad_norm": 0.14846371114253998, + "learning_rate": 0.00019851118014477126, + "loss": 1.7431, + "step": 70 + }, + { + "epoch": 0.19, + "grad_norm": 0.15555807948112488, + "learning_rate": 0.00019846126797794743, + "loss": 1.9226, + "step": 71 + }, + { + "epoch": 0.2, + "grad_norm": 0.14305201172828674, + "learning_rate": 0.00019841053938786534, + "loss": 1.8991, + "step": 72 + }, + { + "epoch": 0.2, + "grad_norm": 0.14723321795463562, + "learning_rate": 0.00019835899479515737, + "loss": 1.881, + "step": 73 + }, + { + "epoch": 0.2, + "grad_norm": 0.14168229699134827, + "learning_rate": 0.00019830663462722207, + "loss": 1.9365, + "step": 74 + }, + { + "epoch": 0.2, + "grad_norm": 0.16648411750793457, + "learning_rate": 0.00019825345931822068, + "loss": 1.764, + "step": 75 + }, + { + "epoch": 0.21, + "grad_norm": 0.13298211991786957, + "learning_rate": 0.00019819946930907332, + "loss": 1.8581, + "step": 76 + }, + { + "epoch": 0.21, + "grad_norm": 0.15933053195476532, + "learning_rate": 0.00019814466504745548, + "loss": 1.8043, + "step": 77 + }, + { + "epoch": 0.21, + "grad_norm": 0.15350531041622162, + "learning_rate": 0.00019808904698779433, + "loss": 1.7964, + "step": 78 + }, + { + "epoch": 0.22, + "grad_norm": 0.17690815031528473, + "learning_rate": 0.00019803261559126492, + "loss": 1.8173, + "step": 79 + }, + { + "epoch": 0.22, + "grad_norm": 0.14556676149368286, + "learning_rate": 0.0001979753713257863, + "loss": 1.9111, + "step": 80 + }, + { + "epoch": 0.22, + "grad_norm": 0.14608439803123474, + "learning_rate": 0.00019791731466601773, + "loss": 1.8583, + "step": 81 + }, + { + "epoch": 0.22, + "grad_norm": 0.15660099685192108, + "learning_rate": 0.0001978584460933546, + "loss": 1.8686, + "step": 82 + }, + { + "epoch": 0.23, + "grad_norm": 0.2378729283809662, + "learning_rate": 0.00019779876609592471, + "loss": 1.7445, + "step": 83 + }, + { + "epoch": 0.23, + "grad_norm": 0.17274336516857147, + "learning_rate": 0.00019773827516858386, + "loss": 1.9353, + "step": 84 + }, + { + "epoch": 0.23, + "grad_norm": 0.15144622325897217, + "learning_rate": 0.00019767697381291202, + "loss": 1.8179, + "step": 85 + }, + { + "epoch": 0.23, + "grad_norm": 0.1504279226064682, + "learning_rate": 0.00019761486253720915, + "loss": 1.787, + "step": 86 + }, + { + "epoch": 0.24, + "grad_norm": 0.14772076904773712, + "learning_rate": 0.0001975519418564908, + "loss": 1.8928, + "step": 87 + }, + { + "epoch": 0.24, + "grad_norm": 0.16531221568584442, + "learning_rate": 0.00019748821229248405, + "loss": 1.7052, + "step": 88 + }, + { + "epoch": 0.24, + "grad_norm": 0.15720781683921814, + "learning_rate": 0.00019742367437362306, + "loss": 1.8116, + "step": 89 + }, + { + "epoch": 0.25, + "grad_norm": 0.15642230212688446, + "learning_rate": 0.00019735832863504476, + "loss": 1.7233, + "step": 90 + }, + { + "epoch": 0.25, + "grad_norm": 0.13698022067546844, + "learning_rate": 0.00019729217561858433, + "loss": 1.7699, + "step": 91 + }, + { + "epoch": 0.25, + "grad_norm": 0.16550354659557343, + "learning_rate": 0.00019722521587277076, + "loss": 1.7379, + "step": 92 + }, + { + "epoch": 0.25, + "eval_loss": 1.785460114479065, + "eval_runtime": 54.2174, + "eval_samples_per_second": 92.166, + "eval_steps_per_second": 23.055, + "step": 92 + }, + { + "epoch": 0.25, + "grad_norm": 0.1539517045021057, + "learning_rate": 0.00019715744995282234, + "loss": 1.7921, + "step": 93 + }, + { + "epoch": 0.26, + "grad_norm": 0.1603965014219284, + "learning_rate": 0.00019708887842064194, + "loss": 1.7856, + "step": 94 + }, + { + "epoch": 0.26, + "grad_norm": 0.14334900677204132, + "learning_rate": 0.0001970195018448125, + "loss": 1.8048, + "step": 95 + }, + { + "epoch": 0.26, + "grad_norm": 0.17852160334587097, + "learning_rate": 0.00019694932080059217, + "loss": 1.7513, + "step": 96 + }, + { + "epoch": 0.26, + "grad_norm": 0.15032339096069336, + "learning_rate": 0.00019687833586990973, + "loss": 1.7011, + "step": 97 + }, + { + "epoch": 0.27, + "grad_norm": 0.1456947922706604, + "learning_rate": 0.00019680654764135945, + "loss": 1.7992, + "step": 98 + }, + { + "epoch": 0.27, + "grad_norm": 0.14242622256278992, + "learning_rate": 0.00019673395671019654, + "loss": 1.8617, + "step": 99 + }, + { + "epoch": 0.27, + "grad_norm": 0.15148206055164337, + "learning_rate": 0.00019666056367833204, + "loss": 1.8464, + "step": 100 + }, + { + "epoch": 0.28, + "grad_norm": 0.15412554144859314, + "learning_rate": 0.00019658636915432788, + "loss": 1.8757, + "step": 101 + }, + { + "epoch": 0.28, + "grad_norm": 0.1881706863641739, + "learning_rate": 0.0001965113737533918, + "loss": 1.7233, + "step": 102 + }, + { + "epoch": 0.28, + "grad_norm": 0.16812562942504883, + "learning_rate": 0.0001964355780973723, + "loss": 1.769, + "step": 103 + }, + { + "epoch": 0.28, + "grad_norm": 0.14640142023563385, + "learning_rate": 0.00019635898281475346, + "loss": 1.8155, + "step": 104 + }, + { + "epoch": 0.29, + "grad_norm": 0.16034264862537384, + "learning_rate": 0.00019628158854064956, + "loss": 1.7034, + "step": 105 + }, + { + "epoch": 0.29, + "grad_norm": 0.14522109925746918, + "learning_rate": 0.00019620339591680023, + "loss": 1.8194, + "step": 106 + }, + { + "epoch": 0.29, + "grad_norm": 0.1690577268600464, + "learning_rate": 0.0001961244055915647, + "loss": 1.7467, + "step": 107 + }, + { + "epoch": 0.29, + "grad_norm": 0.17564715445041656, + "learning_rate": 0.00019604461821991667, + "loss": 1.8696, + "step": 108 + }, + { + "epoch": 0.3, + "grad_norm": 0.1871030628681183, + "learning_rate": 0.00019596403446343877, + "loss": 1.8975, + "step": 109 + }, + { + "epoch": 0.3, + "grad_norm": 0.18129503726959229, + "learning_rate": 0.0001958826549903171, + "loss": 1.8295, + "step": 110 + }, + { + "epoch": 0.3, + "grad_norm": 0.16357164084911346, + "learning_rate": 0.00019580048047533578, + "loss": 1.813, + "step": 111 + }, + { + "epoch": 0.3, + "grad_norm": 0.15929792821407318, + "learning_rate": 0.00019571751159987115, + "loss": 1.7072, + "step": 112 + }, + { + "epoch": 0.31, + "grad_norm": 0.14020949602127075, + "learning_rate": 0.00019563374905188637, + "loss": 1.8245, + "step": 113 + }, + { + "epoch": 0.31, + "grad_norm": 0.14771193265914917, + "learning_rate": 0.0001955491935259255, + "loss": 1.7532, + "step": 114 + }, + { + "epoch": 0.31, + "grad_norm": 0.1625107079744339, + "learning_rate": 0.0001954638457231079, + "loss": 1.7653, + "step": 115 + }, + { + "epoch": 0.32, + "grad_norm": 0.15289072692394257, + "learning_rate": 0.0001953777063511223, + "loss": 1.7301, + "step": 116 + }, + { + "epoch": 0.32, + "grad_norm": 0.15066611766815186, + "learning_rate": 0.00019529077612422103, + "loss": 1.7909, + "step": 117 + }, + { + "epoch": 0.32, + "grad_norm": 0.16926345229148865, + "learning_rate": 0.00019520305576321395, + "loss": 1.7426, + "step": 118 + }, + { + "epoch": 0.32, + "grad_norm": 0.16127844154834747, + "learning_rate": 0.00019511454599546268, + "loss": 1.7841, + "step": 119 + }, + { + "epoch": 0.33, + "grad_norm": 0.14246778190135956, + "learning_rate": 0.0001950252475548744, + "loss": 1.7545, + "step": 120 + }, + { + "epoch": 0.33, + "grad_norm": 0.15499025583267212, + "learning_rate": 0.00019493516118189582, + "loss": 1.7749, + "step": 121 + }, + { + "epoch": 0.33, + "grad_norm": 0.16287535429000854, + "learning_rate": 0.00019484428762350708, + "loss": 1.7878, + "step": 122 + }, + { + "epoch": 0.33, + "grad_norm": 0.14317578077316284, + "learning_rate": 0.00019475262763321552, + "loss": 1.8787, + "step": 123 + }, + { + "epoch": 0.34, + "grad_norm": 0.15122784674167633, + "learning_rate": 0.00019466018197104946, + "loss": 1.8259, + "step": 124 + }, + { + "epoch": 0.34, + "grad_norm": 0.1579144448041916, + "learning_rate": 0.00019456695140355172, + "loss": 1.8568, + "step": 125 + }, + { + "epoch": 0.34, + "grad_norm": 0.15908168256282806, + "learning_rate": 0.0001944729367037736, + "loss": 1.7898, + "step": 126 + }, + { + "epoch": 0.35, + "grad_norm": 0.17988181114196777, + "learning_rate": 0.0001943781386512682, + "loss": 1.8017, + "step": 127 + }, + { + "epoch": 0.35, + "grad_norm": 0.16047564148902893, + "learning_rate": 0.000194282558032084, + "loss": 1.6842, + "step": 128 + }, + { + "epoch": 0.35, + "grad_norm": 0.2199389785528183, + "learning_rate": 0.0001941861956387585, + "loss": 1.7777, + "step": 129 + }, + { + "epoch": 0.35, + "grad_norm": 0.17500443756580353, + "learning_rate": 0.00019408905227031142, + "loss": 1.817, + "step": 130 + }, + { + "epoch": 0.36, + "grad_norm": 0.16257035732269287, + "learning_rate": 0.00019399112873223824, + "loss": 1.7822, + "step": 131 + }, + { + "epoch": 0.36, + "grad_norm": 0.1655006855726242, + "learning_rate": 0.0001938924258365035, + "loss": 1.7835, + "step": 132 + }, + { + "epoch": 0.36, + "grad_norm": 0.1625986397266388, + "learning_rate": 0.00019379294440153392, + "loss": 1.739, + "step": 133 + }, + { + "epoch": 0.36, + "grad_norm": 0.1542993187904358, + "learning_rate": 0.00019369268525221185, + "loss": 1.8484, + "step": 134 + }, + { + "epoch": 0.37, + "grad_norm": 0.16408205032348633, + "learning_rate": 0.00019359164921986825, + "loss": 1.7889, + "step": 135 + }, + { + "epoch": 0.37, + "grad_norm": 0.1816944181919098, + "learning_rate": 0.00019348983714227583, + "loss": 1.8592, + "step": 136 + }, + { + "epoch": 0.37, + "grad_norm": 0.1641003042459488, + "learning_rate": 0.00019338724986364222, + "loss": 1.7293, + "step": 137 + }, + { + "epoch": 0.38, + "grad_norm": 0.14509302377700806, + "learning_rate": 0.00019328388823460283, + "loss": 1.653, + "step": 138 + }, + { + "epoch": 0.38, + "grad_norm": 0.14293956756591797, + "learning_rate": 0.00019317975311221373, + "loss": 1.8303, + "step": 139 + }, + { + "epoch": 0.38, + "grad_norm": 0.17679671943187714, + "learning_rate": 0.00019307484535994492, + "loss": 1.7262, + "step": 140 + }, + { + "epoch": 0.38, + "grad_norm": 0.142549067735672, + "learning_rate": 0.00019296916584767262, + "loss": 1.8038, + "step": 141 + }, + { + "epoch": 0.39, + "grad_norm": 0.17163245379924774, + "learning_rate": 0.00019286271545167254, + "loss": 1.7815, + "step": 142 + }, + { + "epoch": 0.39, + "grad_norm": 0.1682298481464386, + "learning_rate": 0.0001927554950546124, + "loss": 1.7159, + "step": 143 + }, + { + "epoch": 0.39, + "grad_norm": 0.17430084943771362, + "learning_rate": 0.00019264750554554453, + "loss": 1.8462, + "step": 144 + }, + { + "epoch": 0.39, + "grad_norm": 0.1589604616165161, + "learning_rate": 0.00019253874781989864, + "loss": 1.7369, + "step": 145 + }, + { + "epoch": 0.4, + "grad_norm": 0.15902858972549438, + "learning_rate": 0.00019242922277947448, + "loss": 1.8005, + "step": 146 + }, + { + "epoch": 0.4, + "grad_norm": 0.15692317485809326, + "learning_rate": 0.00019231893133243405, + "loss": 1.7759, + "step": 147 + }, + { + "epoch": 0.4, + "grad_norm": 0.15179891884326935, + "learning_rate": 0.00019220787439329438, + "loss": 1.8517, + "step": 148 + }, + { + "epoch": 0.41, + "grad_norm": 0.14860700070858002, + "learning_rate": 0.00019209605288291978, + "loss": 1.7751, + "step": 149 + }, + { + "epoch": 0.41, + "grad_norm": 0.1454017162322998, + "learning_rate": 0.0001919834677285142, + "loss": 1.6579, + "step": 150 + }, + { + "epoch": 0.41, + "grad_norm": 0.16802151501178741, + "learning_rate": 0.00019187011986361374, + "loss": 1.7853, + "step": 151 + }, + { + "epoch": 0.41, + "grad_norm": 0.16814345121383667, + "learning_rate": 0.0001917560102280786, + "loss": 1.7162, + "step": 152 + }, + { + "epoch": 0.42, + "grad_norm": 0.15314815938472748, + "learning_rate": 0.0001916411397680855, + "loss": 1.7066, + "step": 153 + }, + { + "epoch": 0.42, + "grad_norm": 0.16011942923069, + "learning_rate": 0.00019152550943611987, + "loss": 1.7655, + "step": 154 + }, + { + "epoch": 0.42, + "grad_norm": 0.15423813462257385, + "learning_rate": 0.00019140912019096766, + "loss": 1.8106, + "step": 155 + }, + { + "epoch": 0.42, + "grad_norm": 0.1538185477256775, + "learning_rate": 0.0001912919729977078, + "loss": 1.7952, + "step": 156 + }, + { + "epoch": 0.43, + "grad_norm": 0.16051077842712402, + "learning_rate": 0.00019117406882770388, + "loss": 1.7182, + "step": 157 + }, + { + "epoch": 0.43, + "grad_norm": 0.15189512073993683, + "learning_rate": 0.00019105540865859623, + "loss": 1.6282, + "step": 158 + }, + { + "epoch": 0.43, + "grad_norm": 0.16353580355644226, + "learning_rate": 0.0001909359934742938, + "loss": 1.7374, + "step": 159 + }, + { + "epoch": 0.44, + "grad_norm": 0.16629937291145325, + "learning_rate": 0.00019081582426496598, + "loss": 1.6122, + "step": 160 + }, + { + "epoch": 0.44, + "grad_norm": 0.16899485886096954, + "learning_rate": 0.00019069490202703438, + "loss": 1.6817, + "step": 161 + }, + { + "epoch": 0.44, + "grad_norm": 0.1827431470155716, + "learning_rate": 0.00019057322776316467, + "loss": 1.7633, + "step": 162 + }, + { + "epoch": 0.44, + "grad_norm": 0.17833983898162842, + "learning_rate": 0.00019045080248225805, + "loss": 1.6124, + "step": 163 + }, + { + "epoch": 0.45, + "grad_norm": 0.1423504501581192, + "learning_rate": 0.00019032762719944316, + "loss": 1.7344, + "step": 164 + }, + { + "epoch": 0.45, + "grad_norm": 0.1658603847026825, + "learning_rate": 0.0001902037029360674, + "loss": 1.6874, + "step": 165 + }, + { + "epoch": 0.45, + "grad_norm": 0.16833803057670593, + "learning_rate": 0.00019007903071968868, + "loss": 1.7291, + "step": 166 + }, + { + "epoch": 0.45, + "grad_norm": 0.16885894536972046, + "learning_rate": 0.00018995361158406676, + "loss": 1.77, + "step": 167 + }, + { + "epoch": 0.46, + "grad_norm": 0.16659806668758392, + "learning_rate": 0.00018982744656915475, + "loss": 1.8441, + "step": 168 + }, + { + "epoch": 0.46, + "grad_norm": 0.18617461621761322, + "learning_rate": 0.00018970053672109037, + "loss": 1.8355, + "step": 169 + }, + { + "epoch": 0.46, + "grad_norm": 0.17858782410621643, + "learning_rate": 0.00018957288309218745, + "loss": 1.7093, + "step": 170 + }, + { + "epoch": 0.47, + "grad_norm": 0.1546701192855835, + "learning_rate": 0.00018944448674092714, + "loss": 1.6651, + "step": 171 + }, + { + "epoch": 0.47, + "grad_norm": 0.16935370862483978, + "learning_rate": 0.000189315348731949, + "loss": 1.7196, + "step": 172 + }, + { + "epoch": 0.47, + "grad_norm": 0.17772410809993744, + "learning_rate": 0.0001891854701360424, + "loss": 1.7215, + "step": 173 + }, + { + "epoch": 0.47, + "grad_norm": 0.16293296217918396, + "learning_rate": 0.00018905485203013744, + "loss": 1.7094, + "step": 174 + }, + { + "epoch": 0.48, + "grad_norm": 0.1712435483932495, + "learning_rate": 0.00018892349549729615, + "loss": 1.7627, + "step": 175 + }, + { + "epoch": 0.48, + "grad_norm": 0.19540230929851532, + "learning_rate": 0.00018879140162670347, + "loss": 1.7626, + "step": 176 + }, + { + "epoch": 0.48, + "grad_norm": 0.16615939140319824, + "learning_rate": 0.00018865857151365814, + "loss": 1.6614, + "step": 177 + }, + { + "epoch": 0.48, + "grad_norm": 0.14994904398918152, + "learning_rate": 0.0001885250062595638, + "loss": 1.8007, + "step": 178 + }, + { + "epoch": 0.49, + "grad_norm": 0.17429371178150177, + "learning_rate": 0.0001883907069719197, + "loss": 1.5822, + "step": 179 + }, + { + "epoch": 0.49, + "grad_norm": 0.1600300520658493, + "learning_rate": 0.0001882556747643115, + "loss": 1.7315, + "step": 180 + }, + { + "epoch": 0.49, + "grad_norm": 0.16705308854579926, + "learning_rate": 0.00018811991075640223, + "loss": 1.6875, + "step": 181 + }, + { + "epoch": 0.5, + "grad_norm": 0.1432919055223465, + "learning_rate": 0.00018798341607392276, + "loss": 1.662, + "step": 182 + }, + { + "epoch": 0.5, + "grad_norm": 0.1693781316280365, + "learning_rate": 0.00018784619184866267, + "loss": 1.6294, + "step": 183 + }, + { + "epoch": 0.5, + "grad_norm": 0.16618549823760986, + "learning_rate": 0.0001877082392184607, + "loss": 1.6935, + "step": 184 + }, + { + "epoch": 0.5, + "eval_loss": 1.707476258277893, + "eval_runtime": 53.476, + "eval_samples_per_second": 93.444, + "eval_steps_per_second": 23.375, + "step": 184 + }, + { + "epoch": 0.5, + "grad_norm": 0.19029615819454193, + "learning_rate": 0.00018756955932719546, + "loss": 1.6409, + "step": 185 + }, + { + "epoch": 0.51, + "grad_norm": 0.15953810513019562, + "learning_rate": 0.00018743015332477588, + "loss": 1.7592, + "step": 186 + }, + { + "epoch": 0.51, + "grad_norm": 0.1445915549993515, + "learning_rate": 0.0001872900223671316, + "loss": 1.7382, + "step": 187 + }, + { + "epoch": 0.51, + "grad_norm": 0.1905539631843567, + "learning_rate": 0.0001871491676162035, + "loss": 1.7496, + "step": 188 + }, + { + "epoch": 0.51, + "grad_norm": 0.19065023958683014, + "learning_rate": 0.0001870075902399341, + "loss": 1.7459, + "step": 189 + }, + { + "epoch": 0.52, + "grad_norm": 0.15250128507614136, + "learning_rate": 0.00018686529141225767, + "loss": 1.7847, + "step": 190 + }, + { + "epoch": 0.52, + "grad_norm": 0.16181302070617676, + "learning_rate": 0.00018672227231309068, + "loss": 1.6404, + "step": 191 + }, + { + "epoch": 0.52, + "grad_norm": 0.15402203798294067, + "learning_rate": 0.00018657853412832202, + "loss": 1.7612, + "step": 192 + }, + { + "epoch": 0.53, + "grad_norm": 0.16924121975898743, + "learning_rate": 0.00018643407804980303, + "loss": 1.7262, + "step": 193 + }, + { + "epoch": 0.53, + "grad_norm": 0.1564388871192932, + "learning_rate": 0.00018628890527533777, + "loss": 1.7721, + "step": 194 + }, + { + "epoch": 0.53, + "grad_norm": 0.17039404809474945, + "learning_rate": 0.00018614301700867287, + "loss": 1.6788, + "step": 195 + }, + { + "epoch": 0.53, + "grad_norm": 0.1548219621181488, + "learning_rate": 0.0001859964144594879, + "loss": 1.7128, + "step": 196 + }, + { + "epoch": 0.54, + "grad_norm": 0.16957992315292358, + "learning_rate": 0.000185849098843385, + "loss": 1.8554, + "step": 197 + }, + { + "epoch": 0.54, + "grad_norm": 0.15887251496315002, + "learning_rate": 0.00018570107138187893, + "loss": 1.691, + "step": 198 + }, + { + "epoch": 0.54, + "grad_norm": 0.1746351420879364, + "learning_rate": 0.000185552333302387, + "loss": 1.6902, + "step": 199 + }, + { + "epoch": 0.54, + "grad_norm": 0.15400560200214386, + "learning_rate": 0.00018540288583821882, + "loss": 1.7756, + "step": 200 + }, + { + "epoch": 0.55, + "grad_norm": 0.163401260972023, + "learning_rate": 0.00018525273022856607, + "loss": 1.6391, + "step": 201 + }, + { + "epoch": 0.55, + "grad_norm": 0.14885050058364868, + "learning_rate": 0.0001851018677184923, + "loss": 1.7449, + "step": 202 + }, + { + "epoch": 0.55, + "grad_norm": 0.15566030144691467, + "learning_rate": 0.00018495029955892248, + "loss": 1.7574, + "step": 203 + }, + { + "epoch": 0.56, + "grad_norm": 0.16466747224330902, + "learning_rate": 0.00018479802700663268, + "loss": 1.7116, + "step": 204 + }, + { + "epoch": 0.56, + "grad_norm": 0.15742923319339752, + "learning_rate": 0.00018464505132423983, + "loss": 1.6491, + "step": 205 + }, + { + "epoch": 0.56, + "grad_norm": 0.15772201120853424, + "learning_rate": 0.00018449137378019094, + "loss": 1.7971, + "step": 206 + }, + { + "epoch": 0.56, + "grad_norm": 0.17034031450748444, + "learning_rate": 0.00018433699564875274, + "loss": 1.7036, + "step": 207 + }, + { + "epoch": 0.57, + "grad_norm": 0.1637440025806427, + "learning_rate": 0.0001841819182100012, + "loss": 1.5544, + "step": 208 + }, + { + "epoch": 0.57, + "grad_norm": 0.18623477220535278, + "learning_rate": 0.00018402614274981073, + "loss": 1.632, + "step": 209 + }, + { + "epoch": 0.57, + "grad_norm": 0.16553111374378204, + "learning_rate": 0.0001838696705598436, + "loss": 1.7081, + "step": 210 + }, + { + "epoch": 0.57, + "grad_norm": 0.15178020298480988, + "learning_rate": 0.0001837125029375393, + "loss": 1.679, + "step": 211 + }, + { + "epoch": 0.58, + "grad_norm": 0.16419006884098053, + "learning_rate": 0.00018355464118610372, + "loss": 1.7082, + "step": 212 + }, + { + "epoch": 0.58, + "grad_norm": 0.1607130765914917, + "learning_rate": 0.00018339608661449832, + "loss": 1.6493, + "step": 213 + }, + { + "epoch": 0.58, + "grad_norm": 0.17754729092121124, + "learning_rate": 0.00018323684053742932, + "loss": 1.7558, + "step": 214 + }, + { + "epoch": 0.59, + "grad_norm": 0.16623522341251373, + "learning_rate": 0.00018307690427533672, + "loss": 1.7677, + "step": 215 + }, + { + "epoch": 0.59, + "grad_norm": 0.16194763779640198, + "learning_rate": 0.00018291627915438348, + "loss": 1.7749, + "step": 216 + }, + { + "epoch": 0.59, + "grad_norm": 0.17232032120227814, + "learning_rate": 0.00018275496650644444, + "loss": 1.8273, + "step": 217 + }, + { + "epoch": 0.59, + "grad_norm": 0.16804206371307373, + "learning_rate": 0.00018259296766909527, + "loss": 1.7504, + "step": 218 + }, + { + "epoch": 0.6, + "grad_norm": 0.1571667343378067, + "learning_rate": 0.00018243028398560142, + "loss": 1.7331, + "step": 219 + }, + { + "epoch": 0.6, + "grad_norm": 0.16855494678020477, + "learning_rate": 0.00018226691680490694, + "loss": 1.694, + "step": 220 + }, + { + "epoch": 0.6, + "grad_norm": 0.18118086457252502, + "learning_rate": 0.00018210286748162336, + "loss": 1.7338, + "step": 221 + }, + { + "epoch": 0.6, + "grad_norm": 0.16491609811782837, + "learning_rate": 0.00018193813737601834, + "loss": 1.5984, + "step": 222 + }, + { + "epoch": 0.61, + "grad_norm": 0.15478603541851044, + "learning_rate": 0.00018177272785400455, + "loss": 1.7077, + "step": 223 + }, + { + "epoch": 0.61, + "grad_norm": 0.14907459914684296, + "learning_rate": 0.0001816066402871282, + "loss": 1.6867, + "step": 224 + }, + { + "epoch": 0.61, + "grad_norm": 0.16465577483177185, + "learning_rate": 0.00018143987605255774, + "loss": 1.7904, + "step": 225 + }, + { + "epoch": 0.62, + "grad_norm": 0.1661553680896759, + "learning_rate": 0.00018127243653307248, + "loss": 1.7439, + "step": 226 + }, + { + "epoch": 0.62, + "grad_norm": 0.15185391902923584, + "learning_rate": 0.00018110432311705096, + "loss": 1.6711, + "step": 227 + }, + { + "epoch": 0.62, + "grad_norm": 0.15746577084064484, + "learning_rate": 0.00018093553719845967, + "loss": 1.7139, + "step": 228 + }, + { + "epoch": 0.62, + "grad_norm": 0.16057291626930237, + "learning_rate": 0.00018076608017684128, + "loss": 1.6659, + "step": 229 + }, + { + "epoch": 0.63, + "grad_norm": 0.17637218534946442, + "learning_rate": 0.00018059595345730318, + "loss": 1.7832, + "step": 230 + }, + { + "epoch": 0.63, + "grad_norm": 0.16048584878444672, + "learning_rate": 0.00018042515845050576, + "loss": 1.8092, + "step": 231 + }, + { + "epoch": 0.63, + "grad_norm": 0.1690172702074051, + "learning_rate": 0.00018025369657265075, + "loss": 1.6495, + "step": 232 + }, + { + "epoch": 0.63, + "grad_norm": 0.1493247002363205, + "learning_rate": 0.0001800815692454694, + "loss": 1.6867, + "step": 233 + }, + { + "epoch": 0.64, + "grad_norm": 0.1496494710445404, + "learning_rate": 0.00017990877789621082, + "loss": 1.6808, + "step": 234 + }, + { + "epoch": 0.64, + "grad_norm": 0.1575721800327301, + "learning_rate": 0.00017973532395763, + "loss": 1.7341, + "step": 235 + }, + { + "epoch": 0.64, + "grad_norm": 0.16140134632587433, + "learning_rate": 0.00017956120886797604, + "loss": 1.7025, + "step": 236 + }, + { + "epoch": 0.65, + "grad_norm": 0.1708807647228241, + "learning_rate": 0.0001793864340709802, + "loss": 1.7168, + "step": 237 + }, + { + "epoch": 0.65, + "grad_norm": 0.1652187556028366, + "learning_rate": 0.00017921100101584388, + "loss": 1.6333, + "step": 238 + }, + { + "epoch": 0.65, + "grad_norm": 0.15541191399097443, + "learning_rate": 0.00017903491115722666, + "loss": 1.6765, + "step": 239 + }, + { + "epoch": 0.65, + "grad_norm": 0.15625408291816711, + "learning_rate": 0.00017885816595523423, + "loss": 1.7528, + "step": 240 + }, + { + "epoch": 0.66, + "grad_norm": 0.171896293759346, + "learning_rate": 0.00017868076687540624, + "loss": 1.6741, + "step": 241 + }, + { + "epoch": 0.66, + "grad_norm": 0.1624470204114914, + "learning_rate": 0.00017850271538870423, + "loss": 1.5913, + "step": 242 + }, + { + "epoch": 0.66, + "grad_norm": 0.163376584649086, + "learning_rate": 0.00017832401297149928, + "loss": 1.6242, + "step": 243 + }, + { + "epoch": 0.66, + "grad_norm": 0.1690063625574112, + "learning_rate": 0.00017814466110555998, + "loss": 1.6997, + "step": 244 + }, + { + "epoch": 0.67, + "grad_norm": 0.18232744932174683, + "learning_rate": 0.00017796466127804, + "loss": 1.7033, + "step": 245 + }, + { + "epoch": 0.67, + "grad_norm": 0.16556936502456665, + "learning_rate": 0.0001777840149814657, + "loss": 1.8036, + "step": 246 + }, + { + "epoch": 0.67, + "grad_norm": 0.1530187577009201, + "learning_rate": 0.00017760272371372402, + "loss": 1.7847, + "step": 247 + }, + { + "epoch": 0.68, + "grad_norm": 0.17229686677455902, + "learning_rate": 0.00017742078897804974, + "loss": 1.6999, + "step": 248 + }, + { + "epoch": 0.68, + "grad_norm": 0.1453051120042801, + "learning_rate": 0.00017723821228301322, + "loss": 1.7255, + "step": 249 + }, + { + "epoch": 0.68, + "grad_norm": 0.1597098410129547, + "learning_rate": 0.00017705499514250784, + "loss": 1.7228, + "step": 250 + }, + { + "epoch": 0.68, + "grad_norm": 0.20154576003551483, + "learning_rate": 0.0001768711390757374, + "loss": 1.6099, + "step": 251 + }, + { + "epoch": 0.69, + "grad_norm": 0.17710669338703156, + "learning_rate": 0.0001766866456072036, + "loss": 1.6167, + "step": 252 + }, + { + "epoch": 0.69, + "grad_norm": 0.15468710660934448, + "learning_rate": 0.00017650151626669337, + "loss": 1.6875, + "step": 253 + }, + { + "epoch": 0.69, + "grad_norm": 0.18249762058258057, + "learning_rate": 0.0001763157525892661, + "loss": 1.5819, + "step": 254 + }, + { + "epoch": 0.69, + "grad_norm": 0.16782326996326447, + "learning_rate": 0.0001761293561152411, + "loss": 1.626, + "step": 255 + }, + { + "epoch": 0.7, + "grad_norm": 0.16319702565670013, + "learning_rate": 0.0001759423283901846, + "loss": 1.6173, + "step": 256 + }, + { + "epoch": 0.7, + "grad_norm": 0.15339410305023193, + "learning_rate": 0.00017575467096489717, + "loss": 1.7897, + "step": 257 + }, + { + "epoch": 0.7, + "grad_norm": 0.15668465197086334, + "learning_rate": 0.00017556638539540067, + "loss": 1.6749, + "step": 258 + }, + { + "epoch": 0.71, + "grad_norm": 0.1564616560935974, + "learning_rate": 0.00017537747324292542, + "loss": 1.6263, + "step": 259 + }, + { + "epoch": 0.71, + "grad_norm": 0.15310418605804443, + "learning_rate": 0.00017518793607389723, + "loss": 1.7237, + "step": 260 + }, + { + "epoch": 0.71, + "grad_norm": 0.1633821427822113, + "learning_rate": 0.00017499777545992452, + "loss": 1.7156, + "step": 261 + }, + { + "epoch": 0.71, + "grad_norm": 0.18414661288261414, + "learning_rate": 0.00017480699297778518, + "loss": 1.6333, + "step": 262 + }, + { + "epoch": 0.72, + "grad_norm": 0.18508172035217285, + "learning_rate": 0.00017461559020941341, + "loss": 1.681, + "step": 263 + }, + { + "epoch": 0.72, + "grad_norm": 0.17592370510101318, + "learning_rate": 0.0001744235687418869, + "loss": 1.7787, + "step": 264 + }, + { + "epoch": 0.72, + "grad_norm": 0.22177156805992126, + "learning_rate": 0.00017423093016741333, + "loss": 1.6066, + "step": 265 + }, + { + "epoch": 0.72, + "grad_norm": 0.1583690643310547, + "learning_rate": 0.00017403767608331733, + "loss": 1.7423, + "step": 266 + }, + { + "epoch": 0.73, + "grad_norm": 0.17558494210243225, + "learning_rate": 0.00017384380809202737, + "loss": 1.7306, + "step": 267 + }, + { + "epoch": 0.73, + "grad_norm": 0.1737305372953415, + "learning_rate": 0.00017364932780106212, + "loss": 1.8139, + "step": 268 + }, + { + "epoch": 0.73, + "grad_norm": 0.15237008035182953, + "learning_rate": 0.00017345423682301755, + "loss": 1.7415, + "step": 269 + }, + { + "epoch": 0.74, + "grad_norm": 0.1741713285446167, + "learning_rate": 0.00017325853677555312, + "loss": 1.73, + "step": 270 + }, + { + "epoch": 0.74, + "grad_norm": 0.1688138246536255, + "learning_rate": 0.00017306222928137875, + "loss": 1.774, + "step": 271 + }, + { + "epoch": 0.74, + "grad_norm": 0.16588321328163147, + "learning_rate": 0.00017286531596824112, + "loss": 1.758, + "step": 272 + }, + { + "epoch": 0.74, + "grad_norm": 0.17456404864788055, + "learning_rate": 0.00017266779846891029, + "loss": 1.7528, + "step": 273 + }, + { + "epoch": 0.75, + "grad_norm": 0.15763048827648163, + "learning_rate": 0.00017246967842116605, + "loss": 1.661, + "step": 274 + }, + { + "epoch": 0.75, + "grad_norm": 0.1601736843585968, + "learning_rate": 0.00017227095746778456, + "loss": 1.7317, + "step": 275 + }, + { + "epoch": 0.75, + "grad_norm": 0.17678223550319672, + "learning_rate": 0.00017207163725652445, + "loss": 1.7016, + "step": 276 + }, + { + "epoch": 0.75, + "eval_loss": 1.666338324546814, + "eval_runtime": 53.3064, + "eval_samples_per_second": 93.741, + "eval_steps_per_second": 23.449, + "step": 276 + }, + { + "epoch": 0.75, + "grad_norm": 0.1592448651790619, + "learning_rate": 0.00017187171944011329, + "loss": 1.6313, + "step": 277 + }, + { + "epoch": 0.76, + "grad_norm": 0.1765425056219101, + "learning_rate": 0.000171671205676234, + "loss": 1.5646, + "step": 278 + }, + { + "epoch": 0.76, + "grad_norm": 0.14587025344371796, + "learning_rate": 0.00017147009762751085, + "loss": 1.7655, + "step": 279 + }, + { + "epoch": 0.76, + "grad_norm": 0.15663780272006989, + "learning_rate": 0.00017126839696149596, + "loss": 1.6468, + "step": 280 + }, + { + "epoch": 0.77, + "grad_norm": 0.15041159093379974, + "learning_rate": 0.00017106610535065517, + "loss": 1.6655, + "step": 281 + }, + { + "epoch": 0.77, + "grad_norm": 0.18636269867420197, + "learning_rate": 0.0001708632244723545, + "loss": 1.7274, + "step": 282 + }, + { + "epoch": 0.77, + "grad_norm": 0.2026071399450302, + "learning_rate": 0.00017065975600884584, + "loss": 1.7669, + "step": 283 + }, + { + "epoch": 0.77, + "grad_norm": 0.16321098804473877, + "learning_rate": 0.00017045570164725348, + "loss": 1.5634, + "step": 284 + }, + { + "epoch": 0.78, + "grad_norm": 0.17123541235923767, + "learning_rate": 0.00017025106307955973, + "loss": 1.6881, + "step": 285 + }, + { + "epoch": 0.78, + "grad_norm": 0.17135639488697052, + "learning_rate": 0.00017004584200259107, + "loss": 1.5511, + "step": 286 + }, + { + "epoch": 0.78, + "grad_norm": 0.149318128824234, + "learning_rate": 0.00016984004011800404, + "loss": 1.7526, + "step": 287 + }, + { + "epoch": 0.78, + "grad_norm": 0.17461049556732178, + "learning_rate": 0.0001696336591322711, + "loss": 1.7297, + "step": 288 + }, + { + "epoch": 0.79, + "grad_norm": 0.22167545557022095, + "learning_rate": 0.00016942670075666656, + "loss": 1.5982, + "step": 289 + }, + { + "epoch": 0.79, + "grad_norm": 0.17257270216941833, + "learning_rate": 0.00016921916670725233, + "loss": 1.6988, + "step": 290 + }, + { + "epoch": 0.79, + "grad_norm": 0.16062453389167786, + "learning_rate": 0.00016901105870486372, + "loss": 1.621, + "step": 291 + }, + { + "epoch": 0.8, + "grad_norm": 0.18726034462451935, + "learning_rate": 0.00016880237847509514, + "loss": 1.6037, + "step": 292 + }, + { + "epoch": 0.8, + "grad_norm": 0.1890726536512375, + "learning_rate": 0.00016859312774828578, + "loss": 1.6991, + "step": 293 + }, + { + "epoch": 0.8, + "grad_norm": 0.1621987223625183, + "learning_rate": 0.0001683833082595053, + "loss": 1.6365, + "step": 294 + }, + { + "epoch": 0.8, + "grad_norm": 0.15142592787742615, + "learning_rate": 0.0001681729217485395, + "loss": 1.5566, + "step": 295 + }, + { + "epoch": 0.81, + "grad_norm": 0.15878473222255707, + "learning_rate": 0.0001679619699598757, + "loss": 1.6339, + "step": 296 + }, + { + "epoch": 0.81, + "grad_norm": 0.19840176403522491, + "learning_rate": 0.00016775045464268855, + "loss": 1.7045, + "step": 297 + }, + { + "epoch": 0.81, + "grad_norm": 0.15840516984462738, + "learning_rate": 0.00016753837755082527, + "loss": 1.7066, + "step": 298 + }, + { + "epoch": 0.81, + "grad_norm": 0.14891847968101501, + "learning_rate": 0.00016732574044279122, + "loss": 1.6095, + "step": 299 + }, + { + "epoch": 0.82, + "grad_norm": 0.1543874889612198, + "learning_rate": 0.0001671125450817354, + "loss": 1.6635, + "step": 300 + }, + { + "epoch": 0.82, + "grad_norm": 0.1737508326768875, + "learning_rate": 0.00016689879323543566, + "loss": 1.7243, + "step": 301 + }, + { + "epoch": 0.82, + "grad_norm": 0.1600562036037445, + "learning_rate": 0.00016668448667628418, + "loss": 1.6657, + "step": 302 + }, + { + "epoch": 0.83, + "grad_norm": 0.16577617824077606, + "learning_rate": 0.00016646962718127264, + "loss": 1.6408, + "step": 303 + }, + { + "epoch": 0.83, + "grad_norm": 0.1584424525499344, + "learning_rate": 0.00016625421653197766, + "loss": 1.7327, + "step": 304 + }, + { + "epoch": 0.83, + "grad_norm": 0.16025269031524658, + "learning_rate": 0.00016603825651454588, + "loss": 1.7496, + "step": 305 + }, + { + "epoch": 0.83, + "grad_norm": 0.1708802878856659, + "learning_rate": 0.0001658217489196792, + "loss": 1.6233, + "step": 306 + }, + { + "epoch": 0.84, + "grad_norm": 0.17108985781669617, + "learning_rate": 0.00016560469554262, + "loss": 1.675, + "step": 307 + }, + { + "epoch": 0.84, + "grad_norm": 0.17331109941005707, + "learning_rate": 0.00016538709818313604, + "loss": 1.6796, + "step": 308 + }, + { + "epoch": 0.84, + "grad_norm": 0.15554888546466827, + "learning_rate": 0.0001651689586455059, + "loss": 1.6977, + "step": 309 + }, + { + "epoch": 0.84, + "grad_norm": 0.16161341965198517, + "learning_rate": 0.0001649502787385036, + "loss": 1.5421, + "step": 310 + }, + { + "epoch": 0.85, + "grad_norm": 0.16354724764823914, + "learning_rate": 0.00016473106027538393, + "loss": 1.6105, + "step": 311 + }, + { + "epoch": 0.85, + "grad_norm": 0.16034872829914093, + "learning_rate": 0.0001645113050738673, + "loss": 1.7455, + "step": 312 + }, + { + "epoch": 0.85, + "grad_norm": 0.15332373976707458, + "learning_rate": 0.00016429101495612453, + "loss": 1.5715, + "step": 313 + }, + { + "epoch": 0.86, + "grad_norm": 0.18118220567703247, + "learning_rate": 0.00016407019174876196, + "loss": 1.6783, + "step": 314 + }, + { + "epoch": 0.86, + "grad_norm": 0.16754361987113953, + "learning_rate": 0.00016384883728280626, + "loss": 1.596, + "step": 315 + }, + { + "epoch": 0.86, + "grad_norm": 0.1616910696029663, + "learning_rate": 0.00016362695339368913, + "loss": 1.699, + "step": 316 + }, + { + "epoch": 0.86, + "grad_norm": 0.1674581617116928, + "learning_rate": 0.00016340454192123217, + "loss": 1.6238, + "step": 317 + }, + { + "epoch": 0.87, + "grad_norm": 0.1575167030096054, + "learning_rate": 0.00016318160470963158, + "loss": 1.7563, + "step": 318 + }, + { + "epoch": 0.87, + "grad_norm": 0.15855339169502258, + "learning_rate": 0.00016295814360744296, + "loss": 1.6587, + "step": 319 + }, + { + "epoch": 0.87, + "grad_norm": 0.16675756871700287, + "learning_rate": 0.00016273416046756585, + "loss": 1.5694, + "step": 320 + }, + { + "epoch": 0.87, + "grad_norm": 0.17339494824409485, + "learning_rate": 0.0001625096571472285, + "loss": 1.6924, + "step": 321 + }, + { + "epoch": 0.88, + "grad_norm": 0.16791300475597382, + "learning_rate": 0.00016228463550797234, + "loss": 1.6103, + "step": 322 + }, + { + "epoch": 0.88, + "grad_norm": 0.1806572824716568, + "learning_rate": 0.00016205909741563663, + "loss": 1.7016, + "step": 323 + }, + { + "epoch": 0.88, + "grad_norm": 0.18338143825531006, + "learning_rate": 0.00016183304474034303, + "loss": 1.6367, + "step": 324 + }, + { + "epoch": 0.88, + "grad_norm": 0.17175601422786713, + "learning_rate": 0.00016160647935647987, + "loss": 1.6221, + "step": 325 + }, + { + "epoch": 0.89, + "grad_norm": 0.1569746434688568, + "learning_rate": 0.00016137940314268695, + "loss": 1.6587, + "step": 326 + }, + { + "epoch": 0.89, + "grad_norm": 0.16621094942092896, + "learning_rate": 0.00016115181798183968, + "loss": 1.7476, + "step": 327 + }, + { + "epoch": 0.89, + "grad_norm": 0.1589597761631012, + "learning_rate": 0.00016092372576103362, + "loss": 1.608, + "step": 328 + }, + { + "epoch": 0.9, + "grad_norm": 0.1526154726743698, + "learning_rate": 0.00016069512837156867, + "loss": 1.6524, + "step": 329 + }, + { + "epoch": 0.9, + "grad_norm": 0.16909892857074738, + "learning_rate": 0.00016046602770893365, + "loss": 1.7135, + "step": 330 + }, + { + "epoch": 0.9, + "grad_norm": 0.16316530108451843, + "learning_rate": 0.00016023642567279033, + "loss": 1.645, + "step": 331 + }, + { + "epoch": 0.9, + "grad_norm": 0.15762114524841309, + "learning_rate": 0.00016000632416695782, + "loss": 1.6342, + "step": 332 + }, + { + "epoch": 0.91, + "grad_norm": 0.15512891113758087, + "learning_rate": 0.0001597757250993967, + "loss": 1.6937, + "step": 333 + }, + { + "epoch": 0.91, + "grad_norm": 0.1490001082420349, + "learning_rate": 0.0001595446303821933, + "loss": 1.6558, + "step": 334 + }, + { + "epoch": 0.91, + "grad_norm": 0.167209193110466, + "learning_rate": 0.00015931304193154375, + "loss": 1.7354, + "step": 335 + }, + { + "epoch": 0.91, + "grad_norm": 0.15756173431873322, + "learning_rate": 0.00015908096166773817, + "loss": 1.6624, + "step": 336 + }, + { + "epoch": 0.92, + "grad_norm": 0.35730743408203125, + "learning_rate": 0.0001588483915151447, + "loss": 1.7502, + "step": 337 + }, + { + "epoch": 0.92, + "grad_norm": 0.17207658290863037, + "learning_rate": 0.00015861533340219347, + "loss": 1.7045, + "step": 338 + }, + { + "epoch": 0.92, + "grad_norm": 0.1688750982284546, + "learning_rate": 0.00015838178926136078, + "loss": 1.7443, + "step": 339 + }, + { + "epoch": 0.93, + "grad_norm": 0.15904007852077484, + "learning_rate": 0.00015814776102915295, + "loss": 1.6744, + "step": 340 + }, + { + "epoch": 0.93, + "grad_norm": 0.15581969916820526, + "learning_rate": 0.0001579132506460903, + "loss": 1.6926, + "step": 341 + }, + { + "epoch": 0.93, + "grad_norm": 0.1648828089237213, + "learning_rate": 0.0001576782600566911, + "loss": 1.6929, + "step": 342 + }, + { + "epoch": 0.93, + "grad_norm": 0.16498371958732605, + "learning_rate": 0.00015744279120945534, + "loss": 1.7607, + "step": 343 + }, + { + "epoch": 0.94, + "grad_norm": 0.1721397489309311, + "learning_rate": 0.00015720684605684864, + "loss": 1.7413, + "step": 344 + }, + { + "epoch": 0.94, + "grad_norm": 0.20110780000686646, + "learning_rate": 0.00015697042655528617, + "loss": 1.7579, + "step": 345 + }, + { + "epoch": 0.94, + "grad_norm": 0.17458587884902954, + "learning_rate": 0.00015673353466511618, + "loss": 1.557, + "step": 346 + }, + { + "epoch": 0.94, + "grad_norm": 0.1674317717552185, + "learning_rate": 0.00015649617235060395, + "loss": 1.7079, + "step": 347 + }, + { + "epoch": 0.95, + "grad_norm": 0.17642170190811157, + "learning_rate": 0.0001562583415799154, + "loss": 1.7137, + "step": 348 + }, + { + "epoch": 0.95, + "grad_norm": 0.17334245145320892, + "learning_rate": 0.00015602004432510084, + "loss": 1.6618, + "step": 349 + }, + { + "epoch": 0.95, + "grad_norm": 0.19081345200538635, + "learning_rate": 0.00015578128256207857, + "loss": 1.6069, + "step": 350 + }, + { + "epoch": 0.96, + "grad_norm": 0.1641138792037964, + "learning_rate": 0.00015554205827061855, + "loss": 1.7307, + "step": 351 + }, + { + "epoch": 0.96, + "grad_norm": 0.18359099328517914, + "learning_rate": 0.0001553023734343258, + "loss": 1.6646, + "step": 352 + }, + { + "epoch": 0.96, + "grad_norm": 0.15749670565128326, + "learning_rate": 0.00015506223004062427, + "loss": 1.6696, + "step": 353 + }, + { + "epoch": 0.96, + "grad_norm": 0.15676377713680267, + "learning_rate": 0.00015482163008074016, + "loss": 1.6257, + "step": 354 + }, + { + "epoch": 0.97, + "grad_norm": 0.1613236665725708, + "learning_rate": 0.00015458057554968533, + "loss": 1.6906, + "step": 355 + }, + { + "epoch": 0.97, + "grad_norm": 0.15893089771270752, + "learning_rate": 0.0001543390684462409, + "loss": 1.7515, + "step": 356 + }, + { + "epoch": 0.97, + "grad_norm": 0.16093188524246216, + "learning_rate": 0.00015409711077294066, + "loss": 1.7285, + "step": 357 + }, + { + "epoch": 0.97, + "grad_norm": 0.1732104867696762, + "learning_rate": 0.00015385470453605456, + "loss": 1.728, + "step": 358 + }, + { + "epoch": 0.98, + "grad_norm": 0.18478652834892273, + "learning_rate": 0.0001536118517455717, + "loss": 1.6798, + "step": 359 + }, + { + "epoch": 0.98, + "grad_norm": 0.1561679095029831, + "learning_rate": 0.00015336855441518414, + "loss": 1.4988, + "step": 360 + }, + { + "epoch": 0.98, + "grad_norm": 0.16175530850887299, + "learning_rate": 0.00015312481456226986, + "loss": 1.5661, + "step": 361 + }, + { + "epoch": 0.99, + "grad_norm": 0.15828688442707062, + "learning_rate": 0.0001528806342078763, + "loss": 1.6894, + "step": 362 + }, + { + "epoch": 0.99, + "grad_norm": 0.1670212745666504, + "learning_rate": 0.00015263601537670332, + "loss": 1.6946, + "step": 363 + }, + { + "epoch": 0.99, + "grad_norm": 0.15261001884937286, + "learning_rate": 0.00015239096009708665, + "loss": 1.647, + "step": 364 + }, + { + "epoch": 0.99, + "grad_norm": 0.15754546225070953, + "learning_rate": 0.00015214547040098082, + "loss": 1.65, + "step": 365 + }, + { + "epoch": 1.0, + "grad_norm": 0.15401124954223633, + "learning_rate": 0.00015189954832394266, + "loss": 1.6826, + "step": 366 + }, + { + "epoch": 1.0, + "grad_norm": 0.15601174533367157, + "learning_rate": 0.00015165319590511412, + "loss": 1.7028, + "step": 367 + }, + { + "epoch": 1.0, + "grad_norm": 0.17138823866844177, + "learning_rate": 0.00015140641518720545, + "loss": 1.5761, + "step": 368 + }, + { + "epoch": 1.0, + "eval_loss": 1.637104868888855, + "eval_runtime": 53.887, + "eval_samples_per_second": 92.731, + "eval_steps_per_second": 23.197, + "step": 368 + }, + { + "epoch": 1.0, + "grad_norm": 0.16486236453056335, + "learning_rate": 0.00015115920821647834, + "loss": 1.6967, + "step": 369 + }, + { + "epoch": 1.01, + "grad_norm": 0.16460780799388885, + "learning_rate": 0.00015091157704272886, + "loss": 1.6389, + "step": 370 + }, + { + "epoch": 1.01, + "grad_norm": 0.16463211178779602, + "learning_rate": 0.00015066352371927047, + "loss": 1.7642, + "step": 371 + }, + { + "epoch": 1.01, + "grad_norm": 0.15675517916679382, + "learning_rate": 0.0001504150503029171, + "loss": 1.6816, + "step": 372 + }, + { + "epoch": 1.02, + "grad_norm": 0.1745896190404892, + "learning_rate": 0.0001501661588539659, + "loss": 1.7393, + "step": 373 + }, + { + "epoch": 1.0, + "grad_norm": 0.16300994157791138, + "learning_rate": 0.00014991685143618042, + "loss": 1.6244, + "step": 374 + }, + { + "epoch": 1.0, + "grad_norm": 0.17243793606758118, + "learning_rate": 0.00014966713011677318, + "loss": 1.664, + "step": 375 + }, + { + "epoch": 1.01, + "grad_norm": 0.1658598929643631, + "learning_rate": 0.00014941699696638887, + "loss": 1.5184, + "step": 376 + }, + { + "epoch": 1.01, + "grad_norm": 0.16609761118888855, + "learning_rate": 0.00014916645405908683, + "loss": 1.6516, + "step": 377 + }, + { + "epoch": 1.01, + "grad_norm": 0.17976997792720795, + "learning_rate": 0.0001489155034723242, + "loss": 1.619, + "step": 378 + }, + { + "epoch": 1.01, + "grad_norm": 0.1680922657251358, + "learning_rate": 0.00014866414728693838, + "loss": 1.6772, + "step": 379 + }, + { + "epoch": 1.02, + "grad_norm": 0.18461324274539948, + "learning_rate": 0.00014841238758713006, + "loss": 1.5051, + "step": 380 + }, + { + "epoch": 1.02, + "grad_norm": 0.20495639741420746, + "learning_rate": 0.0001481602264604457, + "loss": 1.6497, + "step": 381 + }, + { + "epoch": 1.02, + "grad_norm": 0.16210030019283295, + "learning_rate": 0.00014790766599776038, + "loss": 1.376, + "step": 382 + }, + { + "epoch": 1.03, + "grad_norm": 0.16862326860427856, + "learning_rate": 0.00014765470829326033, + "loss": 1.5394, + "step": 383 + }, + { + "epoch": 1.03, + "grad_norm": 0.16202732920646667, + "learning_rate": 0.0001474013554444257, + "loss": 1.6899, + "step": 384 + }, + { + "epoch": 1.03, + "grad_norm": 0.1672043651342392, + "learning_rate": 0.00014714760955201306, + "loss": 1.6943, + "step": 385 + }, + { + "epoch": 1.03, + "grad_norm": 0.1694658100605011, + "learning_rate": 0.00014689347272003813, + "loss": 1.6172, + "step": 386 + }, + { + "epoch": 1.04, + "grad_norm": 0.2547871470451355, + "learning_rate": 0.0001466389470557581, + "loss": 1.6476, + "step": 387 + }, + { + "epoch": 1.04, + "grad_norm": 0.16380763053894043, + "learning_rate": 0.0001463840346696543, + "loss": 1.6212, + "step": 388 + }, + { + "epoch": 1.04, + "grad_norm": 0.16450123488903046, + "learning_rate": 0.00014612873767541485, + "loss": 1.7284, + "step": 389 + }, + { + "epoch": 1.04, + "grad_norm": 0.18062160909175873, + "learning_rate": 0.00014587305818991673, + "loss": 1.5628, + "step": 390 + }, + { + "epoch": 1.05, + "grad_norm": 0.16628506779670715, + "learning_rate": 0.0001456169983332087, + "loss": 1.5655, + "step": 391 + }, + { + "epoch": 1.05, + "grad_norm": 0.17104601860046387, + "learning_rate": 0.00014536056022849327, + "loss": 1.6204, + "step": 392 + }, + { + "epoch": 1.05, + "grad_norm": 0.17847618460655212, + "learning_rate": 0.00014510374600210953, + "loss": 1.626, + "step": 393 + }, + { + "epoch": 1.06, + "grad_norm": 0.17856758832931519, + "learning_rate": 0.0001448465577835152, + "loss": 1.6905, + "step": 394 + }, + { + "epoch": 1.06, + "grad_norm": 0.15889039635658264, + "learning_rate": 0.000144588997705269, + "loss": 1.5162, + "step": 395 + }, + { + "epoch": 1.06, + "grad_norm": 0.17019660770893097, + "learning_rate": 0.0001443310679030132, + "loss": 1.6263, + "step": 396 + }, + { + "epoch": 1.06, + "grad_norm": 0.1884152591228485, + "learning_rate": 0.00014407277051545564, + "loss": 1.5887, + "step": 397 + }, + { + "epoch": 1.07, + "grad_norm": 0.16880416870117188, + "learning_rate": 0.00014381410768435216, + "loss": 1.6554, + "step": 398 + }, + { + "epoch": 1.07, + "grad_norm": 0.172772616147995, + "learning_rate": 0.0001435550815544888, + "loss": 1.6194, + "step": 399 + }, + { + "epoch": 1.07, + "grad_norm": 0.1651185303926468, + "learning_rate": 0.00014329569427366394, + "loss": 1.6202, + "step": 400 + }, + { + "epoch": 1.07, + "grad_norm": 0.1783447414636612, + "learning_rate": 0.00014303594799267065, + "loss": 1.5183, + "step": 401 + }, + { + "epoch": 1.08, + "grad_norm": 0.18523593246936798, + "learning_rate": 0.00014277584486527872, + "loss": 1.5517, + "step": 402 + }, + { + "epoch": 1.08, + "grad_norm": 0.17499391734600067, + "learning_rate": 0.0001425153870482169, + "loss": 1.6889, + "step": 403 + }, + { + "epoch": 1.08, + "grad_norm": 0.16847828030586243, + "learning_rate": 0.00014225457670115482, + "loss": 1.6499, + "step": 404 + }, + { + "epoch": 1.09, + "grad_norm": 0.187130406498909, + "learning_rate": 0.00014199341598668537, + "loss": 1.6138, + "step": 405 + }, + { + "epoch": 1.09, + "grad_norm": 0.17778280377388, + "learning_rate": 0.0001417319070703066, + "loss": 1.5702, + "step": 406 + }, + { + "epoch": 1.09, + "grad_norm": 0.18552978336811066, + "learning_rate": 0.00014147005212040374, + "loss": 1.4837, + "step": 407 + }, + { + "epoch": 1.09, + "grad_norm": 0.19115741550922394, + "learning_rate": 0.00014120785330823128, + "loss": 1.5809, + "step": 408 + }, + { + "epoch": 1.1, + "grad_norm": 0.19195811450481415, + "learning_rate": 0.000140945312807895, + "loss": 1.6841, + "step": 409 + }, + { + "epoch": 1.1, + "grad_norm": 0.1775340586900711, + "learning_rate": 0.0001406824327963338, + "loss": 1.4879, + "step": 410 + }, + { + "epoch": 1.1, + "grad_norm": 0.17470397055149078, + "learning_rate": 0.00014041921545330193, + "loss": 1.7223, + "step": 411 + }, + { + "epoch": 1.1, + "grad_norm": 0.18359576165676117, + "learning_rate": 0.00014015566296135047, + "loss": 1.6814, + "step": 412 + }, + { + "epoch": 1.11, + "grad_norm": 0.18946854770183563, + "learning_rate": 0.00013989177750580974, + "loss": 1.5659, + "step": 413 + }, + { + "epoch": 1.11, + "grad_norm": 0.17491303384304047, + "learning_rate": 0.00013962756127477077, + "loss": 1.5633, + "step": 414 + }, + { + "epoch": 1.11, + "grad_norm": 0.19263826310634613, + "learning_rate": 0.00013936301645906744, + "loss": 1.4806, + "step": 415 + }, + { + "epoch": 1.12, + "grad_norm": 0.20540061593055725, + "learning_rate": 0.0001390981452522581, + "loss": 1.6358, + "step": 416 + }, + { + "epoch": 1.12, + "grad_norm": 0.1703554391860962, + "learning_rate": 0.00013883294985060754, + "loss": 1.5266, + "step": 417 + }, + { + "epoch": 1.12, + "grad_norm": 0.1858687400817871, + "learning_rate": 0.00013856743245306867, + "loss": 1.5421, + "step": 418 + }, + { + "epoch": 1.12, + "grad_norm": 0.19925490021705627, + "learning_rate": 0.0001383015952612644, + "loss": 1.5334, + "step": 419 + }, + { + "epoch": 1.13, + "grad_norm": 0.18249362707138062, + "learning_rate": 0.00013803544047946917, + "loss": 1.6678, + "step": 420 + }, + { + "epoch": 1.13, + "grad_norm": 0.1895124316215515, + "learning_rate": 0.00013776897031459104, + "loss": 1.5825, + "step": 421 + }, + { + "epoch": 1.13, + "grad_norm": 0.18998436629772186, + "learning_rate": 0.00013750218697615297, + "loss": 1.4063, + "step": 422 + }, + { + "epoch": 1.13, + "grad_norm": 0.18359903991222382, + "learning_rate": 0.00013723509267627479, + "loss": 1.5474, + "step": 423 + }, + { + "epoch": 1.14, + "grad_norm": 0.1771814525127411, + "learning_rate": 0.0001369676896296548, + "loss": 1.6217, + "step": 424 + }, + { + "epoch": 1.14, + "grad_norm": 0.2542651891708374, + "learning_rate": 0.0001366999800535513, + "loss": 1.5389, + "step": 425 + }, + { + "epoch": 1.14, + "grad_norm": 0.16922856867313385, + "learning_rate": 0.00013643196616776432, + "loss": 1.6299, + "step": 426 + }, + { + "epoch": 1.14, + "grad_norm": 0.19103561341762543, + "learning_rate": 0.00013616365019461716, + "loss": 1.6539, + "step": 427 + }, + { + "epoch": 1.15, + "grad_norm": 0.17129819095134735, + "learning_rate": 0.00013589503435893792, + "loss": 1.5554, + "step": 428 + }, + { + "epoch": 1.15, + "grad_norm": 0.17848622798919678, + "learning_rate": 0.00013562612088804127, + "loss": 1.6609, + "step": 429 + }, + { + "epoch": 1.15, + "grad_norm": 0.1793471872806549, + "learning_rate": 0.00013535691201170958, + "loss": 1.4906, + "step": 430 + }, + { + "epoch": 1.16, + "grad_norm": 0.17249123752117157, + "learning_rate": 0.00013508740996217493, + "loss": 1.5901, + "step": 431 + }, + { + "epoch": 1.16, + "grad_norm": 0.17167752981185913, + "learning_rate": 0.00013481761697410009, + "loss": 1.5892, + "step": 432 + }, + { + "epoch": 1.16, + "grad_norm": 0.18118228018283844, + "learning_rate": 0.00013454753528456038, + "loss": 1.6449, + "step": 433 + }, + { + "epoch": 1.16, + "grad_norm": 0.19412502646446228, + "learning_rate": 0.00013427716713302506, + "loss": 1.4823, + "step": 434 + }, + { + "epoch": 1.17, + "grad_norm": 0.16546295583248138, + "learning_rate": 0.00013400651476133844, + "loss": 1.5447, + "step": 435 + }, + { + "epoch": 1.17, + "grad_norm": 0.17301104962825775, + "learning_rate": 0.00013373558041370178, + "loss": 1.5071, + "step": 436 + }, + { + "epoch": 1.17, + "grad_norm": 0.19761618971824646, + "learning_rate": 0.00013346436633665428, + "loss": 1.6934, + "step": 437 + }, + { + "epoch": 1.17, + "grad_norm": 0.17933383584022522, + "learning_rate": 0.00013319287477905464, + "loss": 1.514, + "step": 438 + }, + { + "epoch": 1.18, + "grad_norm": 0.20005792379379272, + "learning_rate": 0.00013292110799206243, + "loss": 1.4735, + "step": 439 + }, + { + "epoch": 1.18, + "grad_norm": 0.1816921830177307, + "learning_rate": 0.0001326490682291193, + "loss": 1.6867, + "step": 440 + }, + { + "epoch": 1.18, + "grad_norm": 0.19630655646324158, + "learning_rate": 0.00013237675774593045, + "loss": 1.3871, + "step": 441 + }, + { + "epoch": 1.19, + "grad_norm": 0.17641153931617737, + "learning_rate": 0.0001321041788004458, + "loss": 1.5511, + "step": 442 + }, + { + "epoch": 1.19, + "grad_norm": 0.18330830335617065, + "learning_rate": 0.00013183133365284123, + "loss": 1.432, + "step": 443 + }, + { + "epoch": 1.19, + "grad_norm": 0.20252883434295654, + "learning_rate": 0.00013155822456550006, + "loss": 1.6277, + "step": 444 + }, + { + "epoch": 1.19, + "grad_norm": 0.17543216049671173, + "learning_rate": 0.00013128485380299408, + "loss": 1.6337, + "step": 445 + }, + { + "epoch": 1.2, + "grad_norm": 0.1888299435377121, + "learning_rate": 0.00013101122363206488, + "loss": 1.6009, + "step": 446 + }, + { + "epoch": 1.2, + "grad_norm": 0.20228877663612366, + "learning_rate": 0.00013073733632160494, + "loss": 1.5939, + "step": 447 + }, + { + "epoch": 1.2, + "grad_norm": 0.19537915289402008, + "learning_rate": 0.00013046319414263902, + "loss": 1.5944, + "step": 448 + }, + { + "epoch": 1.2, + "grad_norm": 0.18867214024066925, + "learning_rate": 0.00013018879936830505, + "loss": 1.5614, + "step": 449 + }, + { + "epoch": 1.21, + "grad_norm": 0.18890655040740967, + "learning_rate": 0.00012991415427383556, + "loss": 1.6572, + "step": 450 + }, + { + "epoch": 1.21, + "grad_norm": 0.19154617190361023, + "learning_rate": 0.00012963926113653863, + "loss": 1.4916, + "step": 451 + }, + { + "epoch": 1.21, + "grad_norm": 0.18493875861167908, + "learning_rate": 0.00012936412223577915, + "loss": 1.6289, + "step": 452 + }, + { + "epoch": 1.22, + "grad_norm": 0.18440605700016022, + "learning_rate": 0.00012908873985295968, + "loss": 1.5943, + "step": 453 + }, + { + "epoch": 1.22, + "grad_norm": 0.1951192021369934, + "learning_rate": 0.00012881311627150187, + "loss": 1.6152, + "step": 454 + }, + { + "epoch": 1.22, + "grad_norm": 0.18237607181072235, + "learning_rate": 0.00012853725377682718, + "loss": 1.7562, + "step": 455 + }, + { + "epoch": 1.22, + "grad_norm": 0.17004793882369995, + "learning_rate": 0.0001282611546563382, + "loss": 1.5222, + "step": 456 + }, + { + "epoch": 1.23, + "grad_norm": 0.19362376630306244, + "learning_rate": 0.00012798482119939956, + "loss": 1.5306, + "step": 457 + }, + { + "epoch": 1.23, + "grad_norm": 0.1877073347568512, + "learning_rate": 0.00012770825569731895, + "loss": 1.4202, + "step": 458 + }, + { + "epoch": 1.23, + "grad_norm": 0.177567720413208, + "learning_rate": 0.00012743146044332817, + "loss": 1.5566, + "step": 459 + }, + { + "epoch": 1.23, + "grad_norm": 0.20578047633171082, + "learning_rate": 0.00012715443773256402, + "loss": 1.4785, + "step": 460 + }, + { + "epoch": 1.23, + "eval_loss": 1.6219624280929565, + "eval_runtime": 53.6944, + "eval_samples_per_second": 93.064, + "eval_steps_per_second": 23.28, + "step": 460 + }, + { + "epoch": 1.24, + "grad_norm": 0.180220827460289, + "learning_rate": 0.0001268771898620494, + "loss": 1.6456, + "step": 461 + }, + { + "epoch": 1.24, + "grad_norm": 0.20168617367744446, + "learning_rate": 0.00012659971913067414, + "loss": 1.4979, + "step": 462 + }, + { + "epoch": 1.24, + "grad_norm": 0.1822381317615509, + "learning_rate": 0.000126322027839176, + "loss": 1.5769, + "step": 463 + }, + { + "epoch": 1.25, + "grad_norm": 0.18748806416988373, + "learning_rate": 0.00012604411829012166, + "loss": 1.6408, + "step": 464 + }, + { + "epoch": 1.25, + "grad_norm": 0.17949888110160828, + "learning_rate": 0.00012576599278788742, + "loss": 1.6341, + "step": 465 + }, + { + "epoch": 1.25, + "grad_norm": 0.19509534537792206, + "learning_rate": 0.00012548765363864036, + "loss": 1.5458, + "step": 466 + }, + { + "epoch": 1.25, + "grad_norm": 0.1767253279685974, + "learning_rate": 0.00012520910315031895, + "loss": 1.577, + "step": 467 + }, + { + "epoch": 1.26, + "grad_norm": 0.17574870586395264, + "learning_rate": 0.0001249303436326142, + "loss": 1.6089, + "step": 468 + }, + { + "epoch": 1.26, + "grad_norm": 0.17545011639595032, + "learning_rate": 0.0001246513773969502, + "loss": 1.6331, + "step": 469 + }, + { + "epoch": 1.26, + "grad_norm": 0.18224777281284332, + "learning_rate": 0.00012437220675646523, + "loss": 1.5737, + "step": 470 + }, + { + "epoch": 1.26, + "grad_norm": 0.20681160688400269, + "learning_rate": 0.00012409283402599238, + "loss": 1.5503, + "step": 471 + }, + { + "epoch": 1.27, + "grad_norm": 0.196259543299675, + "learning_rate": 0.0001238132615220405, + "loss": 1.6594, + "step": 472 + }, + { + "epoch": 1.27, + "grad_norm": 0.18110986053943634, + "learning_rate": 0.0001235334915627748, + "loss": 1.6193, + "step": 473 + }, + { + "epoch": 1.27, + "grad_norm": 0.18576951324939728, + "learning_rate": 0.00012325352646799795, + "loss": 1.6723, + "step": 474 + }, + { + "epoch": 1.28, + "grad_norm": 0.18934178352355957, + "learning_rate": 0.00012297336855913047, + "loss": 1.6192, + "step": 475 + }, + { + "epoch": 1.28, + "grad_norm": 0.185868501663208, + "learning_rate": 0.00012269302015919172, + "loss": 1.6163, + "step": 476 + }, + { + "epoch": 1.28, + "grad_norm": 0.16594283282756805, + "learning_rate": 0.00012241248359278064, + "loss": 1.6334, + "step": 477 + }, + { + "epoch": 1.28, + "grad_norm": 0.17133650183677673, + "learning_rate": 0.00012213176118605637, + "loss": 1.5889, + "step": 478 + }, + { + "epoch": 1.29, + "grad_norm": 0.1712963730096817, + "learning_rate": 0.00012185085526671893, + "loss": 1.5003, + "step": 479 + }, + { + "epoch": 1.29, + "grad_norm": 0.1756075769662857, + "learning_rate": 0.00012156976816399013, + "loss": 1.6512, + "step": 480 + }, + { + "epoch": 1.29, + "grad_norm": 0.17380216717720032, + "learning_rate": 0.00012128850220859397, + "loss": 1.4197, + "step": 481 + }, + { + "epoch": 1.29, + "grad_norm": 0.18200692534446716, + "learning_rate": 0.00012100705973273765, + "loss": 1.5587, + "step": 482 + }, + { + "epoch": 1.3, + "grad_norm": 0.17649267613887787, + "learning_rate": 0.00012072544307009184, + "loss": 1.662, + "step": 483 + }, + { + "epoch": 1.3, + "grad_norm": 0.20866695046424866, + "learning_rate": 0.00012044365455577171, + "loss": 1.5038, + "step": 484 + }, + { + "epoch": 1.3, + "grad_norm": 0.18661659955978394, + "learning_rate": 0.00012016169652631726, + "loss": 1.4872, + "step": 485 + }, + { + "epoch": 1.31, + "grad_norm": 0.19411514699459076, + "learning_rate": 0.00011987957131967418, + "loss": 1.5808, + "step": 486 + }, + { + "epoch": 1.31, + "grad_norm": 0.1833687424659729, + "learning_rate": 0.00011959728127517434, + "loss": 1.6578, + "step": 487 + }, + { + "epoch": 1.31, + "grad_norm": 0.2166387289762497, + "learning_rate": 0.00011931482873351632, + "loss": 1.6438, + "step": 488 + }, + { + "epoch": 1.31, + "grad_norm": 0.17330534756183624, + "learning_rate": 0.00011903221603674628, + "loss": 1.5918, + "step": 489 + }, + { + "epoch": 1.32, + "grad_norm": 0.17018438875675201, + "learning_rate": 0.00011874944552823817, + "loss": 1.5898, + "step": 490 + }, + { + "epoch": 1.32, + "grad_norm": 0.1763826310634613, + "learning_rate": 0.00011846651955267463, + "loss": 1.5376, + "step": 491 + }, + { + "epoch": 1.32, + "grad_norm": 0.18420416116714478, + "learning_rate": 0.00011818344045602727, + "loss": 1.5467, + "step": 492 + }, + { + "epoch": 1.32, + "grad_norm": 0.19673533737659454, + "learning_rate": 0.00011790021058553751, + "loss": 1.582, + "step": 493 + }, + { + "epoch": 1.33, + "grad_norm": 0.1835012137889862, + "learning_rate": 0.00011761683228969682, + "loss": 1.6558, + "step": 494 + }, + { + "epoch": 1.33, + "grad_norm": 0.18565072119235992, + "learning_rate": 0.00011733330791822749, + "loss": 1.5635, + "step": 495 + }, + { + "epoch": 1.33, + "grad_norm": 0.18372558057308197, + "learning_rate": 0.00011704963982206299, + "loss": 1.5643, + "step": 496 + }, + { + "epoch": 1.34, + "grad_norm": 0.17450691759586334, + "learning_rate": 0.00011676583035332853, + "loss": 1.5644, + "step": 497 + }, + { + "epoch": 1.34, + "grad_norm": 0.1876295804977417, + "learning_rate": 0.00011648188186532154, + "loss": 1.5997, + "step": 498 + }, + { + "epoch": 1.34, + "grad_norm": 0.19220595061779022, + "learning_rate": 0.00011619779671249223, + "loss": 1.6757, + "step": 499 + }, + { + "epoch": 1.34, + "grad_norm": 0.1927752047777176, + "learning_rate": 0.00011591357725042393, + "loss": 1.5843, + "step": 500 + }, + { + "epoch": 1.35, + "grad_norm": 0.18117199838161469, + "learning_rate": 0.00011562922583581375, + "loss": 1.6051, + "step": 501 + }, + { + "epoch": 1.35, + "grad_norm": 0.18213218450546265, + "learning_rate": 0.0001153447448264528, + "loss": 1.5653, + "step": 502 + }, + { + "epoch": 1.35, + "grad_norm": 0.18642397224903107, + "learning_rate": 0.00011506013658120687, + "loss": 1.5057, + "step": 503 + }, + { + "epoch": 1.35, + "grad_norm": 0.18502803146839142, + "learning_rate": 0.00011477540345999669, + "loss": 1.5657, + "step": 504 + }, + { + "epoch": 1.36, + "grad_norm": 0.1808471977710724, + "learning_rate": 0.00011449054782377855, + "loss": 1.6401, + "step": 505 + }, + { + "epoch": 1.36, + "grad_norm": 0.18637515604496002, + "learning_rate": 0.00011420557203452444, + "loss": 1.5472, + "step": 506 + }, + { + "epoch": 1.36, + "grad_norm": 0.19144316017627716, + "learning_rate": 0.00011392047845520282, + "loss": 1.5232, + "step": 507 + }, + { + "epoch": 1.37, + "grad_norm": 0.17375171184539795, + "learning_rate": 0.00011363526944975867, + "loss": 1.5129, + "step": 508 + }, + { + "epoch": 1.37, + "grad_norm": 0.18536758422851562, + "learning_rate": 0.00011334994738309423, + "loss": 1.5233, + "step": 509 + }, + { + "epoch": 1.37, + "grad_norm": 0.1873825639486313, + "learning_rate": 0.00011306451462104912, + "loss": 1.6106, + "step": 510 + }, + { + "epoch": 1.37, + "grad_norm": 0.18517866730690002, + "learning_rate": 0.00011277897353038085, + "loss": 1.5758, + "step": 511 + }, + { + "epoch": 1.38, + "grad_norm": 0.18197666108608246, + "learning_rate": 0.00011249332647874513, + "loss": 1.5425, + "step": 512 + }, + { + "epoch": 1.38, + "grad_norm": 0.17675837874412537, + "learning_rate": 0.00011220757583467643, + "loss": 1.548, + "step": 513 + }, + { + "epoch": 1.38, + "grad_norm": 0.18274517357349396, + "learning_rate": 0.00011192172396756797, + "loss": 1.5736, + "step": 514 + }, + { + "epoch": 1.38, + "grad_norm": 0.24651746451854706, + "learning_rate": 0.00011163577324765248, + "loss": 1.5953, + "step": 515 + }, + { + "epoch": 1.39, + "grad_norm": 0.18200458586215973, + "learning_rate": 0.00011134972604598224, + "loss": 1.5957, + "step": 516 + }, + { + "epoch": 1.39, + "grad_norm": 0.19455678761005402, + "learning_rate": 0.00011106358473440963, + "loss": 1.659, + "step": 517 + }, + { + "epoch": 1.39, + "grad_norm": 0.19950920343399048, + "learning_rate": 0.00011077735168556729, + "loss": 1.5277, + "step": 518 + }, + { + "epoch": 1.4, + "grad_norm": 0.19501787424087524, + "learning_rate": 0.00011049102927284857, + "loss": 1.612, + "step": 519 + }, + { + "epoch": 1.4, + "grad_norm": 0.1821345090866089, + "learning_rate": 0.00011020461987038781, + "loss": 1.5435, + "step": 520 + }, + { + "epoch": 1.4, + "grad_norm": 0.17476962506771088, + "learning_rate": 0.00010991812585304069, + "loss": 1.5899, + "step": 521 + }, + { + "epoch": 1.4, + "grad_norm": 0.1734679490327835, + "learning_rate": 0.00010963154959636438, + "loss": 1.675, + "step": 522 + }, + { + "epoch": 1.41, + "grad_norm": 0.19216276705265045, + "learning_rate": 0.00010934489347659816, + "loss": 1.5913, + "step": 523 + }, + { + "epoch": 1.41, + "grad_norm": 0.18504251539707184, + "learning_rate": 0.00010905815987064328, + "loss": 1.5604, + "step": 524 + }, + { + "epoch": 1.41, + "grad_norm": 0.1940964013338089, + "learning_rate": 0.00010877135115604372, + "loss": 1.5155, + "step": 525 + }, + { + "epoch": 1.41, + "grad_norm": 0.19056075811386108, + "learning_rate": 0.00010848446971096606, + "loss": 1.5704, + "step": 526 + }, + { + "epoch": 1.42, + "grad_norm": 0.18702279031276703, + "learning_rate": 0.00010819751791418011, + "loss": 1.5397, + "step": 527 + }, + { + "epoch": 1.42, + "grad_norm": 0.1916041225194931, + "learning_rate": 0.00010791049814503888, + "loss": 1.5917, + "step": 528 + }, + { + "epoch": 1.42, + "grad_norm": 0.18013226985931396, + "learning_rate": 0.00010762341278345909, + "loss": 1.5779, + "step": 529 + }, + { + "epoch": 1.43, + "grad_norm": 0.1704178899526596, + "learning_rate": 0.00010733626420990134, + "loss": 1.5725, + "step": 530 + }, + { + "epoch": 1.43, + "grad_norm": 0.19062383472919464, + "learning_rate": 0.0001070490548053503, + "loss": 1.6347, + "step": 531 + }, + { + "epoch": 1.43, + "grad_norm": 0.18978549540042877, + "learning_rate": 0.00010676178695129513, + "loss": 1.5762, + "step": 532 + }, + { + "epoch": 1.43, + "grad_norm": 0.18251344561576843, + "learning_rate": 0.00010647446302970954, + "loss": 1.5688, + "step": 533 + }, + { + "epoch": 1.44, + "grad_norm": 0.17656119167804718, + "learning_rate": 0.00010618708542303226, + "loss": 1.5179, + "step": 534 + }, + { + "epoch": 1.44, + "grad_norm": 0.19226515293121338, + "learning_rate": 0.00010589965651414704, + "loss": 1.6262, + "step": 535 + }, + { + "epoch": 1.44, + "grad_norm": 0.19419853389263153, + "learning_rate": 0.00010561217868636313, + "loss": 1.5603, + "step": 536 + }, + { + "epoch": 1.44, + "grad_norm": 0.1746627241373062, + "learning_rate": 0.00010532465432339538, + "loss": 1.5841, + "step": 537 + }, + { + "epoch": 1.45, + "grad_norm": 0.17498454451560974, + "learning_rate": 0.00010503708580934442, + "loss": 1.6575, + "step": 538 + }, + { + "epoch": 1.45, + "grad_norm": 0.17727667093276978, + "learning_rate": 0.00010474947552867706, + "loss": 1.5722, + "step": 539 + }, + { + "epoch": 1.45, + "grad_norm": 0.19356906414031982, + "learning_rate": 0.00010446182586620648, + "loss": 1.5017, + "step": 540 + }, + { + "epoch": 1.46, + "grad_norm": 0.18533338606357574, + "learning_rate": 0.00010417413920707222, + "loss": 1.5829, + "step": 541 + }, + { + "epoch": 1.46, + "grad_norm": 0.18073119223117828, + "learning_rate": 0.00010388641793672078, + "loss": 1.6755, + "step": 542 + }, + { + "epoch": 1.46, + "grad_norm": 0.1906929314136505, + "learning_rate": 0.00010359866444088555, + "loss": 1.5658, + "step": 543 + }, + { + "epoch": 1.46, + "grad_norm": 0.186784565448761, + "learning_rate": 0.00010331088110556717, + "loss": 1.5248, + "step": 544 + }, + { + "epoch": 1.47, + "grad_norm": 0.20720143616199493, + "learning_rate": 0.00010302307031701364, + "loss": 1.628, + "step": 545 + }, + { + "epoch": 1.47, + "grad_norm": 0.16866104304790497, + "learning_rate": 0.0001027352344617007, + "loss": 1.5764, + "step": 546 + }, + { + "epoch": 1.47, + "grad_norm": 0.19025076925754547, + "learning_rate": 0.00010244737592631181, + "loss": 1.7834, + "step": 547 + }, + { + "epoch": 1.47, + "grad_norm": 0.19599542021751404, + "learning_rate": 0.00010215949709771866, + "loss": 1.5223, + "step": 548 + }, + { + "epoch": 1.48, + "grad_norm": 0.2109784334897995, + "learning_rate": 0.00010187160036296103, + "loss": 1.5431, + "step": 549 + }, + { + "epoch": 1.48, + "grad_norm": 0.2028791755437851, + "learning_rate": 0.00010158368810922729, + "loss": 1.5948, + "step": 550 + }, + { + "epoch": 1.48, + "grad_norm": 0.20557260513305664, + "learning_rate": 0.00010129576272383445, + "loss": 1.6074, + "step": 551 + }, + { + "epoch": 1.49, + "grad_norm": 0.20252369344234467, + "learning_rate": 0.0001010078265942084, + "loss": 1.4492, + "step": 552 + }, + { + "epoch": 1.49, + "eval_loss": 1.6022814512252808, + "eval_runtime": 54.0178, + "eval_samples_per_second": 92.507, + "eval_steps_per_second": 23.141, + "step": 552 + }, + { + "epoch": 1.49, + "grad_norm": 0.18352779746055603, + "learning_rate": 0.00010071988210786411, + "loss": 1.5881, + "step": 553 + }, + { + "epoch": 1.49, + "grad_norm": 0.2220783829689026, + "learning_rate": 0.00010043193165238591, + "loss": 1.4926, + "step": 554 + }, + { + "epoch": 1.49, + "grad_norm": 0.19979965686798096, + "learning_rate": 0.00010014397761540755, + "loss": 1.6466, + "step": 555 + }, + { + "epoch": 1.5, + "grad_norm": 0.17908692359924316, + "learning_rate": 9.985602238459247e-05, + "loss": 1.6098, + "step": 556 + }, + { + "epoch": 1.5, + "grad_norm": 0.19138608872890472, + "learning_rate": 9.956806834761411e-05, + "loss": 1.6106, + "step": 557 + }, + { + "epoch": 1.5, + "grad_norm": 0.19390954077243805, + "learning_rate": 9.92801178921359e-05, + "loss": 1.6256, + "step": 558 + }, + { + "epoch": 1.5, + "grad_norm": 0.17675888538360596, + "learning_rate": 9.899217340579164e-05, + "loss": 1.558, + "step": 559 + }, + { + "epoch": 1.51, + "grad_norm": 0.18734760582447052, + "learning_rate": 9.870423727616558e-05, + "loss": 1.5597, + "step": 560 + }, + { + "epoch": 1.51, + "grad_norm": 0.2040862888097763, + "learning_rate": 9.841631189077269e-05, + "loss": 1.5835, + "step": 561 + }, + { + "epoch": 1.51, + "grad_norm": 0.18791481852531433, + "learning_rate": 9.812839963703899e-05, + "loss": 1.5414, + "step": 562 + }, + { + "epoch": 1.52, + "grad_norm": 0.219825878739357, + "learning_rate": 9.784050290228134e-05, + "loss": 1.3138, + "step": 563 + }, + { + "epoch": 1.52, + "grad_norm": 0.19013109803199768, + "learning_rate": 9.75526240736882e-05, + "loss": 1.4212, + "step": 564 + }, + { + "epoch": 1.52, + "grad_norm": 0.17495863139629364, + "learning_rate": 9.726476553829932e-05, + "loss": 1.535, + "step": 565 + }, + { + "epoch": 1.52, + "grad_norm": 0.19723224639892578, + "learning_rate": 9.69769296829864e-05, + "loss": 1.5841, + "step": 566 + }, + { + "epoch": 1.53, + "grad_norm": 0.17984871566295624, + "learning_rate": 9.668911889443285e-05, + "loss": 1.5363, + "step": 567 + }, + { + "epoch": 1.53, + "grad_norm": 0.23778770864009857, + "learning_rate": 9.640133555911449e-05, + "loss": 1.4199, + "step": 568 + }, + { + "epoch": 1.53, + "grad_norm": 0.1896432489156723, + "learning_rate": 9.611358206327923e-05, + "loss": 1.4074, + "step": 569 + }, + { + "epoch": 1.53, + "grad_norm": 0.18232782185077667, + "learning_rate": 9.582586079292778e-05, + "loss": 1.5881, + "step": 570 + }, + { + "epoch": 1.54, + "grad_norm": 0.19662512838840485, + "learning_rate": 9.553817413379356e-05, + "loss": 1.563, + "step": 571 + }, + { + "epoch": 1.54, + "grad_norm": 0.2098013013601303, + "learning_rate": 9.525052447132294e-05, + "loss": 1.5127, + "step": 572 + }, + { + "epoch": 1.54, + "grad_norm": 0.19347095489501953, + "learning_rate": 9.496291419065561e-05, + "loss": 1.6533, + "step": 573 + }, + { + "epoch": 1.55, + "grad_norm": 0.18512225151062012, + "learning_rate": 9.467534567660466e-05, + "loss": 1.5451, + "step": 574 + }, + { + "epoch": 1.55, + "grad_norm": 0.21681411564350128, + "learning_rate": 9.43878213136369e-05, + "loss": 1.4848, + "step": 575 + }, + { + "epoch": 1.55, + "grad_norm": 0.17953048646450043, + "learning_rate": 9.410034348585298e-05, + "loss": 1.6384, + "step": 576 + }, + { + "epoch": 1.55, + "grad_norm": 0.20537644624710083, + "learning_rate": 9.381291457696779e-05, + "loss": 1.6757, + "step": 577 + }, + { + "epoch": 1.56, + "grad_norm": 0.19091413915157318, + "learning_rate": 9.352553697029048e-05, + "loss": 1.6759, + "step": 578 + }, + { + "epoch": 1.56, + "grad_norm": 0.19432410597801208, + "learning_rate": 9.323821304870489e-05, + "loss": 1.6414, + "step": 579 + }, + { + "epoch": 1.56, + "grad_norm": 0.21516050398349762, + "learning_rate": 9.295094519464972e-05, + "loss": 1.468, + "step": 580 + }, + { + "epoch": 1.56, + "grad_norm": 0.18204712867736816, + "learning_rate": 9.266373579009867e-05, + "loss": 1.5371, + "step": 581 + }, + { + "epoch": 1.57, + "grad_norm": 0.18101070821285248, + "learning_rate": 9.237658721654092e-05, + "loss": 1.5876, + "step": 582 + }, + { + "epoch": 1.57, + "grad_norm": 0.1908547580242157, + "learning_rate": 9.208950185496114e-05, + "loss": 1.4907, + "step": 583 + }, + { + "epoch": 1.57, + "grad_norm": 0.20174852013587952, + "learning_rate": 9.180248208581994e-05, + "loss": 1.5133, + "step": 584 + }, + { + "epoch": 1.58, + "grad_norm": 0.20399607717990875, + "learning_rate": 9.151553028903396e-05, + "loss": 1.6886, + "step": 585 + }, + { + "epoch": 1.58, + "grad_norm": 0.1815037727355957, + "learning_rate": 9.122864884395633e-05, + "loss": 1.5567, + "step": 586 + }, + { + "epoch": 1.58, + "grad_norm": 0.19035212695598602, + "learning_rate": 9.094184012935674e-05, + "loss": 1.6067, + "step": 587 + }, + { + "epoch": 1.58, + "grad_norm": 0.18987087905406952, + "learning_rate": 9.065510652340185e-05, + "loss": 1.6233, + "step": 588 + }, + { + "epoch": 1.59, + "grad_norm": 0.19988678395748138, + "learning_rate": 9.036845040363562e-05, + "loss": 1.494, + "step": 589 + }, + { + "epoch": 1.59, + "grad_norm": 0.18984133005142212, + "learning_rate": 9.008187414695932e-05, + "loss": 1.5683, + "step": 590 + }, + { + "epoch": 1.59, + "grad_norm": 0.18878312408924103, + "learning_rate": 8.979538012961221e-05, + "loss": 1.6567, + "step": 591 + }, + { + "epoch": 1.59, + "grad_norm": 0.1855538934469223, + "learning_rate": 8.950897072715144e-05, + "loss": 1.5954, + "step": 592 + }, + { + "epoch": 1.6, + "grad_norm": 0.24248315393924713, + "learning_rate": 8.922264831443274e-05, + "loss": 1.5471, + "step": 593 + }, + { + "epoch": 1.6, + "grad_norm": 0.19141511619091034, + "learning_rate": 8.89364152655904e-05, + "loss": 1.4948, + "step": 594 + }, + { + "epoch": 1.6, + "grad_norm": 0.19037242233753204, + "learning_rate": 8.865027395401778e-05, + "loss": 1.4857, + "step": 595 + }, + { + "epoch": 1.61, + "grad_norm": 0.19110530614852905, + "learning_rate": 8.836422675234754e-05, + "loss": 1.5597, + "step": 596 + }, + { + "epoch": 1.61, + "grad_norm": 0.24663759768009186, + "learning_rate": 8.807827603243204e-05, + "loss": 1.4354, + "step": 597 + }, + { + "epoch": 1.61, + "grad_norm": 0.19291803240776062, + "learning_rate": 8.779242416532361e-05, + "loss": 1.5331, + "step": 598 + }, + { + "epoch": 1.61, + "grad_norm": 0.20057077705860138, + "learning_rate": 8.750667352125487e-05, + "loss": 1.4771, + "step": 599 + }, + { + "epoch": 1.62, + "grad_norm": 0.18224692344665527, + "learning_rate": 8.722102646961919e-05, + "loss": 1.5867, + "step": 600 + }, + { + "epoch": 1.62, + "grad_norm": 0.20015600323677063, + "learning_rate": 8.69354853789509e-05, + "loss": 1.5603, + "step": 601 + }, + { + "epoch": 1.62, + "grad_norm": 0.19215072691440582, + "learning_rate": 8.665005261690579e-05, + "loss": 1.5803, + "step": 602 + }, + { + "epoch": 1.62, + "grad_norm": 0.17882663011550903, + "learning_rate": 8.636473055024134e-05, + "loss": 1.6111, + "step": 603 + }, + { + "epoch": 1.63, + "grad_norm": 0.20356720685958862, + "learning_rate": 8.607952154479723e-05, + "loss": 1.4877, + "step": 604 + }, + { + "epoch": 1.63, + "grad_norm": 0.1832900196313858, + "learning_rate": 8.579442796547558e-05, + "loss": 1.5668, + "step": 605 + }, + { + "epoch": 1.63, + "grad_norm": 0.20597009360790253, + "learning_rate": 8.550945217622146e-05, + "loss": 1.5496, + "step": 606 + }, + { + "epoch": 1.64, + "grad_norm": 0.19200527667999268, + "learning_rate": 8.522459654000332e-05, + "loss": 1.6538, + "step": 607 + }, + { + "epoch": 1.64, + "grad_norm": 0.19875632226467133, + "learning_rate": 8.493986341879314e-05, + "loss": 1.5669, + "step": 608 + }, + { + "epoch": 1.64, + "grad_norm": 0.1814192533493042, + "learning_rate": 8.465525517354724e-05, + "loss": 1.5273, + "step": 609 + }, + { + "epoch": 1.64, + "grad_norm": 0.19254544377326965, + "learning_rate": 8.437077416418627e-05, + "loss": 1.5282, + "step": 610 + }, + { + "epoch": 1.65, + "grad_norm": 0.20155031979084015, + "learning_rate": 8.408642274957612e-05, + "loss": 1.5957, + "step": 611 + }, + { + "epoch": 1.65, + "grad_norm": 0.19522641599178314, + "learning_rate": 8.380220328750781e-05, + "loss": 1.6277, + "step": 612 + }, + { + "epoch": 1.65, + "grad_norm": 0.18992267549037933, + "learning_rate": 8.351811813467851e-05, + "loss": 1.5687, + "step": 613 + }, + { + "epoch": 1.65, + "grad_norm": 0.18783116340637207, + "learning_rate": 8.32341696466715e-05, + "loss": 1.5739, + "step": 614 + }, + { + "epoch": 1.66, + "grad_norm": 0.17621149122714996, + "learning_rate": 8.295036017793702e-05, + "loss": 1.5197, + "step": 615 + }, + { + "epoch": 1.66, + "grad_norm": 0.2008402794599533, + "learning_rate": 8.266669208177252e-05, + "loss": 1.5905, + "step": 616 + }, + { + "epoch": 1.66, + "grad_norm": 0.212905153632164, + "learning_rate": 8.238316771030318e-05, + "loss": 1.6655, + "step": 617 + }, + { + "epoch": 1.67, + "grad_norm": 0.17801809310913086, + "learning_rate": 8.209978941446252e-05, + "loss": 1.6393, + "step": 618 + }, + { + "epoch": 1.67, + "grad_norm": 0.1957562416791916, + "learning_rate": 8.181655954397275e-05, + "loss": 1.4709, + "step": 619 + }, + { + "epoch": 1.67, + "grad_norm": 0.19428160786628723, + "learning_rate": 8.153348044732543e-05, + "loss": 1.4383, + "step": 620 + }, + { + "epoch": 1.67, + "grad_norm": 0.20604220032691956, + "learning_rate": 8.125055447176186e-05, + "loss": 1.5547, + "step": 621 + }, + { + "epoch": 1.68, + "grad_norm": 0.18895936012268066, + "learning_rate": 8.096778396325377e-05, + "loss": 1.5027, + "step": 622 + }, + { + "epoch": 1.68, + "grad_norm": 0.1925646960735321, + "learning_rate": 8.068517126648369e-05, + "loss": 1.6032, + "step": 623 + }, + { + "epoch": 1.68, + "grad_norm": 0.20500631630420685, + "learning_rate": 8.04027187248257e-05, + "loss": 1.62, + "step": 624 + }, + { + "epoch": 1.68, + "grad_norm": 0.19827888906002045, + "learning_rate": 8.012042868032585e-05, + "loss": 1.5958, + "step": 625 + }, + { + "epoch": 1.69, + "grad_norm": 0.18163664638996124, + "learning_rate": 7.983830347368276e-05, + "loss": 1.5118, + "step": 626 + }, + { + "epoch": 1.69, + "grad_norm": 0.20141161978244781, + "learning_rate": 7.955634544422834e-05, + "loss": 1.54, + "step": 627 + }, + { + "epoch": 1.69, + "grad_norm": 0.1766718327999115, + "learning_rate": 7.927455692990818e-05, + "loss": 1.6012, + "step": 628 + }, + { + "epoch": 1.7, + "grad_norm": 0.1866627186536789, + "learning_rate": 7.899294026726241e-05, + "loss": 1.529, + "step": 629 + }, + { + "epoch": 1.7, + "grad_norm": 0.19402055442333221, + "learning_rate": 7.871149779140604e-05, + "loss": 1.4608, + "step": 630 + }, + { + "epoch": 1.7, + "grad_norm": 0.19119343161582947, + "learning_rate": 7.843023183600988e-05, + "loss": 1.5737, + "step": 631 + }, + { + "epoch": 1.7, + "grad_norm": 0.2043522596359253, + "learning_rate": 7.81491447332811e-05, + "loss": 1.5267, + "step": 632 + }, + { + "epoch": 1.71, + "grad_norm": 0.20135103166103363, + "learning_rate": 7.786823881394364e-05, + "loss": 1.5529, + "step": 633 + }, + { + "epoch": 1.71, + "grad_norm": 0.20661364495754242, + "learning_rate": 7.758751640721937e-05, + "loss": 1.6185, + "step": 634 + }, + { + "epoch": 1.71, + "grad_norm": 0.19659817218780518, + "learning_rate": 7.730697984080827e-05, + "loss": 1.5582, + "step": 635 + }, + { + "epoch": 1.71, + "grad_norm": 0.1995358020067215, + "learning_rate": 7.702663144086957e-05, + "loss": 1.6023, + "step": 636 + }, + { + "epoch": 1.72, + "grad_norm": 0.23160699009895325, + "learning_rate": 7.674647353200208e-05, + "loss": 1.4295, + "step": 637 + }, + { + "epoch": 1.72, + "grad_norm": 0.19710291922092438, + "learning_rate": 7.64665084372252e-05, + "loss": 1.5295, + "step": 638 + }, + { + "epoch": 1.72, + "grad_norm": 0.20388691127300262, + "learning_rate": 7.618673847795953e-05, + "loss": 1.4763, + "step": 639 + }, + { + "epoch": 1.72, + "grad_norm": 0.196650430560112, + "learning_rate": 7.590716597400761e-05, + "loss": 1.5664, + "step": 640 + }, + { + "epoch": 1.73, + "grad_norm": 0.19940368831157684, + "learning_rate": 7.562779324353477e-05, + "loss": 1.537, + "step": 641 + }, + { + "epoch": 1.73, + "grad_norm": 0.18506959080696106, + "learning_rate": 7.53486226030498e-05, + "loss": 1.5549, + "step": 642 + }, + { + "epoch": 1.73, + "grad_norm": 0.19527395069599152, + "learning_rate": 7.506965636738583e-05, + "loss": 1.5862, + "step": 643 + }, + { + "epoch": 1.74, + "grad_norm": 0.19912207126617432, + "learning_rate": 7.479089684968106e-05, + "loss": 1.6224, + "step": 644 + }, + { + "epoch": 1.74, + "eval_loss": 1.588733434677124, + "eval_runtime": 53.8386, + "eval_samples_per_second": 92.814, + "eval_steps_per_second": 23.218, + "step": 644 + }, + { + "epoch": 1.74, + "grad_norm": 0.17763713002204895, + "learning_rate": 7.451234636135969e-05, + "loss": 1.5622, + "step": 645 + }, + { + "epoch": 1.74, + "grad_norm": 0.19509463012218475, + "learning_rate": 7.42340072121126e-05, + "loss": 1.5732, + "step": 646 + }, + { + "epoch": 1.74, + "grad_norm": 0.19236071407794952, + "learning_rate": 7.395588170987839e-05, + "loss": 1.4118, + "step": 647 + }, + { + "epoch": 1.75, + "grad_norm": 0.18488897383213043, + "learning_rate": 7.367797216082402e-05, + "loss": 1.5281, + "step": 648 + }, + { + "epoch": 1.75, + "grad_norm": 0.19441011548042297, + "learning_rate": 7.340028086932587e-05, + "loss": 1.5507, + "step": 649 + }, + { + "epoch": 1.75, + "grad_norm": 0.20152153074741364, + "learning_rate": 7.312281013795064e-05, + "loss": 1.5329, + "step": 650 + }, + { + "epoch": 1.75, + "grad_norm": 0.20338155329227448, + "learning_rate": 7.284556226743598e-05, + "loss": 1.5696, + "step": 651 + }, + { + "epoch": 1.76, + "grad_norm": 0.19269070029258728, + "learning_rate": 7.256853955667187e-05, + "loss": 1.6863, + "step": 652 + }, + { + "epoch": 1.76, + "grad_norm": 0.18623311817646027, + "learning_rate": 7.229174430268104e-05, + "loss": 1.5835, + "step": 653 + }, + { + "epoch": 1.76, + "grad_norm": 0.1871241182088852, + "learning_rate": 7.201517880060049e-05, + "loss": 1.6267, + "step": 654 + }, + { + "epoch": 1.77, + "grad_norm": 0.18970423936843872, + "learning_rate": 7.173884534366182e-05, + "loss": 1.5929, + "step": 655 + }, + { + "epoch": 1.77, + "grad_norm": 0.2575983703136444, + "learning_rate": 7.146274622317288e-05, + "loss": 1.5495, + "step": 656 + }, + { + "epoch": 1.77, + "grad_norm": 0.17981579899787903, + "learning_rate": 7.118688372849815e-05, + "loss": 1.4926, + "step": 657 + }, + { + "epoch": 1.77, + "grad_norm": 0.19219791889190674, + "learning_rate": 7.091126014704032e-05, + "loss": 1.5298, + "step": 658 + }, + { + "epoch": 1.78, + "grad_norm": 0.2046176642179489, + "learning_rate": 7.063587776422088e-05, + "loss": 1.4501, + "step": 659 + }, + { + "epoch": 1.78, + "grad_norm": 0.19982139766216278, + "learning_rate": 7.036073886346138e-05, + "loss": 1.5229, + "step": 660 + }, + { + "epoch": 1.78, + "grad_norm": 0.18925295770168304, + "learning_rate": 7.008584572616448e-05, + "loss": 1.6139, + "step": 661 + }, + { + "epoch": 1.78, + "grad_norm": 0.19633693993091583, + "learning_rate": 6.981120063169499e-05, + "loss": 1.7017, + "step": 662 + }, + { + "epoch": 1.79, + "grad_norm": 0.21355824172496796, + "learning_rate": 6.953680585736105e-05, + "loss": 1.5256, + "step": 663 + }, + { + "epoch": 1.79, + "grad_norm": 0.2261500209569931, + "learning_rate": 6.926266367839508e-05, + "loss": 1.423, + "step": 664 + }, + { + "epoch": 1.79, + "grad_norm": 0.20399561524391174, + "learning_rate": 6.898877636793517e-05, + "loss": 1.5569, + "step": 665 + }, + { + "epoch": 1.8, + "grad_norm": 0.17641954123973846, + "learning_rate": 6.871514619700594e-05, + "loss": 1.5215, + "step": 666 + }, + { + "epoch": 1.8, + "grad_norm": 0.23922847211360931, + "learning_rate": 6.844177543449997e-05, + "loss": 1.365, + "step": 667 + }, + { + "epoch": 1.8, + "grad_norm": 0.17896459996700287, + "learning_rate": 6.816866634715881e-05, + "loss": 1.6711, + "step": 668 + }, + { + "epoch": 1.8, + "grad_norm": 0.1928153783082962, + "learning_rate": 6.789582119955424e-05, + "loss": 1.6874, + "step": 669 + }, + { + "epoch": 1.81, + "grad_norm": 0.18445910513401031, + "learning_rate": 6.762324225406957e-05, + "loss": 1.6136, + "step": 670 + }, + { + "epoch": 1.81, + "grad_norm": 0.19517773389816284, + "learning_rate": 6.73509317708807e-05, + "loss": 1.5981, + "step": 671 + }, + { + "epoch": 1.81, + "grad_norm": 0.20011338591575623, + "learning_rate": 6.70788920079376e-05, + "loss": 1.6253, + "step": 672 + }, + { + "epoch": 1.81, + "grad_norm": 0.19087383151054382, + "learning_rate": 6.680712522094537e-05, + "loss": 1.5274, + "step": 673 + }, + { + "epoch": 1.82, + "grad_norm": 0.1847938448190689, + "learning_rate": 6.653563366334577e-05, + "loss": 1.4764, + "step": 674 + }, + { + "epoch": 1.82, + "grad_norm": 0.1973254382610321, + "learning_rate": 6.626441958629826e-05, + "loss": 1.5333, + "step": 675 + }, + { + "epoch": 1.82, + "grad_norm": 0.18505576252937317, + "learning_rate": 6.599348523866155e-05, + "loss": 1.4504, + "step": 676 + }, + { + "epoch": 1.83, + "grad_norm": 0.2034735381603241, + "learning_rate": 6.572283286697496e-05, + "loss": 1.4913, + "step": 677 + }, + { + "epoch": 1.83, + "grad_norm": 0.21279850602149963, + "learning_rate": 6.54524647154396e-05, + "loss": 1.566, + "step": 678 + }, + { + "epoch": 1.83, + "grad_norm": 0.2010655552148819, + "learning_rate": 6.518238302589994e-05, + "loss": 1.5354, + "step": 679 + }, + { + "epoch": 1.83, + "grad_norm": 0.19974732398986816, + "learning_rate": 6.491259003782511e-05, + "loss": 1.4899, + "step": 680 + }, + { + "epoch": 1.84, + "grad_norm": 0.19252151250839233, + "learning_rate": 6.464308798829043e-05, + "loss": 1.6117, + "step": 681 + }, + { + "epoch": 1.84, + "grad_norm": 0.18204721808433533, + "learning_rate": 6.437387911195875e-05, + "loss": 1.5365, + "step": 682 + }, + { + "epoch": 1.84, + "grad_norm": 0.19013364613056183, + "learning_rate": 6.410496564106207e-05, + "loss": 1.6737, + "step": 683 + }, + { + "epoch": 1.84, + "grad_norm": 0.19772353768348694, + "learning_rate": 6.383634980538286e-05, + "loss": 1.6396, + "step": 684 + }, + { + "epoch": 1.85, + "grad_norm": 0.19503507018089294, + "learning_rate": 6.356803383223569e-05, + "loss": 1.6221, + "step": 685 + }, + { + "epoch": 1.85, + "grad_norm": 0.18829859793186188, + "learning_rate": 6.33000199464487e-05, + "loss": 1.5017, + "step": 686 + }, + { + "epoch": 1.85, + "grad_norm": 0.18767286837100983, + "learning_rate": 6.30323103703452e-05, + "loss": 1.6461, + "step": 687 + }, + { + "epoch": 1.86, + "grad_norm": 0.19198834896087646, + "learning_rate": 6.276490732372522e-05, + "loss": 1.6037, + "step": 688 + }, + { + "epoch": 1.86, + "grad_norm": 0.21538251638412476, + "learning_rate": 6.249781302384705e-05, + "loss": 1.5674, + "step": 689 + }, + { + "epoch": 1.86, + "grad_norm": 0.20613306760787964, + "learning_rate": 6.2231029685409e-05, + "loss": 1.5887, + "step": 690 + }, + { + "epoch": 1.86, + "grad_norm": 0.19516527652740479, + "learning_rate": 6.196455952053084e-05, + "loss": 1.5408, + "step": 691 + }, + { + "epoch": 1.87, + "grad_norm": 0.19336941838264465, + "learning_rate": 6.169840473873565e-05, + "loss": 1.5313, + "step": 692 + }, + { + "epoch": 1.87, + "grad_norm": 0.19075316190719604, + "learning_rate": 6.143256754693134e-05, + "loss": 1.6081, + "step": 693 + }, + { + "epoch": 1.87, + "grad_norm": 0.1863541156053543, + "learning_rate": 6.116705014939246e-05, + "loss": 1.5864, + "step": 694 + }, + { + "epoch": 1.87, + "grad_norm": 0.2056710124015808, + "learning_rate": 6.090185474774192e-05, + "loss": 1.5778, + "step": 695 + }, + { + "epoch": 1.88, + "grad_norm": 0.17553465068340302, + "learning_rate": 6.063698354093255e-05, + "loss": 1.6269, + "step": 696 + }, + { + "epoch": 1.88, + "grad_norm": 0.20064443349838257, + "learning_rate": 6.037243872522924e-05, + "loss": 1.5933, + "step": 697 + }, + { + "epoch": 1.88, + "grad_norm": 0.19043272733688354, + "learning_rate": 6.010822249419027e-05, + "loss": 1.6484, + "step": 698 + }, + { + "epoch": 1.89, + "grad_norm": 0.23049475252628326, + "learning_rate": 5.984433703864956e-05, + "loss": 1.4464, + "step": 699 + }, + { + "epoch": 1.89, + "grad_norm": 0.17904847860336304, + "learning_rate": 5.95807845466981e-05, + "loss": 1.5606, + "step": 700 + }, + { + "epoch": 1.89, + "grad_norm": 0.19818058609962463, + "learning_rate": 5.931756720366621e-05, + "loss": 1.5782, + "step": 701 + }, + { + "epoch": 1.89, + "grad_norm": 0.24131308495998383, + "learning_rate": 5.9054687192105026e-05, + "loss": 1.4238, + "step": 702 + }, + { + "epoch": 1.9, + "grad_norm": 0.20596317946910858, + "learning_rate": 5.8792146691768726e-05, + "loss": 1.5865, + "step": 703 + }, + { + "epoch": 1.9, + "grad_norm": 0.19873526692390442, + "learning_rate": 5.852994787959628e-05, + "loss": 1.496, + "step": 704 + }, + { + "epoch": 1.9, + "grad_norm": 0.2155202329158783, + "learning_rate": 5.8268092929693405e-05, + "loss": 1.4759, + "step": 705 + }, + { + "epoch": 1.9, + "grad_norm": 0.20002122223377228, + "learning_rate": 5.800658401331467e-05, + "loss": 1.5385, + "step": 706 + }, + { + "epoch": 1.91, + "grad_norm": 0.19707627594470978, + "learning_rate": 5.774542329884518e-05, + "loss": 1.6369, + "step": 707 + }, + { + "epoch": 1.91, + "grad_norm": 0.2333543300628662, + "learning_rate": 5.748461295178315e-05, + "loss": 1.5858, + "step": 708 + }, + { + "epoch": 1.91, + "grad_norm": 0.1858336627483368, + "learning_rate": 5.722415513472128e-05, + "loss": 1.5524, + "step": 709 + }, + { + "epoch": 1.92, + "grad_norm": 0.1900344043970108, + "learning_rate": 5.696405200732939e-05, + "loss": 1.6238, + "step": 710 + }, + { + "epoch": 1.92, + "grad_norm": 0.18405009806156158, + "learning_rate": 5.670430572633607e-05, + "loss": 1.5743, + "step": 711 + }, + { + "epoch": 1.92, + "grad_norm": 0.22856737673282623, + "learning_rate": 5.644491844551121e-05, + "loss": 1.5763, + "step": 712 + }, + { + "epoch": 1.92, + "grad_norm": 0.1876060515642166, + "learning_rate": 5.6185892315647856e-05, + "loss": 1.6686, + "step": 713 + }, + { + "epoch": 1.93, + "grad_norm": 0.19633199274539948, + "learning_rate": 5.592722948454437e-05, + "loss": 1.5807, + "step": 714 + }, + { + "epoch": 1.93, + "grad_norm": 0.18378899991512299, + "learning_rate": 5.56689320969868e-05, + "loss": 1.5507, + "step": 715 + }, + { + "epoch": 1.93, + "grad_norm": 0.18441124260425568, + "learning_rate": 5.5411002294730996e-05, + "loss": 1.6386, + "step": 716 + }, + { + "epoch": 1.93, + "grad_norm": 0.20815691351890564, + "learning_rate": 5.515344221648484e-05, + "loss": 1.4727, + "step": 717 + }, + { + "epoch": 1.94, + "grad_norm": 0.19462059438228607, + "learning_rate": 5.489625399789048e-05, + "loss": 1.5376, + "step": 718 + }, + { + "epoch": 1.94, + "grad_norm": 0.2019672840833664, + "learning_rate": 5.463943977150674e-05, + "loss": 1.5645, + "step": 719 + }, + { + "epoch": 1.94, + "grad_norm": 0.20614993572235107, + "learning_rate": 5.438300166679134e-05, + "loss": 1.6275, + "step": 720 + }, + { + "epoch": 1.95, + "grad_norm": 0.18458101153373718, + "learning_rate": 5.412694181008329e-05, + "loss": 1.589, + "step": 721 + }, + { + "epoch": 1.95, + "grad_norm": 0.18971580266952515, + "learning_rate": 5.387126232458522e-05, + "loss": 1.5143, + "step": 722 + }, + { + "epoch": 1.95, + "grad_norm": 0.18646183609962463, + "learning_rate": 5.3615965330345716e-05, + "loss": 1.5866, + "step": 723 + }, + { + "epoch": 1.95, + "grad_norm": 0.19913147389888763, + "learning_rate": 5.336105294424194e-05, + "loss": 1.5466, + "step": 724 + }, + { + "epoch": 1.96, + "grad_norm": 0.20623794198036194, + "learning_rate": 5.310652727996188e-05, + "loss": 1.5166, + "step": 725 + }, + { + "epoch": 1.96, + "grad_norm": 0.19151991605758667, + "learning_rate": 5.285239044798695e-05, + "loss": 1.4791, + "step": 726 + }, + { + "epoch": 1.96, + "grad_norm": 0.20113767683506012, + "learning_rate": 5.2598644555574326e-05, + "loss": 1.6312, + "step": 727 + }, + { + "epoch": 1.96, + "grad_norm": 0.20970012247562408, + "learning_rate": 5.2345291706739684e-05, + "loss": 1.5929, + "step": 728 + }, + { + "epoch": 1.97, + "grad_norm": 0.21298512816429138, + "learning_rate": 5.209233400223963e-05, + "loss": 1.5484, + "step": 729 + }, + { + "epoch": 1.97, + "grad_norm": 0.2009279429912567, + "learning_rate": 5.183977353955427e-05, + "loss": 1.57, + "step": 730 + }, + { + "epoch": 1.97, + "grad_norm": 0.19530166685581207, + "learning_rate": 5.1587612412869954e-05, + "loss": 1.4022, + "step": 731 + }, + { + "epoch": 1.98, + "grad_norm": 0.19641190767288208, + "learning_rate": 5.1335852713061594e-05, + "loss": 1.6239, + "step": 732 + }, + { + "epoch": 1.98, + "grad_norm": 0.1912887543439865, + "learning_rate": 5.108449652767584e-05, + "loss": 1.5945, + "step": 733 + }, + { + "epoch": 1.98, + "grad_norm": 0.20099541544914246, + "learning_rate": 5.0833545940913183e-05, + "loss": 1.5148, + "step": 734 + } + ], + "logging_steps": 1, + "max_steps": 1101, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 367, + "total_flos": 2.190286687555289e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}