{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7541241162608012, "eval_steps": 80, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003142183817753339, "grad_norm": 2.126755475997925, "learning_rate": 2e-05, "loss": 1.2764, "step": 1 }, { "epoch": 0.006284367635506678, "grad_norm": 3.858302354812622, "learning_rate": 4e-05, "loss": 2.1355, "step": 2 }, { "epoch": 0.009426551453260016, "grad_norm": 4.163000583648682, "learning_rate": 6e-05, "loss": 2.4022, "step": 3 }, { "epoch": 0.012568735271013355, "grad_norm": 4.161789417266846, "learning_rate": 8e-05, "loss": 2.1123, "step": 4 }, { "epoch": 0.015710919088766692, "grad_norm": 4.415841579437256, "learning_rate": 0.0001, "loss": 1.8014, "step": 5 }, { "epoch": 0.018853102906520033, "grad_norm": 4.332009315490723, "learning_rate": 0.00012, "loss": 2.1763, "step": 6 }, { "epoch": 0.02199528672427337, "grad_norm": 4.285618782043457, "learning_rate": 0.00014, "loss": 1.9706, "step": 7 }, { "epoch": 0.02513747054202671, "grad_norm": 4.917290210723877, "learning_rate": 0.00016, "loss": 2.3984, "step": 8 }, { "epoch": 0.028279654359780047, "grad_norm": 4.707202911376953, "learning_rate": 0.00018, "loss": 2.1221, "step": 9 }, { "epoch": 0.031421838177533384, "grad_norm": 4.239159107208252, "learning_rate": 0.0002, "loss": 1.581, "step": 10 }, { "epoch": 0.03456402199528672, "grad_norm": 4.759877681732178, "learning_rate": 0.00019999483168411242, "loss": 2.3197, "step": 11 }, { "epoch": 0.037706205813040065, "grad_norm": 4.169210910797119, "learning_rate": 0.00019997932727067942, "loss": 2.0297, "step": 12 }, { "epoch": 0.0408483896307934, "grad_norm": 4.163512706756592, "learning_rate": 0.00019995348836233516, "loss": 1.847, "step": 13 }, { "epoch": 0.04399057344854674, "grad_norm": 4.143561363220215, "learning_rate": 0.00019991731762995243, "loss": 1.8566, "step": 14 }, { "epoch": 0.04713275726630008, "grad_norm": 4.060585975646973, "learning_rate": 0.00019987081881236667, "loss": 1.8111, "step": 15 }, { "epoch": 0.05027494108405342, "grad_norm": 4.021474361419678, "learning_rate": 0.00019981399671598939, "loss": 1.9765, "step": 16 }, { "epoch": 0.05341712490180676, "grad_norm": 3.236685037612915, "learning_rate": 0.0001997468572143115, "loss": 1.8132, "step": 17 }, { "epoch": 0.056559308719560095, "grad_norm": 4.249021053314209, "learning_rate": 0.00019966940724729603, "loss": 2.0374, "step": 18 }, { "epoch": 0.05970149253731343, "grad_norm": 3.468334197998047, "learning_rate": 0.00019958165482066094, "loss": 1.6331, "step": 19 }, { "epoch": 0.06284367635506677, "grad_norm": 4.119140625, "learning_rate": 0.00019948360900505139, "loss": 1.5705, "step": 20 }, { "epoch": 0.06598586017282011, "grad_norm": 4.388521194458008, "learning_rate": 0.0001993752799351023, "loss": 1.8187, "step": 21 }, { "epoch": 0.06912804399057344, "grad_norm": 4.789368152618408, "learning_rate": 0.0001992566788083908, "loss": 1.7719, "step": 22 }, { "epoch": 0.07227022780832679, "grad_norm": 4.014241695404053, "learning_rate": 0.0001991278178842786, "loss": 1.3226, "step": 23 }, { "epoch": 0.07541241162608013, "grad_norm": 3.5127627849578857, "learning_rate": 0.0001989887104826449, "loss": 1.2867, "step": 24 }, { "epoch": 0.07855459544383346, "grad_norm": 3.8713948726654053, "learning_rate": 0.00019883937098250963, "loss": 1.4878, "step": 25 }, { "epoch": 0.0816967792615868, "grad_norm": 3.8671414852142334, "learning_rate": 0.00019867981482054697, "loss": 1.8736, "step": 26 }, { "epoch": 0.08483896307934014, "grad_norm": 5.4276862144470215, "learning_rate": 0.00019851005848948988, "loss": 2.3185, "step": 27 }, { "epoch": 0.08798114689709348, "grad_norm": 5.030064105987549, "learning_rate": 0.00019833011953642525, "loss": 1.7816, "step": 28 }, { "epoch": 0.09112333071484682, "grad_norm": 4.504077434539795, "learning_rate": 0.00019814001656098001, "loss": 1.5206, "step": 29 }, { "epoch": 0.09426551453260015, "grad_norm": 4.544330596923828, "learning_rate": 0.0001979397692133988, "loss": 1.7453, "step": 30 }, { "epoch": 0.0974076983503535, "grad_norm": 5.2962846755981445, "learning_rate": 0.0001977293981925125, "loss": 1.2598, "step": 31 }, { "epoch": 0.10054988216810684, "grad_norm": 4.962874412536621, "learning_rate": 0.00019750892524359896, "loss": 1.1216, "step": 32 }, { "epoch": 0.10369206598586017, "grad_norm": 4.862067699432373, "learning_rate": 0.00019727837315613504, "loss": 1.6755, "step": 33 }, { "epoch": 0.10683424980361352, "grad_norm": 4.559788703918457, "learning_rate": 0.00019703776576144105, "loss": 1.2827, "step": 34 }, { "epoch": 0.10997643362136685, "grad_norm": 5.2561516761779785, "learning_rate": 0.0001967871279302175, "loss": 1.6958, "step": 35 }, { "epoch": 0.11311861743912019, "grad_norm": 5.258450031280518, "learning_rate": 0.000196526485569974, "loss": 1.8047, "step": 36 }, { "epoch": 0.11626080125687353, "grad_norm": 4.785277366638184, "learning_rate": 0.0001962558656223516, "loss": 1.3523, "step": 37 }, { "epoch": 0.11940298507462686, "grad_norm": 5.375392436981201, "learning_rate": 0.00019597529606033782, "loss": 1.9805, "step": 38 }, { "epoch": 0.12254516889238021, "grad_norm": 6.124971866607666, "learning_rate": 0.00019568480588537512, "loss": 1.8492, "step": 39 }, { "epoch": 0.12568735271013354, "grad_norm": 6.232330322265625, "learning_rate": 0.00019538442512436328, "loss": 1.9942, "step": 40 }, { "epoch": 0.12882953652788687, "grad_norm": 5.813709735870361, "learning_rate": 0.00019507418482655545, "loss": 1.8089, "step": 41 }, { "epoch": 0.13197172034564023, "grad_norm": 5.776797771453857, "learning_rate": 0.00019475411706034883, "loss": 1.49, "step": 42 }, { "epoch": 0.13511390416339356, "grad_norm": 6.618277549743652, "learning_rate": 0.00019442425490996988, "loss": 2.1526, "step": 43 }, { "epoch": 0.13825608798114689, "grad_norm": 5.774255275726318, "learning_rate": 0.0001940846324720544, "loss": 1.9265, "step": 44 }, { "epoch": 0.14139827179890024, "grad_norm": 7.190465927124023, "learning_rate": 0.00019373528485212325, "loss": 1.7804, "step": 45 }, { "epoch": 0.14454045561665357, "grad_norm": 7.139533996582031, "learning_rate": 0.00019337624816095358, "loss": 1.9049, "step": 46 }, { "epoch": 0.1476826394344069, "grad_norm": 6.773796558380127, "learning_rate": 0.00019300755951084594, "loss": 2.2024, "step": 47 }, { "epoch": 0.15082482325216026, "grad_norm": 7.201170444488525, "learning_rate": 0.00019262925701178866, "loss": 1.4886, "step": 48 }, { "epoch": 0.1539670070699136, "grad_norm": 9.906222343444824, "learning_rate": 0.00019224137976751795, "loss": 2.3671, "step": 49 }, { "epoch": 0.15710919088766692, "grad_norm": 8.422285079956055, "learning_rate": 0.00019184396787147632, "loss": 1.8778, "step": 50 }, { "epoch": 0.16025137470542028, "grad_norm": 2.8290162086486816, "learning_rate": 0.0001914370624026681, "loss": 1.0618, "step": 51 }, { "epoch": 0.1633935585231736, "grad_norm": 2.55100417137146, "learning_rate": 0.00019102070542141328, "loss": 1.288, "step": 52 }, { "epoch": 0.16653574234092694, "grad_norm": 3.0041720867156982, "learning_rate": 0.00019059493996499986, "loss": 1.54, "step": 53 }, { "epoch": 0.16967792615868027, "grad_norm": 2.9651379585266113, "learning_rate": 0.00019015981004323536, "loss": 1.1355, "step": 54 }, { "epoch": 0.17282010997643363, "grad_norm": 3.583242654800415, "learning_rate": 0.00018971536063389744, "loss": 1.5989, "step": 55 }, { "epoch": 0.17596229379418696, "grad_norm": 3.1135594844818115, "learning_rate": 0.00018926163767808503, "loss": 1.5147, "step": 56 }, { "epoch": 0.1791044776119403, "grad_norm": 3.134974718093872, "learning_rate": 0.00018879868807546935, "loss": 1.3794, "step": 57 }, { "epoch": 0.18224666142969365, "grad_norm": 3.3052773475646973, "learning_rate": 0.00018832655967944607, "loss": 1.1289, "step": 58 }, { "epoch": 0.18538884524744698, "grad_norm": 3.248166561126709, "learning_rate": 0.0001878453012921891, "loss": 1.7131, "step": 59 }, { "epoch": 0.1885310290652003, "grad_norm": 3.5102086067199707, "learning_rate": 0.00018735496265960574, "loss": 1.4792, "step": 60 }, { "epoch": 0.19167321288295366, "grad_norm": 3.3507046699523926, "learning_rate": 0.0001868555944661949, "loss": 1.1956, "step": 61 }, { "epoch": 0.194815396700707, "grad_norm": 3.418466091156006, "learning_rate": 0.00018634724832980793, "loss": 1.2318, "step": 62 }, { "epoch": 0.19795758051846032, "grad_norm": 3.703152894973755, "learning_rate": 0.00018582997679631315, "loss": 1.3081, "step": 63 }, { "epoch": 0.20109976433621368, "grad_norm": 3.2883918285369873, "learning_rate": 0.00018530383333416418, "loss": 1.272, "step": 64 }, { "epoch": 0.204241948153967, "grad_norm": 3.48500657081604, "learning_rate": 0.0001847688723288733, "loss": 1.3055, "step": 65 }, { "epoch": 0.20738413197172034, "grad_norm": 3.614758253097534, "learning_rate": 0.00018422514907738988, "loss": 1.4524, "step": 66 }, { "epoch": 0.21052631578947367, "grad_norm": 3.5935981273651123, "learning_rate": 0.0001836727197823842, "loss": 1.2017, "step": 67 }, { "epoch": 0.21366849960722703, "grad_norm": 3.526014566421509, "learning_rate": 0.00018311164154643836, "loss": 1.3493, "step": 68 }, { "epoch": 0.21681068342498036, "grad_norm": 3.2999374866485596, "learning_rate": 0.00018254197236614354, "loss": 0.8625, "step": 69 }, { "epoch": 0.2199528672427337, "grad_norm": 3.8912713527679443, "learning_rate": 0.00018196377112610526, "loss": 1.7266, "step": 70 }, { "epoch": 0.22309505106048705, "grad_norm": 4.023575305938721, "learning_rate": 0.00018137709759285663, "loss": 1.4603, "step": 71 }, { "epoch": 0.22623723487824038, "grad_norm": 4.701601982116699, "learning_rate": 0.0001807820124086805, "loss": 1.4397, "step": 72 }, { "epoch": 0.2293794186959937, "grad_norm": 3.814650058746338, "learning_rate": 0.00018017857708534107, "loss": 1.0826, "step": 73 }, { "epoch": 0.23252160251374707, "grad_norm": 3.872602701187134, "learning_rate": 0.00017956685399772576, "loss": 1.1974, "step": 74 }, { "epoch": 0.2356637863315004, "grad_norm": 3.8505866527557373, "learning_rate": 0.00017894690637739765, "loss": 1.4843, "step": 75 }, { "epoch": 0.23880597014925373, "grad_norm": 4.546689987182617, "learning_rate": 0.00017831879830605937, "loss": 1.6185, "step": 76 }, { "epoch": 0.24194815396700706, "grad_norm": 4.184659957885742, "learning_rate": 0.00017768259470892942, "loss": 1.3544, "step": 77 }, { "epoch": 0.24509033778476041, "grad_norm": 4.5855841636657715, "learning_rate": 0.00017703836134803105, "loss": 1.4836, "step": 78 }, { "epoch": 0.24823252160251374, "grad_norm": 3.902205228805542, "learning_rate": 0.0001763861648153945, "loss": 1.3421, "step": 79 }, { "epoch": 0.2513747054202671, "grad_norm": 4.619789123535156, "learning_rate": 0.00017572607252617378, "loss": 1.4522, "step": 80 }, { "epoch": 0.2513747054202671, "eval_loss": 1.2847973108291626, "eval_runtime": 3.1143, "eval_samples_per_second": 43.028, "eval_steps_per_second": 21.514, "step": 80 }, { "epoch": 0.2545168892380204, "grad_norm": 4.021862983703613, "learning_rate": 0.00017505815271167823, "loss": 1.3371, "step": 81 }, { "epoch": 0.25765907305577374, "grad_norm": 4.443328857421875, "learning_rate": 0.0001743824744123196, "loss": 1.0553, "step": 82 }, { "epoch": 0.2608012568735271, "grad_norm": 4.35683012008667, "learning_rate": 0.00017369910747047572, "loss": 1.3028, "step": 83 }, { "epoch": 0.26394344069128045, "grad_norm": 3.971630573272705, "learning_rate": 0.00017300812252327104, "loss": 1.1133, "step": 84 }, { "epoch": 0.2670856245090338, "grad_norm": 4.923866271972656, "learning_rate": 0.00017230959099527512, "loss": 1.604, "step": 85 }, { "epoch": 0.2702278083267871, "grad_norm": 4.6606645584106445, "learning_rate": 0.0001716035850911199, "loss": 1.1737, "step": 86 }, { "epoch": 0.27336999214454044, "grad_norm": 5.046879768371582, "learning_rate": 0.000170890177788036, "loss": 1.4454, "step": 87 }, { "epoch": 0.27651217596229377, "grad_norm": 4.8671369552612305, "learning_rate": 0.00017016944282830933, "loss": 1.5334, "step": 88 }, { "epoch": 0.27965435978004716, "grad_norm": 4.2953877449035645, "learning_rate": 0.0001694414547116588, "loss": 1.0182, "step": 89 }, { "epoch": 0.2827965435978005, "grad_norm": 4.889102458953857, "learning_rate": 0.00016870628868753546, "loss": 1.2988, "step": 90 }, { "epoch": 0.2859387274155538, "grad_norm": 5.039775371551514, "learning_rate": 0.00016796402074734402, "loss": 1.2124, "step": 91 }, { "epoch": 0.28908091123330715, "grad_norm": 4.960280418395996, "learning_rate": 0.00016721472761658837, "loss": 1.303, "step": 92 }, { "epoch": 0.2922230950510605, "grad_norm": 5.405333042144775, "learning_rate": 0.0001664584867469403, "loss": 1.5258, "step": 93 }, { "epoch": 0.2953652788688138, "grad_norm": 5.361093521118164, "learning_rate": 0.00016569537630823383, "loss": 1.4626, "step": 94 }, { "epoch": 0.29850746268656714, "grad_norm": 6.681849956512451, "learning_rate": 0.00016492547518038504, "loss": 1.9639, "step": 95 }, { "epoch": 0.3016496465043205, "grad_norm": 5.811886787414551, "learning_rate": 0.0001641488629452386, "loss": 1.4828, "step": 96 }, { "epoch": 0.30479183032207385, "grad_norm": 5.324227333068848, "learning_rate": 0.00016336561987834153, "loss": 1.3556, "step": 97 }, { "epoch": 0.3079340141398272, "grad_norm": 6.905324935913086, "learning_rate": 0.00016257582694064558, "loss": 1.3208, "step": 98 }, { "epoch": 0.3110761979575805, "grad_norm": 7.901190757751465, "learning_rate": 0.00016177956577013847, "loss": 1.7391, "step": 99 }, { "epoch": 0.31421838177533384, "grad_norm": 7.724400520324707, "learning_rate": 0.00016097691867340545, "loss": 1.588, "step": 100 }, { "epoch": 0.3173605655930872, "grad_norm": 2.0307812690734863, "learning_rate": 0.00016016796861712126, "loss": 0.9474, "step": 101 }, { "epoch": 0.32050274941084056, "grad_norm": 2.7723567485809326, "learning_rate": 0.0001593527992194745, "loss": 1.3374, "step": 102 }, { "epoch": 0.3236449332285939, "grad_norm": 3.38486385345459, "learning_rate": 0.00015853149474152423, "loss": 1.6374, "step": 103 }, { "epoch": 0.3267871170463472, "grad_norm": 3.9574501514434814, "learning_rate": 0.00015770414007848995, "loss": 1.4977, "step": 104 }, { "epoch": 0.32992930086410055, "grad_norm": 2.936070203781128, "learning_rate": 0.00015687082075097677, "loss": 1.3849, "step": 105 }, { "epoch": 0.3330714846818539, "grad_norm": 3.021728754043579, "learning_rate": 0.00015603162289613503, "loss": 1.1803, "step": 106 }, { "epoch": 0.3362136684996072, "grad_norm": 3.1808855533599854, "learning_rate": 0.00015518663325875682, "loss": 1.3232, "step": 107 }, { "epoch": 0.33935585231736054, "grad_norm": 3.1893818378448486, "learning_rate": 0.00015433593918230955, "loss": 1.2869, "step": 108 }, { "epoch": 0.3424980361351139, "grad_norm": 3.432805061340332, "learning_rate": 0.00015347962859990744, "loss": 1.3294, "step": 109 }, { "epoch": 0.34564021995286726, "grad_norm": 3.3902928829193115, "learning_rate": 0.00015261779002522218, "loss": 1.7083, "step": 110 }, { "epoch": 0.3487824037706206, "grad_norm": 3.3720815181732178, "learning_rate": 0.0001517505125433338, "loss": 1.2538, "step": 111 }, { "epoch": 0.3519245875883739, "grad_norm": 3.1808342933654785, "learning_rate": 0.00015087788580152206, "loss": 1.1317, "step": 112 }, { "epoch": 0.35506677140612725, "grad_norm": 2.9972422122955322, "learning_rate": 0.00015000000000000001, "loss": 1.193, "step": 113 }, { "epoch": 0.3582089552238806, "grad_norm": 4.0733323097229, "learning_rate": 0.00014911694588259038, "loss": 1.3054, "step": 114 }, { "epoch": 0.36135113904163396, "grad_norm": 3.7181315422058105, "learning_rate": 0.00014822881472734562, "loss": 1.6147, "step": 115 }, { "epoch": 0.3644933228593873, "grad_norm": 3.633098840713501, "learning_rate": 0.000147335698337113, "loss": 1.3906, "step": 116 }, { "epoch": 0.3676355066771406, "grad_norm": 3.5433828830718994, "learning_rate": 0.00014643768903004504, "loss": 1.158, "step": 117 }, { "epoch": 0.37077769049489395, "grad_norm": 4.175973415374756, "learning_rate": 0.0001455348796300571, "loss": 1.266, "step": 118 }, { "epoch": 0.3739198743126473, "grad_norm": 3.39015531539917, "learning_rate": 0.0001446273634572326, "loss": 1.0612, "step": 119 }, { "epoch": 0.3770620581304006, "grad_norm": 3.77596378326416, "learning_rate": 0.0001437152343181765, "loss": 1.5636, "step": 120 }, { "epoch": 0.38020424194815394, "grad_norm": 3.7370450496673584, "learning_rate": 0.0001427985864963193, "loss": 1.2615, "step": 121 }, { "epoch": 0.38334642576590733, "grad_norm": 4.049937725067139, "learning_rate": 0.00014187751474217098, "loss": 1.3253, "step": 122 }, { "epoch": 0.38648860958366066, "grad_norm": 3.5189642906188965, "learning_rate": 0.0001409521142635272, "loss": 1.0989, "step": 123 }, { "epoch": 0.389630793401414, "grad_norm": 4.486301422119141, "learning_rate": 0.0001400224807156278, "loss": 1.7106, "step": 124 }, { "epoch": 0.3927729772191673, "grad_norm": 3.7089040279388428, "learning_rate": 0.00013908871019126956, "loss": 1.2513, "step": 125 }, { "epoch": 0.39591516103692065, "grad_norm": 4.154073715209961, "learning_rate": 0.00013815089921087316, "loss": 1.1999, "step": 126 }, { "epoch": 0.399057344854674, "grad_norm": 3.7355892658233643, "learning_rate": 0.00013720914471250644, "loss": 1.1288, "step": 127 }, { "epoch": 0.40219952867242736, "grad_norm": 3.6328649520874023, "learning_rate": 0.00013626354404186404, "loss": 1.0193, "step": 128 }, { "epoch": 0.4053417124901807, "grad_norm": 3.760117769241333, "learning_rate": 0.00013531419494220548, "loss": 1.2189, "step": 129 }, { "epoch": 0.408483896307934, "grad_norm": 4.022516250610352, "learning_rate": 0.00013436119554425133, "loss": 1.2665, "step": 130 }, { "epoch": 0.41162608012568735, "grad_norm": 4.155143737792969, "learning_rate": 0.0001334046443560402, "loss": 1.3751, "step": 131 }, { "epoch": 0.4147682639434407, "grad_norm": 4.559612274169922, "learning_rate": 0.0001324446402527462, "loss": 1.4318, "step": 132 }, { "epoch": 0.417910447761194, "grad_norm": 3.62886905670166, "learning_rate": 0.0001314812824664585, "loss": 1.3198, "step": 133 }, { "epoch": 0.42105263157894735, "grad_norm": 4.358442783355713, "learning_rate": 0.00013051467057592414, "loss": 1.165, "step": 134 }, { "epoch": 0.42419481539670073, "grad_norm": 4.394584655761719, "learning_rate": 0.0001295449044962549, "loss": 1.3685, "step": 135 }, { "epoch": 0.42733699921445406, "grad_norm": 4.7039265632629395, "learning_rate": 0.0001285720844685996, "loss": 1.4534, "step": 136 }, { "epoch": 0.4304791830322074, "grad_norm": 4.168838024139404, "learning_rate": 0.00012759631104978223, "loss": 1.3412, "step": 137 }, { "epoch": 0.4336213668499607, "grad_norm": 6.1059370040893555, "learning_rate": 0.00012661768510190816, "loss": 1.4077, "step": 138 }, { "epoch": 0.43676355066771405, "grad_norm": 5.607442855834961, "learning_rate": 0.00012563630778193805, "loss": 1.1932, "step": 139 }, { "epoch": 0.4399057344854674, "grad_norm": 5.637754917144775, "learning_rate": 0.00012465228053123173, "loss": 1.6017, "step": 140 }, { "epoch": 0.4430479183032207, "grad_norm": 5.235214710235596, "learning_rate": 0.0001236657050650627, "loss": 1.9485, "step": 141 }, { "epoch": 0.4461901021209741, "grad_norm": 5.1749587059021, "learning_rate": 0.00012267668336210413, "loss": 1.1955, "step": 142 }, { "epoch": 0.4493322859387274, "grad_norm": 6.0506439208984375, "learning_rate": 0.00012168531765388755, "loss": 1.1485, "step": 143 }, { "epoch": 0.45247446975648076, "grad_norm": 5.571617603302002, "learning_rate": 0.00012069171041423583, "loss": 1.1987, "step": 144 }, { "epoch": 0.4556166535742341, "grad_norm": 4.907689571380615, "learning_rate": 0.00011969596434867063, "loss": 1.1124, "step": 145 }, { "epoch": 0.4587588373919874, "grad_norm": 6.129809856414795, "learning_rate": 0.0001186981823837961, "loss": 1.2803, "step": 146 }, { "epoch": 0.46190102120974075, "grad_norm": 5.888606548309326, "learning_rate": 0.00011769846765665992, "loss": 1.2921, "step": 147 }, { "epoch": 0.46504320502749413, "grad_norm": 6.224395275115967, "learning_rate": 0.00011669692350409223, "loss": 1.5914, "step": 148 }, { "epoch": 0.46818538884524746, "grad_norm": 6.814655780792236, "learning_rate": 0.00011569365345202414, "loss": 1.9211, "step": 149 }, { "epoch": 0.4713275726630008, "grad_norm": 8.580469131469727, "learning_rate": 0.00011468876120478662, "loss": 1.3517, "step": 150 }, { "epoch": 0.4744697564807541, "grad_norm": 2.6370444297790527, "learning_rate": 0.00011368235063439103, "loss": 0.9597, "step": 151 }, { "epoch": 0.47761194029850745, "grad_norm": 3.049766778945923, "learning_rate": 0.00011267452576979218, "loss": 1.6235, "step": 152 }, { "epoch": 0.4807541241162608, "grad_norm": 3.1222424507141113, "learning_rate": 0.00011166539078613525, "loss": 1.2576, "step": 153 }, { "epoch": 0.4838963079340141, "grad_norm": 3.147629737854004, "learning_rate": 0.00011065504999398762, "loss": 1.3355, "step": 154 }, { "epoch": 0.4870384917517675, "grad_norm": 3.2385127544403076, "learning_rate": 0.00010964360782855667, "loss": 1.0497, "step": 155 }, { "epoch": 0.49018067556952083, "grad_norm": 3.464521646499634, "learning_rate": 0.00010863116883889462, "loss": 1.6016, "step": 156 }, { "epoch": 0.49332285938727416, "grad_norm": 3.812378168106079, "learning_rate": 0.00010761783767709182, "loss": 1.6992, "step": 157 }, { "epoch": 0.4964650432050275, "grad_norm": 3.2355921268463135, "learning_rate": 0.0001066037190874591, "loss": 1.6458, "step": 158 }, { "epoch": 0.4996072270227808, "grad_norm": 2.834512710571289, "learning_rate": 0.00010558891789570082, "loss": 0.8675, "step": 159 }, { "epoch": 0.5027494108405341, "grad_norm": 3.5642106533050537, "learning_rate": 0.00010457353899807946, "loss": 1.4582, "step": 160 }, { "epoch": 0.5027494108405341, "eval_loss": 1.2131388187408447, "eval_runtime": 3.1181, "eval_samples_per_second": 42.974, "eval_steps_per_second": 21.487, "step": 160 }, { "epoch": 0.5058915946582875, "grad_norm": 3.221024513244629, "learning_rate": 0.00010355768735057274, "loss": 1.179, "step": 161 }, { "epoch": 0.5090337784760408, "grad_norm": 3.137036085128784, "learning_rate": 0.00010254146795802496, "loss": 1.0908, "step": 162 }, { "epoch": 0.5121759622937941, "grad_norm": 3.8844146728515625, "learning_rate": 0.0001015249858632926, "loss": 1.2816, "step": 163 }, { "epoch": 0.5153181461115475, "grad_norm": 3.191791296005249, "learning_rate": 0.00010050834613638695, "loss": 1.1838, "step": 164 }, { "epoch": 0.5184603299293009, "grad_norm": 3.387047290802002, "learning_rate": 9.949165386361305e-05, "loss": 1.3894, "step": 165 }, { "epoch": 0.5216025137470542, "grad_norm": 3.342327117919922, "learning_rate": 9.847501413670742e-05, "loss": 1.4602, "step": 166 }, { "epoch": 0.5247446975648076, "grad_norm": 3.2737648487091064, "learning_rate": 9.745853204197511e-05, "loss": 1.2197, "step": 167 }, { "epoch": 0.5278868813825609, "grad_norm": 3.4641106128692627, "learning_rate": 9.644231264942724e-05, "loss": 1.4113, "step": 168 }, { "epoch": 0.5310290652003142, "grad_norm": 3.5550365447998047, "learning_rate": 9.542646100192056e-05, "loss": 1.2872, "step": 169 }, { "epoch": 0.5341712490180676, "grad_norm": 3.1159486770629883, "learning_rate": 9.441108210429922e-05, "loss": 1.0507, "step": 170 }, { "epoch": 0.5373134328358209, "grad_norm": 3.65675687789917, "learning_rate": 9.339628091254092e-05, "loss": 1.1444, "step": 171 }, { "epoch": 0.5404556166535742, "grad_norm": 3.3774020671844482, "learning_rate": 9.238216232290822e-05, "loss": 1.0822, "step": 172 }, { "epoch": 0.5435978004713276, "grad_norm": 3.5839242935180664, "learning_rate": 9.136883116110542e-05, "loss": 1.1762, "step": 173 }, { "epoch": 0.5467399842890809, "grad_norm": 4.133352279663086, "learning_rate": 9.035639217144335e-05, "loss": 1.5235, "step": 174 }, { "epoch": 0.5498821681068342, "grad_norm": 3.76536226272583, "learning_rate": 8.93449500060124e-05, "loss": 1.5423, "step": 175 }, { "epoch": 0.5530243519245875, "grad_norm": 4.026939868927002, "learning_rate": 8.833460921386478e-05, "loss": 1.5957, "step": 176 }, { "epoch": 0.5561665357423409, "grad_norm": 4.009226322174072, "learning_rate": 8.732547423020785e-05, "loss": 1.4754, "step": 177 }, { "epoch": 0.5593087195600943, "grad_norm": 3.784341335296631, "learning_rate": 8.6317649365609e-05, "loss": 1.4403, "step": 178 }, { "epoch": 0.5624509033778476, "grad_norm": 3.7374749183654785, "learning_rate": 8.53112387952134e-05, "loss": 1.059, "step": 179 }, { "epoch": 0.565593087195601, "grad_norm": 3.9936928749084473, "learning_rate": 8.430634654797589e-05, "loss": 1.1447, "step": 180 }, { "epoch": 0.5687352710133543, "grad_norm": 3.8754169940948486, "learning_rate": 8.33030764959078e-05, "loss": 1.3895, "step": 181 }, { "epoch": 0.5718774548311076, "grad_norm": 4.160313129425049, "learning_rate": 8.230153234334009e-05, "loss": 1.2961, "step": 182 }, { "epoch": 0.575019638648861, "grad_norm": 4.029177665710449, "learning_rate": 8.130181761620392e-05, "loss": 1.1794, "step": 183 }, { "epoch": 0.5781618224666143, "grad_norm": 4.623962879180908, "learning_rate": 8.030403565132942e-05, "loss": 1.5772, "step": 184 }, { "epoch": 0.5813040062843676, "grad_norm": 4.731784343719482, "learning_rate": 7.930828958576418e-05, "loss": 1.1102, "step": 185 }, { "epoch": 0.584446190102121, "grad_norm": 4.70895528793335, "learning_rate": 7.831468234611248e-05, "loss": 1.47, "step": 186 }, { "epoch": 0.5875883739198743, "grad_norm": 4.335077285766602, "learning_rate": 7.732331663789592e-05, "loss": 1.102, "step": 187 }, { "epoch": 0.5907305577376276, "grad_norm": 4.903375625610352, "learning_rate": 7.63342949349373e-05, "loss": 1.654, "step": 188 }, { "epoch": 0.593872741555381, "grad_norm": 5.286921977996826, "learning_rate": 7.53477194687683e-05, "loss": 1.7055, "step": 189 }, { "epoch": 0.5970149253731343, "grad_norm": 5.063072204589844, "learning_rate": 7.436369221806201e-05, "loss": 1.2112, "step": 190 }, { "epoch": 0.6001571091908877, "grad_norm": 5.407651424407959, "learning_rate": 7.338231489809182e-05, "loss": 1.3919, "step": 191 }, { "epoch": 0.603299293008641, "grad_norm": 4.297738552093506, "learning_rate": 7.240368895021776e-05, "loss": 0.9204, "step": 192 }, { "epoch": 0.6064414768263944, "grad_norm": 5.6122307777404785, "learning_rate": 7.142791553140045e-05, "loss": 1.2202, "step": 193 }, { "epoch": 0.6095836606441477, "grad_norm": 6.088860988616943, "learning_rate": 7.045509550374509e-05, "loss": 1.5773, "step": 194 }, { "epoch": 0.612725844461901, "grad_norm": 4.216585159301758, "learning_rate": 6.948532942407588e-05, "loss": 0.8647, "step": 195 }, { "epoch": 0.6158680282796544, "grad_norm": 6.6124653816223145, "learning_rate": 6.851871753354153e-05, "loss": 1.4118, "step": 196 }, { "epoch": 0.6190102120974077, "grad_norm": 6.052501678466797, "learning_rate": 6.75553597472538e-05, "loss": 1.4777, "step": 197 }, { "epoch": 0.622152395915161, "grad_norm": 6.68459939956665, "learning_rate": 6.659535564395982e-05, "loss": 1.2237, "step": 198 }, { "epoch": 0.6252945797329144, "grad_norm": 6.1412034034729, "learning_rate": 6.563880445574873e-05, "loss": 1.2968, "step": 199 }, { "epoch": 0.6284367635506677, "grad_norm": 8.53506088256836, "learning_rate": 6.468580505779455e-05, "loss": 1.5801, "step": 200 }, { "epoch": 0.631578947368421, "grad_norm": 2.196810722351074, "learning_rate": 6.373645595813597e-05, "loss": 1.0589, "step": 201 }, { "epoch": 0.6347211311861743, "grad_norm": 2.7846407890319824, "learning_rate": 6.279085528749359e-05, "loss": 1.2213, "step": 202 }, { "epoch": 0.6378633150039277, "grad_norm": 3.4782044887542725, "learning_rate": 6.184910078912687e-05, "loss": 1.6829, "step": 203 }, { "epoch": 0.6410054988216811, "grad_norm": 3.3439154624938965, "learning_rate": 6.091128980873047e-05, "loss": 1.5249, "step": 204 }, { "epoch": 0.6441476826394344, "grad_norm": 2.737105369567871, "learning_rate": 5.9977519284372194e-05, "loss": 0.8508, "step": 205 }, { "epoch": 0.6472898664571878, "grad_norm": 2.626682996749878, "learning_rate": 5.904788573647283e-05, "loss": 0.964, "step": 206 }, { "epoch": 0.6504320502749411, "grad_norm": 2.9072468280792236, "learning_rate": 5.812248525782902e-05, "loss": 1.1909, "step": 207 }, { "epoch": 0.6535742340926944, "grad_norm": 3.0129120349884033, "learning_rate": 5.720141350368072e-05, "loss": 0.994, "step": 208 }, { "epoch": 0.6567164179104478, "grad_norm": 3.756761074066162, "learning_rate": 5.628476568182349e-05, "loss": 1.6405, "step": 209 }, { "epoch": 0.6598586017282011, "grad_norm": 3.0887200832366943, "learning_rate": 5.537263654276743e-05, "loss": 1.2731, "step": 210 }, { "epoch": 0.6630007855459544, "grad_norm": 4.020783424377441, "learning_rate": 5.446512036994287e-05, "loss": 1.2932, "step": 211 }, { "epoch": 0.6661429693637078, "grad_norm": 3.2747795581817627, "learning_rate": 5.3562310969954986e-05, "loss": 1.319, "step": 212 }, { "epoch": 0.6692851531814611, "grad_norm": 2.893523931503296, "learning_rate": 5.266430166288705e-05, "loss": 1.1003, "step": 213 }, { "epoch": 0.6724273369992144, "grad_norm": 3.4699766635894775, "learning_rate": 5.177118527265438e-05, "loss": 1.065, "step": 214 }, { "epoch": 0.6755695208169678, "grad_norm": 3.584944725036621, "learning_rate": 5.088305411740966e-05, "loss": 1.4375, "step": 215 }, { "epoch": 0.6787117046347211, "grad_norm": 3.675408363342285, "learning_rate": 5.000000000000002e-05, "loss": 1.0126, "step": 216 }, { "epoch": 0.6818538884524745, "grad_norm": 3.5655829906463623, "learning_rate": 4.912211419847794e-05, "loss": 1.06, "step": 217 }, { "epoch": 0.6849960722702279, "grad_norm": 4.225513935089111, "learning_rate": 4.824948745666621e-05, "loss": 1.6426, "step": 218 }, { "epoch": 0.6881382560879812, "grad_norm": 3.8531696796417236, "learning_rate": 4.738220997477785e-05, "loss": 1.2327, "step": 219 }, { "epoch": 0.6912804399057345, "grad_norm": 4.230501174926758, "learning_rate": 4.652037140009259e-05, "loss": 1.2916, "step": 220 }, { "epoch": 0.6944226237234878, "grad_norm": 3.733222246170044, "learning_rate": 4.566406081769048e-05, "loss": 1.0843, "step": 221 }, { "epoch": 0.6975648075412412, "grad_norm": 3.7841460704803467, "learning_rate": 4.4813366741243235e-05, "loss": 1.188, "step": 222 }, { "epoch": 0.7007069913589945, "grad_norm": 3.6160645484924316, "learning_rate": 4.3968377103865024e-05, "loss": 1.0551, "step": 223 }, { "epoch": 0.7038491751767478, "grad_norm": 3.4155659675598145, "learning_rate": 4.312917924902328e-05, "loss": 1.3099, "step": 224 }, { "epoch": 0.7069913589945012, "grad_norm": 3.728693723678589, "learning_rate": 4.2295859921510065e-05, "loss": 1.4015, "step": 225 }, { "epoch": 0.7101335428122545, "grad_norm": 3.777299642562866, "learning_rate": 4.146850525847579e-05, "loss": 1.2423, "step": 226 }, { "epoch": 0.7132757266300078, "grad_norm": 4.03432559967041, "learning_rate": 4.0647200780525485e-05, "loss": 1.0872, "step": 227 }, { "epoch": 0.7164179104477612, "grad_norm": 4.0440993309021, "learning_rate": 3.9832031382878766e-05, "loss": 1.4426, "step": 228 }, { "epoch": 0.7195600942655145, "grad_norm": 3.996366500854492, "learning_rate": 3.902308132659457e-05, "loss": 1.1019, "step": 229 }, { "epoch": 0.7227022780832679, "grad_norm": 4.013318061828613, "learning_rate": 3.822043422986154e-05, "loss": 1.2735, "step": 230 }, { "epoch": 0.7258444619010213, "grad_norm": 4.059223175048828, "learning_rate": 3.742417305935443e-05, "loss": 1.1051, "step": 231 }, { "epoch": 0.7289866457187746, "grad_norm": 3.98122501373291, "learning_rate": 3.663438012165848e-05, "loss": 1.6716, "step": 232 }, { "epoch": 0.7321288295365279, "grad_norm": 4.2704949378967285, "learning_rate": 3.585113705476143e-05, "loss": 1.4287, "step": 233 }, { "epoch": 0.7352710133542812, "grad_norm": 3.7797279357910156, "learning_rate": 3.507452481961495e-05, "loss": 1.2022, "step": 234 }, { "epoch": 0.7384131971720346, "grad_norm": 4.641343593597412, "learning_rate": 3.430462369176619e-05, "loss": 1.366, "step": 235 }, { "epoch": 0.7415553809897879, "grad_norm": 5.080630779266357, "learning_rate": 3.354151325305973e-05, "loss": 1.5167, "step": 236 }, { "epoch": 0.7446975648075412, "grad_norm": 4.189108848571777, "learning_rate": 3.2785272383411635e-05, "loss": 1.0528, "step": 237 }, { "epoch": 0.7478397486252946, "grad_norm": 4.602407932281494, "learning_rate": 3.203597925265598e-05, "loss": 1.3885, "step": 238 }, { "epoch": 0.7509819324430479, "grad_norm": 4.218674182891846, "learning_rate": 3.129371131246459e-05, "loss": 1.0774, "step": 239 }, { "epoch": 0.7541241162608012, "grad_norm": 5.452424049377441, "learning_rate": 3.05585452883412e-05, "loss": 1.3081, "step": 240 }, { "epoch": 0.7541241162608012, "eval_loss": 1.1719870567321777, "eval_runtime": 3.1089, "eval_samples_per_second": 43.102, "eval_steps_per_second": 21.551, "step": 240 } ], "logging_steps": 1, "max_steps": 319, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.029490427396096e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }