{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9644670050761421, "eval_steps": 98, "global_step": 196, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01015228426395939, "grad_norm": 0.516016960144043, "learning_rate": 2e-05, "loss": 4.3699, "step": 1 }, { "epoch": 0.01015228426395939, "eval_loss": 4.561354637145996, "eval_runtime": 58.1653, "eval_samples_per_second": 10.144, "eval_steps_per_second": 1.272, "step": 1 }, { "epoch": 0.02030456852791878, "grad_norm": 0.5144633650779724, "learning_rate": 4e-05, "loss": 4.5708, "step": 2 }, { "epoch": 0.030456852791878174, "grad_norm": 0.5246109962463379, "learning_rate": 6e-05, "loss": 4.5871, "step": 3 }, { "epoch": 0.04060913705583756, "grad_norm": 0.5753014087677002, "learning_rate": 8e-05, "loss": 4.3818, "step": 4 }, { "epoch": 0.050761421319796954, "grad_norm": 0.6615588665008545, "learning_rate": 0.0001, "loss": 4.448, "step": 5 }, { "epoch": 0.06091370558375635, "grad_norm": 0.751004159450531, "learning_rate": 0.00012, "loss": 4.4964, "step": 6 }, { "epoch": 0.07106598984771574, "grad_norm": 0.886650562286377, "learning_rate": 0.00014, "loss": 4.4011, "step": 7 }, { "epoch": 0.08121827411167512, "grad_norm": 1.1175657510757446, "learning_rate": 0.00016, "loss": 3.9849, "step": 8 }, { "epoch": 0.09137055837563451, "grad_norm": 0.9861809015274048, "learning_rate": 0.00018, "loss": 3.8218, "step": 9 }, { "epoch": 0.10152284263959391, "grad_norm": 1.1107577085494995, "learning_rate": 0.0002, "loss": 3.6451, "step": 10 }, { "epoch": 0.1116751269035533, "grad_norm": 0.9435870051383972, "learning_rate": 0.000199985736255971, "loss": 3.5619, "step": 11 }, { "epoch": 0.1218274111675127, "grad_norm": 0.9529088139533997, "learning_rate": 0.0001999429490929718, "loss": 3.4561, "step": 12 }, { "epoch": 0.1319796954314721, "grad_norm": 1.3805211782455444, "learning_rate": 0.00019987165071710527, "loss": 3.2067, "step": 13 }, { "epoch": 0.14213197969543148, "grad_norm": 1.319393515586853, "learning_rate": 0.00019977186146800707, "loss": 3.0656, "step": 14 }, { "epoch": 0.15228426395939088, "grad_norm": 1.061409592628479, "learning_rate": 0.0001996436098130433, "loss": 2.7711, "step": 15 }, { "epoch": 0.16243654822335024, "grad_norm": 1.036845326423645, "learning_rate": 0.00019948693233918952, "loss": 2.6576, "step": 16 }, { "epoch": 0.17258883248730963, "grad_norm": 1.0924557447433472, "learning_rate": 0.00019930187374259337, "loss": 2.4101, "step": 17 }, { "epoch": 0.18274111675126903, "grad_norm": 1.0557212829589844, "learning_rate": 0.00019908848681582391, "loss": 2.2991, "step": 18 }, { "epoch": 0.19289340101522842, "grad_norm": 1.1735273599624634, "learning_rate": 0.00019884683243281116, "loss": 2.3991, "step": 19 }, { "epoch": 0.20304568527918782, "grad_norm": 0.8104203343391418, "learning_rate": 0.00019857697953148037, "loss": 2.107, "step": 20 }, { "epoch": 0.2131979695431472, "grad_norm": 0.7275764346122742, "learning_rate": 0.00019827900509408581, "loss": 2.0596, "step": 21 }, { "epoch": 0.2233502538071066, "grad_norm": 1.0672590732574463, "learning_rate": 0.00019795299412524945, "loss": 1.9543, "step": 22 }, { "epoch": 0.233502538071066, "grad_norm": 0.5848283767700195, "learning_rate": 0.00019759903962771156, "loss": 1.8091, "step": 23 }, { "epoch": 0.2436548223350254, "grad_norm": 0.9580035209655762, "learning_rate": 0.00019721724257579907, "loss": 1.7178, "step": 24 }, { "epoch": 0.25380710659898476, "grad_norm": 0.5362741351127625, "learning_rate": 0.00019680771188662044, "loss": 1.6739, "step": 25 }, { "epoch": 0.2639593908629442, "grad_norm": 0.5108774304389954, "learning_rate": 0.0001963705643889941, "loss": 1.7575, "step": 26 }, { "epoch": 0.27411167512690354, "grad_norm": 0.5604164004325867, "learning_rate": 0.00019590592479012023, "loss": 1.6815, "step": 27 }, { "epoch": 0.28426395939086296, "grad_norm": 0.7223322987556458, "learning_rate": 0.00019541392564000488, "loss": 1.6213, "step": 28 }, { "epoch": 0.29441624365482233, "grad_norm": 0.5081471800804138, "learning_rate": 0.00019489470729364692, "loss": 1.5935, "step": 29 }, { "epoch": 0.30456852791878175, "grad_norm": 0.5000993013381958, "learning_rate": 0.00019434841787099803, "loss": 1.6237, "step": 30 }, { "epoch": 0.3147208121827411, "grad_norm": 0.45925211906433105, "learning_rate": 0.00019377521321470805, "loss": 1.6201, "step": 31 }, { "epoch": 0.3248730964467005, "grad_norm": 0.38572826981544495, "learning_rate": 0.00019317525684566685, "loss": 1.4805, "step": 32 }, { "epoch": 0.3350253807106599, "grad_norm": 0.28524091839790344, "learning_rate": 0.00019254871991635598, "loss": 1.5985, "step": 33 }, { "epoch": 0.34517766497461927, "grad_norm": 0.3277890980243683, "learning_rate": 0.00019189578116202307, "loss": 1.4994, "step": 34 }, { "epoch": 0.3553299492385787, "grad_norm": 0.3320370018482208, "learning_rate": 0.00019121662684969335, "loss": 1.5039, "step": 35 }, { "epoch": 0.36548223350253806, "grad_norm": 0.2798719108104706, "learning_rate": 0.00019051145072503215, "loss": 1.4997, "step": 36 }, { "epoch": 0.3756345177664975, "grad_norm": 1.7497050762176514, "learning_rate": 0.00018978045395707418, "loss": 1.5465, "step": 37 }, { "epoch": 0.38578680203045684, "grad_norm": 0.27379170060157776, "learning_rate": 0.00018902384508083517, "loss": 1.4846, "step": 38 }, { "epoch": 0.39593908629441626, "grad_norm": 0.36681699752807617, "learning_rate": 0.00018824183993782192, "loss": 1.4154, "step": 39 }, { "epoch": 0.40609137055837563, "grad_norm": 0.45136329531669617, "learning_rate": 0.00018743466161445823, "loss": 1.3928, "step": 40 }, { "epoch": 0.41624365482233505, "grad_norm": 0.27879664301872253, "learning_rate": 0.00018660254037844388, "loss": 1.5119, "step": 41 }, { "epoch": 0.4263959390862944, "grad_norm": 0.29230332374572754, "learning_rate": 0.0001857457136130651, "loss": 1.5095, "step": 42 }, { "epoch": 0.4365482233502538, "grad_norm": 0.2731008231639862, "learning_rate": 0.00018486442574947511, "loss": 1.4672, "step": 43 }, { "epoch": 0.4467005076142132, "grad_norm": 0.23685932159423828, "learning_rate": 0.00018395892819696389, "loss": 1.4173, "step": 44 }, { "epoch": 0.45685279187817257, "grad_norm": 0.2703058421611786, "learning_rate": 0.00018302947927123766, "loss": 1.4088, "step": 45 }, { "epoch": 0.467005076142132, "grad_norm": 1.65743887424469, "learning_rate": 0.00018207634412072764, "loss": 1.4672, "step": 46 }, { "epoch": 0.47715736040609136, "grad_norm": 0.21287347376346588, "learning_rate": 0.00018109979465095013, "loss": 1.3975, "step": 47 }, { "epoch": 0.4873096446700508, "grad_norm": 0.3460160791873932, "learning_rate": 0.00018010010944693848, "loss": 1.4501, "step": 48 }, { "epoch": 0.49746192893401014, "grad_norm": 0.4228818714618683, "learning_rate": 0.00017907757369376985, "loss": 1.4632, "step": 49 }, { "epoch": 0.5076142131979695, "grad_norm": 0.46471402049064636, "learning_rate": 0.0001780324790952092, "loss": 1.3696, "step": 50 }, { "epoch": 0.5177664974619289, "grad_norm": 0.35602033138275146, "learning_rate": 0.00017696512379049325, "loss": 1.4096, "step": 51 }, { "epoch": 0.5279187817258884, "grad_norm": 0.2879682779312134, "learning_rate": 0.0001758758122692791, "loss": 1.337, "step": 52 }, { "epoch": 0.5380710659898477, "grad_norm": 0.1947374939918518, "learning_rate": 0.00017476485528478093, "loss": 1.3815, "step": 53 }, { "epoch": 0.5482233502538071, "grad_norm": 0.22819018363952637, "learning_rate": 0.00017363256976511972, "loss": 1.4021, "step": 54 }, { "epoch": 0.5583756345177665, "grad_norm": 0.19164641201496124, "learning_rate": 0.000172479278722912, "loss": 1.3899, "step": 55 }, { "epoch": 0.5685279187817259, "grad_norm": 0.5477288961410522, "learning_rate": 0.00017130531116312203, "loss": 1.4089, "step": 56 }, { "epoch": 0.5786802030456852, "grad_norm": 0.6282036900520325, "learning_rate": 0.0001701110019892053, "loss": 1.3983, "step": 57 }, { "epoch": 0.5888324873096447, "grad_norm": 0.5962779521942139, "learning_rate": 0.00016889669190756868, "loss": 1.3126, "step": 58 }, { "epoch": 0.5989847715736041, "grad_norm": 0.39695534110069275, "learning_rate": 0.00016766272733037576, "loss": 1.3693, "step": 59 }, { "epoch": 0.6091370558375635, "grad_norm": 0.2737330198287964, "learning_rate": 0.00016640946027672392, "loss": 1.4286, "step": 60 }, { "epoch": 0.6192893401015228, "grad_norm": 0.34324145317077637, "learning_rate": 0.00016513724827222227, "loss": 1.3363, "step": 61 }, { "epoch": 0.6294416243654822, "grad_norm": 0.4945085942745209, "learning_rate": 0.00016384645424699835, "loss": 1.4388, "step": 62 }, { "epoch": 0.6395939086294417, "grad_norm": 0.3939533829689026, "learning_rate": 0.00016253744643216368, "loss": 1.325, "step": 63 }, { "epoch": 0.649746192893401, "grad_norm": 0.3593675196170807, "learning_rate": 0.0001612105982547663, "loss": 1.353, "step": 64 }, { "epoch": 0.6598984771573604, "grad_norm": 0.3457062244415283, "learning_rate": 0.0001598662882312615, "loss": 1.3119, "step": 65 }, { "epoch": 0.6700507614213198, "grad_norm": 0.22607868909835815, "learning_rate": 0.00015850489985953076, "loss": 1.3281, "step": 66 }, { "epoch": 0.6802030456852792, "grad_norm": 0.1937730461359024, "learning_rate": 0.00015712682150947923, "loss": 1.3061, "step": 67 }, { "epoch": 0.6903553299492385, "grad_norm": 0.19334916770458221, "learning_rate": 0.00015573244631224365, "loss": 1.2995, "step": 68 }, { "epoch": 0.700507614213198, "grad_norm": 0.43978920578956604, "learning_rate": 0.0001543221720480419, "loss": 1.3196, "step": 69 }, { "epoch": 0.7106598984771574, "grad_norm": 0.20429864525794983, "learning_rate": 0.00015289640103269625, "loss": 1.3428, "step": 70 }, { "epoch": 0.7208121827411168, "grad_norm": 0.2042793482542038, "learning_rate": 0.0001514555400028629, "loss": 1.2717, "step": 71 }, { "epoch": 0.7309644670050761, "grad_norm": 0.2089298814535141, "learning_rate": 0.00015000000000000001, "loss": 1.2823, "step": 72 }, { "epoch": 0.7411167512690355, "grad_norm": 0.29447218775749207, "learning_rate": 0.00014853019625310813, "loss": 1.2596, "step": 73 }, { "epoch": 0.751269035532995, "grad_norm": 0.20766524970531464, "learning_rate": 0.0001470465480602756, "loss": 1.3421, "step": 74 }, { "epoch": 0.7614213197969543, "grad_norm": 0.19240014255046844, "learning_rate": 0.0001455494786690634, "loss": 1.3871, "step": 75 }, { "epoch": 0.7715736040609137, "grad_norm": 0.16677537560462952, "learning_rate": 0.00014403941515576344, "loss": 1.2507, "step": 76 }, { "epoch": 0.7817258883248731, "grad_norm": 0.1933940052986145, "learning_rate": 0.00014251678830356408, "loss": 1.2792, "step": 77 }, { "epoch": 0.7918781725888325, "grad_norm": 0.19050206243991852, "learning_rate": 0.00014098203247965875, "loss": 1.3017, "step": 78 }, { "epoch": 0.8020304568527918, "grad_norm": 0.25748810172080994, "learning_rate": 0.00013943558551133186, "loss": 1.2879, "step": 79 }, { "epoch": 0.8121827411167513, "grad_norm": 0.2314893752336502, "learning_rate": 0.0001378778885610576, "loss": 1.3692, "step": 80 }, { "epoch": 0.8223350253807107, "grad_norm": 0.20771433413028717, "learning_rate": 0.00013630938600064747, "loss": 1.3268, "step": 81 }, { "epoch": 0.8324873096446701, "grad_norm": 0.18968452513217926, "learning_rate": 0.00013473052528448201, "loss": 1.2663, "step": 82 }, { "epoch": 0.8426395939086294, "grad_norm": 0.1978602409362793, "learning_rate": 0.0001331417568218636, "loss": 1.2968, "step": 83 }, { "epoch": 0.8527918781725888, "grad_norm": 0.9941853284835815, "learning_rate": 0.00013154353384852558, "loss": 1.3187, "step": 84 }, { "epoch": 0.8629441624365483, "grad_norm": 0.18706466257572174, "learning_rate": 0.00012993631229733582, "loss": 1.2808, "step": 85 }, { "epoch": 0.8730964467005076, "grad_norm": 0.18098409473896027, "learning_rate": 0.00012832055066823038, "loss": 1.2246, "step": 86 }, { "epoch": 0.883248730964467, "grad_norm": 0.22270160913467407, "learning_rate": 0.00012669670989741517, "loss": 1.3028, "step": 87 }, { "epoch": 0.8934010152284264, "grad_norm": 0.25465860962867737, "learning_rate": 0.00012506525322587207, "loss": 1.347, "step": 88 }, { "epoch": 0.9035532994923858, "grad_norm": 0.23076751828193665, "learning_rate": 0.00012342664606720822, "loss": 1.3099, "step": 89 }, { "epoch": 0.9137055837563451, "grad_norm": 0.19831228256225586, "learning_rate": 0.00012178135587488515, "loss": 1.278, "step": 90 }, { "epoch": 0.9238578680203046, "grad_norm": 0.22052858769893646, "learning_rate": 0.00012012985200886602, "loss": 1.2165, "step": 91 }, { "epoch": 0.934010152284264, "grad_norm": 0.18730390071868896, "learning_rate": 0.00011847260560171896, "loss": 1.2814, "step": 92 }, { "epoch": 0.9441624365482234, "grad_norm": 0.16983264684677124, "learning_rate": 0.00011681008942421483, "loss": 1.2235, "step": 93 }, { "epoch": 0.9543147208121827, "grad_norm": 0.17806044220924377, "learning_rate": 0.00011514277775045768, "loss": 1.1867, "step": 94 }, { "epoch": 0.9644670050761421, "grad_norm": 0.1574580818414688, "learning_rate": 0.00011347114622258612, "loss": 1.2718, "step": 95 }, { "epoch": 0.9746192893401016, "grad_norm": 0.15895454585552216, "learning_rate": 0.00011179567171508463, "loss": 1.245, "step": 96 }, { "epoch": 0.9847715736040609, "grad_norm": 0.22224721312522888, "learning_rate": 0.00011011683219874323, "loss": 1.2945, "step": 97 }, { "epoch": 0.9949238578680203, "grad_norm": 0.16613103449344635, "learning_rate": 0.00010843510660430447, "loss": 1.3054, "step": 98 }, { "epoch": 0.9949238578680203, "eval_loss": 1.249497413635254, "eval_runtime": 58.5386, "eval_samples_per_second": 10.079, "eval_steps_per_second": 1.264, "step": 98 }, { "epoch": 1.0050761421319796, "grad_norm": 0.18297390639781952, "learning_rate": 0.00010675097468583652, "loss": 1.2749, "step": 99 }, { "epoch": 1.015228426395939, "grad_norm": 0.1834397166967392, "learning_rate": 0.00010506491688387127, "loss": 1.3218, "step": 100 }, { "epoch": 1.0253807106598984, "grad_norm": 0.37363290786743164, "learning_rate": 0.00010337741418834684, "loss": 1.2591, "step": 101 }, { "epoch": 1.0101522842639594, "grad_norm": 0.14738723635673523, "learning_rate": 0.0001016889480013931, "loss": 1.2353, "step": 102 }, { "epoch": 1.0203045685279188, "grad_norm": 0.17808881402015686, "learning_rate": 0.0001, "loss": 1.2708, "step": 103 }, { "epoch": 1.0304568527918783, "grad_norm": 0.1652560830116272, "learning_rate": 9.83110519986069e-05, "loss": 1.2261, "step": 104 }, { "epoch": 1.0406091370558375, "grad_norm": 0.1601293385028839, "learning_rate": 9.662258581165319e-05, "loss": 1.2336, "step": 105 }, { "epoch": 1.0507614213197969, "grad_norm": 0.18094658851623535, "learning_rate": 9.493508311612874e-05, "loss": 1.2165, "step": 106 }, { "epoch": 1.0609137055837563, "grad_norm": 0.17732879519462585, "learning_rate": 9.324902531416349e-05, "loss": 1.2647, "step": 107 }, { "epoch": 1.0710659898477157, "grad_norm": 0.16203966736793518, "learning_rate": 9.156489339569554e-05, "loss": 1.2343, "step": 108 }, { "epoch": 1.0812182741116751, "grad_norm": 0.21242284774780273, "learning_rate": 8.98831678012568e-05, "loss": 1.2335, "step": 109 }, { "epoch": 1.0913705583756346, "grad_norm": 0.1700202375650406, "learning_rate": 8.820432828491542e-05, "loss": 1.175, "step": 110 }, { "epoch": 1.101522842639594, "grad_norm": 0.1947324275970459, "learning_rate": 8.652885377741393e-05, "loss": 1.2354, "step": 111 }, { "epoch": 1.1116751269035534, "grad_norm": 0.15965348482131958, "learning_rate": 8.485722224954237e-05, "loss": 1.1937, "step": 112 }, { "epoch": 1.1218274111675126, "grad_norm": 0.1767743080854416, "learning_rate": 8.31899105757852e-05, "loss": 1.2075, "step": 113 }, { "epoch": 1.131979695431472, "grad_norm": 0.1793358474969864, "learning_rate": 8.15273943982811e-05, "loss": 1.2963, "step": 114 }, { "epoch": 1.1421319796954315, "grad_norm": 0.17889666557312012, "learning_rate": 7.987014799113397e-05, "loss": 1.208, "step": 115 }, { "epoch": 1.1522842639593909, "grad_norm": 0.16769151389598846, "learning_rate": 7.821864412511485e-05, "loss": 1.2279, "step": 116 }, { "epoch": 1.1624365482233503, "grad_norm": 0.1788126677274704, "learning_rate": 7.65733539327918e-05, "loss": 1.2341, "step": 117 }, { "epoch": 1.1725888324873097, "grad_norm": 0.17543092370033264, "learning_rate": 7.493474677412794e-05, "loss": 1.3065, "step": 118 }, { "epoch": 1.1827411167512691, "grad_norm": 0.18606220185756683, "learning_rate": 7.330329010258483e-05, "loss": 1.2233, "step": 119 }, { "epoch": 1.1928934010152283, "grad_norm": 0.23003295063972473, "learning_rate": 7.16794493317696e-05, "loss": 1.2107, "step": 120 }, { "epoch": 1.2030456852791878, "grad_norm": 0.15619252622127533, "learning_rate": 7.006368770266421e-05, "loss": 1.1885, "step": 121 }, { "epoch": 1.2131979695431472, "grad_norm": 0.22341646254062653, "learning_rate": 6.845646615147445e-05, "loss": 1.2421, "step": 122 }, { "epoch": 1.2233502538071066, "grad_norm": 0.1528923660516739, "learning_rate": 6.685824317813643e-05, "loss": 1.207, "step": 123 }, { "epoch": 1.233502538071066, "grad_norm": 0.15776072442531586, "learning_rate": 6.526947471551798e-05, "loss": 1.278, "step": 124 }, { "epoch": 1.2436548223350254, "grad_norm": 0.1788446009159088, "learning_rate": 6.369061399935255e-05, "loss": 1.2107, "step": 125 }, { "epoch": 1.2538071065989849, "grad_norm": 0.17271803319454193, "learning_rate": 6.21221114389424e-05, "loss": 1.2333, "step": 126 }, { "epoch": 1.263959390862944, "grad_norm": 0.15987545251846313, "learning_rate": 6.0564414488668165e-05, "loss": 1.238, "step": 127 }, { "epoch": 1.2741116751269035, "grad_norm": 0.16485555469989777, "learning_rate": 5.901796752034128e-05, "loss": 1.2471, "step": 128 }, { "epoch": 1.284263959390863, "grad_norm": 0.18228358030319214, "learning_rate": 5.748321169643596e-05, "loss": 1.1761, "step": 129 }, { "epoch": 1.2944162436548223, "grad_norm": 0.1641974151134491, "learning_rate": 5.596058484423656e-05, "loss": 1.2469, "step": 130 }, { "epoch": 1.3045685279187818, "grad_norm": 0.20411786437034607, "learning_rate": 5.44505213309366e-05, "loss": 1.212, "step": 131 }, { "epoch": 1.3147208121827412, "grad_norm": 0.16920053958892822, "learning_rate": 5.2953451939724454e-05, "loss": 1.254, "step": 132 }, { "epoch": 1.3248730964467006, "grad_norm": 0.19527798891067505, "learning_rate": 5.146980374689192e-05, "loss": 1.2187, "step": 133 }, { "epoch": 1.3350253807106598, "grad_norm": 0.19046878814697266, "learning_rate": 5.000000000000002e-05, "loss": 1.2035, "step": 134 }, { "epoch": 1.3451776649746192, "grad_norm": 0.1827058643102646, "learning_rate": 4.854445999713715e-05, "loss": 1.1891, "step": 135 }, { "epoch": 1.3553299492385786, "grad_norm": 0.16475000977516174, "learning_rate": 4.710359896730379e-05, "loss": 1.2263, "step": 136 }, { "epoch": 1.365482233502538, "grad_norm": 0.15977239608764648, "learning_rate": 4.567782795195816e-05, "loss": 1.2006, "step": 137 }, { "epoch": 1.3756345177664975, "grad_norm": 0.16366241872310638, "learning_rate": 4.426755368775637e-05, "loss": 1.1572, "step": 138 }, { "epoch": 1.385786802030457, "grad_norm": 0.16748002171516418, "learning_rate": 4.287317849052075e-05, "loss": 1.1932, "step": 139 }, { "epoch": 1.3959390862944163, "grad_norm": 0.17944949865341187, "learning_rate": 4.149510014046922e-05, "loss": 1.1635, "step": 140 }, { "epoch": 1.4060913705583755, "grad_norm": 0.15999887883663177, "learning_rate": 4.013371176873849e-05, "loss": 1.1987, "step": 141 }, { "epoch": 1.4162436548223352, "grad_norm": 0.17952662706375122, "learning_rate": 3.878940174523371e-05, "loss": 1.2722, "step": 142 }, { "epoch": 1.4263959390862944, "grad_norm": 0.16714362800121307, "learning_rate": 3.746255356783632e-05, "loss": 1.2027, "step": 143 }, { "epoch": 1.4365482233502538, "grad_norm": 0.21137690544128418, "learning_rate": 3.615354575300166e-05, "loss": 1.2099, "step": 144 }, { "epoch": 1.4467005076142132, "grad_norm": 0.16340382397174835, "learning_rate": 3.4862751727777797e-05, "loss": 1.2476, "step": 145 }, { "epoch": 1.4568527918781726, "grad_norm": 0.33795541524887085, "learning_rate": 3.3590539723276083e-05, "loss": 1.1955, "step": 146 }, { "epoch": 1.467005076142132, "grad_norm": 0.1949048787355423, "learning_rate": 3.233727266962425e-05, "loss": 1.2588, "step": 147 }, { "epoch": 1.4771573604060912, "grad_norm": 0.15895332396030426, "learning_rate": 3.110330809243134e-05, "loss": 1.1895, "step": 148 }, { "epoch": 1.487309644670051, "grad_norm": 0.17805150151252747, "learning_rate": 2.9888998010794743e-05, "loss": 1.2412, "step": 149 }, { "epoch": 1.49746192893401, "grad_norm": 0.16068041324615479, "learning_rate": 2.869468883687798e-05, "loss": 1.1935, "step": 150 }, { "epoch": 1.5076142131979695, "grad_norm": 0.16954682767391205, "learning_rate": 2.7520721277088024e-05, "loss": 1.2139, "step": 151 }, { "epoch": 1.517766497461929, "grad_norm": 0.17811493575572968, "learning_rate": 2.6367430234880284e-05, "loss": 1.2791, "step": 152 }, { "epoch": 1.5279187817258884, "grad_norm": 0.1642419546842575, "learning_rate": 2.523514471521913e-05, "loss": 1.2178, "step": 153 }, { "epoch": 1.5380710659898478, "grad_norm": 0.16778188943862915, "learning_rate": 2.4124187730720917e-05, "loss": 1.2735, "step": 154 }, { "epoch": 1.548223350253807, "grad_norm": 0.17863012850284576, "learning_rate": 2.3034876209506772e-05, "loss": 1.1632, "step": 155 }, { "epoch": 1.5583756345177666, "grad_norm": 0.15400968492031097, "learning_rate": 2.1967520904790827e-05, "loss": 1.2465, "step": 156 }, { "epoch": 1.5685279187817258, "grad_norm": 0.1574324369430542, "learning_rate": 2.092242630623016e-05, "loss": 1.2346, "step": 157 }, { "epoch": 1.5786802030456852, "grad_norm": 0.15566711127758026, "learning_rate": 1.9899890553061562e-05, "loss": 1.188, "step": 158 }, { "epoch": 1.5888324873096447, "grad_norm": 0.1699032336473465, "learning_rate": 1.8900205349049904e-05, "loss": 1.2062, "step": 159 }, { "epoch": 1.598984771573604, "grad_norm": 0.20871774852275848, "learning_rate": 1.7923655879272393e-05, "loss": 1.2349, "step": 160 }, { "epoch": 1.6091370558375635, "grad_norm": 0.19627781212329865, "learning_rate": 1.6970520728762375e-05, "loss": 1.193, "step": 161 }, { "epoch": 1.6192893401015227, "grad_norm": 0.1803133487701416, "learning_rate": 1.60410718030361e-05, "loss": 1.28, "step": 162 }, { "epoch": 1.6294416243654823, "grad_norm": 0.17840127646923065, "learning_rate": 1.5135574250524897e-05, "loss": 1.285, "step": 163 }, { "epoch": 1.6395939086294415, "grad_norm": 0.1523265242576599, "learning_rate": 1.425428638693489e-05, "loss": 1.2491, "step": 164 }, { "epoch": 1.649746192893401, "grad_norm": 0.17296157777309418, "learning_rate": 1.339745962155613e-05, "loss": 1.231, "step": 165 }, { "epoch": 1.6598984771573604, "grad_norm": 0.17164817452430725, "learning_rate": 1.2565338385541792e-05, "loss": 1.2217, "step": 166 }, { "epoch": 1.6700507614213198, "grad_norm": 0.17271456122398376, "learning_rate": 1.1758160062178093e-05, "loss": 1.285, "step": 167 }, { "epoch": 1.6802030456852792, "grad_norm": 0.16020996868610382, "learning_rate": 1.097615491916485e-05, "loss": 1.1259, "step": 168 }, { "epoch": 1.6903553299492384, "grad_norm": 0.17722178995609283, "learning_rate": 1.0219546042925843e-05, "loss": 1.2601, "step": 169 }, { "epoch": 1.700507614213198, "grad_norm": 0.1641930788755417, "learning_rate": 9.488549274967872e-06, "loss": 1.1552, "step": 170 }, { "epoch": 1.7106598984771573, "grad_norm": 0.17400699853897095, "learning_rate": 8.783373150306661e-06, "loss": 1.2226, "step": 171 }, { "epoch": 1.720812182741117, "grad_norm": 0.16899757087230682, "learning_rate": 8.10421883797694e-06, "loss": 1.3269, "step": 172 }, { "epoch": 1.7309644670050761, "grad_norm": 0.20650531351566315, "learning_rate": 7.4512800836440525e-06, "loss": 1.1627, "step": 173 }, { "epoch": 1.7411167512690355, "grad_norm": 0.15405453741550446, "learning_rate": 6.824743154333157e-06, "loss": 1.1767, "step": 174 }, { "epoch": 1.751269035532995, "grad_norm": 0.16714583337306976, "learning_rate": 6.22478678529197e-06, "loss": 1.2332, "step": 175 }, { "epoch": 1.7614213197969542, "grad_norm": 0.16613709926605225, "learning_rate": 5.651582129001986e-06, "loss": 1.2306, "step": 176 }, { "epoch": 1.7715736040609138, "grad_norm": 0.22227272391319275, "learning_rate": 5.105292706353093e-06, "loss": 1.1969, "step": 177 }, { "epoch": 1.781725888324873, "grad_norm": 0.18371812999248505, "learning_rate": 4.586074359995119e-06, "loss": 1.2643, "step": 178 }, { "epoch": 1.7918781725888326, "grad_norm": 0.1571132242679596, "learning_rate": 4.094075209879788e-06, "loss": 1.208, "step": 179 }, { "epoch": 1.8020304568527918, "grad_norm": 0.16985946893692017, "learning_rate": 3.6294356110059157e-06, "loss": 1.2069, "step": 180 }, { "epoch": 1.8121827411167513, "grad_norm": 0.16651304066181183, "learning_rate": 3.1922881133795825e-06, "loss": 1.2017, "step": 181 }, { "epoch": 1.8223350253807107, "grad_norm": 0.16982710361480713, "learning_rate": 2.7827574242009437e-06, "loss": 1.1706, "step": 182 }, { "epoch": 1.83248730964467, "grad_norm": 0.16492588818073273, "learning_rate": 2.4009603722884742e-06, "loss": 1.2733, "step": 183 }, { "epoch": 1.8426395939086295, "grad_norm": 0.165365532040596, "learning_rate": 2.0470058747505516e-06, "loss": 1.1843, "step": 184 }, { "epoch": 1.8527918781725887, "grad_norm": 0.15770269930362701, "learning_rate": 1.7209949059142083e-06, "loss": 1.2168, "step": 185 }, { "epoch": 1.8629441624365484, "grad_norm": 0.16646115481853485, "learning_rate": 1.4230204685196203e-06, "loss": 1.1774, "step": 186 }, { "epoch": 1.8730964467005076, "grad_norm": 0.17410410940647125, "learning_rate": 1.1531675671888619e-06, "loss": 1.2268, "step": 187 }, { "epoch": 1.883248730964467, "grad_norm": 0.17677107453346252, "learning_rate": 9.11513184176116e-07, "loss": 1.1485, "step": 188 }, { "epoch": 1.8934010152284264, "grad_norm": 0.15033933520317078, "learning_rate": 6.981262574066394e-07, "loss": 1.2125, "step": 189 }, { "epoch": 1.9035532994923858, "grad_norm": 0.1645897626876831, "learning_rate": 5.130676608104845e-07, "loss": 1.2151, "step": 190 }, { "epoch": 1.9137055837563453, "grad_norm": 0.17260093986988068, "learning_rate": 3.56390186956701e-07, "loss": 1.2509, "step": 191 }, { "epoch": 1.9238578680203045, "grad_norm": 0.16453680396080017, "learning_rate": 2.2813853199292746e-07, "loss": 1.264, "step": 192 }, { "epoch": 1.934010152284264, "grad_norm": 0.1580802947282791, "learning_rate": 1.2834928289472416e-07, "loss": 1.193, "step": 193 }, { "epoch": 1.9441624365482233, "grad_norm": 0.18191225826740265, "learning_rate": 5.705090702819993e-08, "loss": 1.2404, "step": 194 }, { "epoch": 1.9543147208121827, "grad_norm": 0.19886773824691772, "learning_rate": 1.426374402901942e-08, "loss": 1.1602, "step": 195 }, { "epoch": 1.9644670050761421, "grad_norm": 0.19592063128948212, "learning_rate": 0.0, "loss": 1.1879, "step": 196 }, { "epoch": 1.9644670050761421, "eval_loss": 1.2216618061065674, "eval_runtime": 58.8162, "eval_samples_per_second": 10.031, "eval_steps_per_second": 1.258, "step": 196 } ], "logging_steps": 1, "max_steps": 196, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 98, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0307946378260644e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }