{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.99849510910459, "eval_steps": 500, "global_step": 332, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006019563581640331, "grad_norm": 41.037254333496094, "learning_rate": 1.4705882352941177e-06, "loss": 4.2766, "step": 1 }, { "epoch": 0.012039127163280662, "grad_norm": 43.19103240966797, "learning_rate": 2.9411764705882355e-06, "loss": 4.2655, "step": 2 }, { "epoch": 0.01805869074492099, "grad_norm": 41.71216583251953, "learning_rate": 4.411764705882353e-06, "loss": 4.1574, "step": 3 }, { "epoch": 0.024078254326561323, "grad_norm": 51.17884063720703, "learning_rate": 5.882352941176471e-06, "loss": 3.3329, "step": 4 }, { "epoch": 0.030097817908201655, "grad_norm": 28.706247329711914, "learning_rate": 7.3529411764705884e-06, "loss": 2.0066, "step": 5 }, { "epoch": 0.03611738148984198, "grad_norm": 19.702205657958984, "learning_rate": 8.823529411764707e-06, "loss": 1.2989, "step": 6 }, { "epoch": 0.042136945071482315, "grad_norm": 10.646201133728027, "learning_rate": 1.0294117647058824e-05, "loss": 0.4697, "step": 7 }, { "epoch": 0.04815650865312265, "grad_norm": 7.015563488006592, "learning_rate": 1.1764705882352942e-05, "loss": 0.167, "step": 8 }, { "epoch": 0.05417607223476298, "grad_norm": 2.405210494995117, "learning_rate": 1.323529411764706e-05, "loss": 0.054, "step": 9 }, { "epoch": 0.06019563581640331, "grad_norm": 2.9235267639160156, "learning_rate": 1.4705882352941177e-05, "loss": 0.1401, "step": 10 }, { "epoch": 0.06621519939804364, "grad_norm": 3.1382505893707275, "learning_rate": 1.6176470588235296e-05, "loss": 0.0757, "step": 11 }, { "epoch": 0.07223476297968397, "grad_norm": 2.7751779556274414, "learning_rate": 1.7647058823529414e-05, "loss": 0.089, "step": 12 }, { "epoch": 0.0782543265613243, "grad_norm": 1.7391453981399536, "learning_rate": 1.9117647058823528e-05, "loss": 0.11, "step": 13 }, { "epoch": 0.08427389014296463, "grad_norm": 2.010361671447754, "learning_rate": 2.058823529411765e-05, "loss": 0.1479, "step": 14 }, { "epoch": 0.09029345372460497, "grad_norm": 3.3061070442199707, "learning_rate": 2.2058823529411766e-05, "loss": 0.0665, "step": 15 }, { "epoch": 0.0963130173062453, "grad_norm": 2.8843464851379395, "learning_rate": 2.3529411764705884e-05, "loss": 0.074, "step": 16 }, { "epoch": 0.10233258088788563, "grad_norm": 0.8146764039993286, "learning_rate": 2.5e-05, "loss": 0.0315, "step": 17 }, { "epoch": 0.10835214446952596, "grad_norm": 2.45939040184021, "learning_rate": 2.647058823529412e-05, "loss": 0.0863, "step": 18 }, { "epoch": 0.1143717080511663, "grad_norm": 2.4333105087280273, "learning_rate": 2.7941176470588236e-05, "loss": 0.1191, "step": 19 }, { "epoch": 0.12039127163280662, "grad_norm": 1.5534690618515015, "learning_rate": 2.9411764705882354e-05, "loss": 0.1213, "step": 20 }, { "epoch": 0.12641083521444696, "grad_norm": 1.2200188636779785, "learning_rate": 3.0882352941176475e-05, "loss": 0.0619, "step": 21 }, { "epoch": 0.13243039879608728, "grad_norm": 1.9440735578536987, "learning_rate": 3.235294117647059e-05, "loss": 0.1123, "step": 22 }, { "epoch": 0.1384499623777276, "grad_norm": 1.5800230503082275, "learning_rate": 3.382352941176471e-05, "loss": 0.0655, "step": 23 }, { "epoch": 0.14446952595936793, "grad_norm": 0.6280108690261841, "learning_rate": 3.529411764705883e-05, "loss": 0.0309, "step": 24 }, { "epoch": 0.1504890895410083, "grad_norm": 1.1837276220321655, "learning_rate": 3.6764705882352945e-05, "loss": 0.082, "step": 25 }, { "epoch": 0.1565086531226486, "grad_norm": 3.0979809761047363, "learning_rate": 3.8235294117647055e-05, "loss": 0.0754, "step": 26 }, { "epoch": 0.16252821670428894, "grad_norm": 0.919219434261322, "learning_rate": 3.970588235294117e-05, "loss": 0.0652, "step": 27 }, { "epoch": 0.16854778028592926, "grad_norm": 1.2674806118011475, "learning_rate": 4.11764705882353e-05, "loss": 0.0524, "step": 28 }, { "epoch": 0.1745673438675696, "grad_norm": 1.4973307847976685, "learning_rate": 4.2647058823529415e-05, "loss": 0.0739, "step": 29 }, { "epoch": 0.18058690744920994, "grad_norm": 1.3600691556930542, "learning_rate": 4.411764705882353e-05, "loss": 0.0932, "step": 30 }, { "epoch": 0.18660647103085026, "grad_norm": 0.6800034046173096, "learning_rate": 4.558823529411765e-05, "loss": 0.0525, "step": 31 }, { "epoch": 0.1926260346124906, "grad_norm": 0.27061381936073303, "learning_rate": 4.705882352941177e-05, "loss": 0.0183, "step": 32 }, { "epoch": 0.1986455981941309, "grad_norm": 0.5821884870529175, "learning_rate": 4.8529411764705885e-05, "loss": 0.0342, "step": 33 }, { "epoch": 0.20466516177577126, "grad_norm": 1.5963926315307617, "learning_rate": 5e-05, "loss": 0.086, "step": 34 }, { "epoch": 0.2106847253574116, "grad_norm": 1.2303105592727661, "learning_rate": 4.983221476510067e-05, "loss": 0.1283, "step": 35 }, { "epoch": 0.21670428893905191, "grad_norm": 0.7925997376441956, "learning_rate": 4.966442953020135e-05, "loss": 0.0465, "step": 36 }, { "epoch": 0.22272385252069224, "grad_norm": 0.44674405455589294, "learning_rate": 4.9496644295302015e-05, "loss": 0.0208, "step": 37 }, { "epoch": 0.2287434161023326, "grad_norm": 1.129119873046875, "learning_rate": 4.932885906040269e-05, "loss": 0.1034, "step": 38 }, { "epoch": 0.23476297968397292, "grad_norm": 0.747196614742279, "learning_rate": 4.9161073825503354e-05, "loss": 0.1117, "step": 39 }, { "epoch": 0.24078254326561324, "grad_norm": 1.0140711069107056, "learning_rate": 4.8993288590604034e-05, "loss": 0.0713, "step": 40 }, { "epoch": 0.24680210684725357, "grad_norm": 0.9150713086128235, "learning_rate": 4.88255033557047e-05, "loss": 0.1045, "step": 41 }, { "epoch": 0.2528216704288939, "grad_norm": 0.7237759232521057, "learning_rate": 4.865771812080537e-05, "loss": 0.0399, "step": 42 }, { "epoch": 0.2588412340105342, "grad_norm": 0.4736149311065674, "learning_rate": 4.848993288590604e-05, "loss": 0.0283, "step": 43 }, { "epoch": 0.26486079759217457, "grad_norm": 0.8596872091293335, "learning_rate": 4.832214765100672e-05, "loss": 0.0516, "step": 44 }, { "epoch": 0.2708803611738149, "grad_norm": 0.8274044394493103, "learning_rate": 4.8154362416107385e-05, "loss": 0.0866, "step": 45 }, { "epoch": 0.2768999247554552, "grad_norm": 1.1380550861358643, "learning_rate": 4.798657718120805e-05, "loss": 0.0628, "step": 46 }, { "epoch": 0.28291948833709557, "grad_norm": 1.1349643468856812, "learning_rate": 4.7818791946308725e-05, "loss": 0.0997, "step": 47 }, { "epoch": 0.28893905191873587, "grad_norm": 1.2396087646484375, "learning_rate": 4.76510067114094e-05, "loss": 0.0668, "step": 48 }, { "epoch": 0.2949586155003762, "grad_norm": 0.6159345507621765, "learning_rate": 4.748322147651007e-05, "loss": 0.0454, "step": 49 }, { "epoch": 0.3009781790820166, "grad_norm": 0.9823417663574219, "learning_rate": 4.731543624161074e-05, "loss": 0.0358, "step": 50 }, { "epoch": 0.30699774266365687, "grad_norm": 1.3460859060287476, "learning_rate": 4.714765100671141e-05, "loss": 0.1146, "step": 51 }, { "epoch": 0.3130173062452972, "grad_norm": 0.8716734647750854, "learning_rate": 4.697986577181208e-05, "loss": 0.0996, "step": 52 }, { "epoch": 0.3190368698269376, "grad_norm": 0.8868650794029236, "learning_rate": 4.6812080536912756e-05, "loss": 0.0607, "step": 53 }, { "epoch": 0.32505643340857787, "grad_norm": 0.5762543678283691, "learning_rate": 4.664429530201342e-05, "loss": 0.0603, "step": 54 }, { "epoch": 0.3310759969902182, "grad_norm": 0.5473377704620361, "learning_rate": 4.6476510067114095e-05, "loss": 0.031, "step": 55 }, { "epoch": 0.3370955605718585, "grad_norm": 0.4517374634742737, "learning_rate": 4.630872483221477e-05, "loss": 0.0318, "step": 56 }, { "epoch": 0.3431151241534989, "grad_norm": 1.007686734199524, "learning_rate": 4.6140939597315434e-05, "loss": 0.0596, "step": 57 }, { "epoch": 0.3491346877351392, "grad_norm": 0.5532180666923523, "learning_rate": 4.597315436241611e-05, "loss": 0.0917, "step": 58 }, { "epoch": 0.3551542513167795, "grad_norm": 0.6608918309211731, "learning_rate": 4.580536912751678e-05, "loss": 0.0768, "step": 59 }, { "epoch": 0.3611738148984199, "grad_norm": 0.9971833229064941, "learning_rate": 4.5637583892617453e-05, "loss": 0.0614, "step": 60 }, { "epoch": 0.3671933784800602, "grad_norm": 0.4296749532222748, "learning_rate": 4.546979865771812e-05, "loss": 0.0503, "step": 61 }, { "epoch": 0.3732129420617005, "grad_norm": 0.5677506923675537, "learning_rate": 4.530201342281879e-05, "loss": 0.0644, "step": 62 }, { "epoch": 0.3792325056433409, "grad_norm": 0.6360709071159363, "learning_rate": 4.5134228187919466e-05, "loss": 0.0989, "step": 63 }, { "epoch": 0.3852520692249812, "grad_norm": 1.348215937614441, "learning_rate": 4.496644295302014e-05, "loss": 0.0967, "step": 64 }, { "epoch": 0.3912716328066215, "grad_norm": 0.6315649747848511, "learning_rate": 4.4798657718120805e-05, "loss": 0.0489, "step": 65 }, { "epoch": 0.3972911963882618, "grad_norm": 0.6150538921356201, "learning_rate": 4.463087248322148e-05, "loss": 0.0634, "step": 66 }, { "epoch": 0.4033107599699022, "grad_norm": 0.3067854344844818, "learning_rate": 4.446308724832215e-05, "loss": 0.0181, "step": 67 }, { "epoch": 0.40933032355154253, "grad_norm": 0.7488537430763245, "learning_rate": 4.4295302013422824e-05, "loss": 0.0847, "step": 68 }, { "epoch": 0.4153498871331828, "grad_norm": 0.769372284412384, "learning_rate": 4.412751677852349e-05, "loss": 0.037, "step": 69 }, { "epoch": 0.4213694507148232, "grad_norm": 0.9909029006958008, "learning_rate": 4.395973154362416e-05, "loss": 0.0417, "step": 70 }, { "epoch": 0.42738901429646353, "grad_norm": 0.7407757043838501, "learning_rate": 4.3791946308724836e-05, "loss": 0.051, "step": 71 }, { "epoch": 0.43340857787810383, "grad_norm": 1.149268388748169, "learning_rate": 4.36241610738255e-05, "loss": 0.0409, "step": 72 }, { "epoch": 0.4394281414597442, "grad_norm": 0.9848851561546326, "learning_rate": 4.3456375838926176e-05, "loss": 0.0149, "step": 73 }, { "epoch": 0.4454477050413845, "grad_norm": 1.4406760931015015, "learning_rate": 4.328859060402685e-05, "loss": 0.0762, "step": 74 }, { "epoch": 0.45146726862302483, "grad_norm": 1.003056526184082, "learning_rate": 4.312080536912752e-05, "loss": 0.0865, "step": 75 }, { "epoch": 0.4574868322046652, "grad_norm": 1.0864567756652832, "learning_rate": 4.295302013422819e-05, "loss": 0.0535, "step": 76 }, { "epoch": 0.4635063957863055, "grad_norm": 1.5504230260849, "learning_rate": 4.278523489932886e-05, "loss": 0.0679, "step": 77 }, { "epoch": 0.46952595936794583, "grad_norm": 1.389381766319275, "learning_rate": 4.2617449664429534e-05, "loss": 0.1011, "step": 78 }, { "epoch": 0.47554552294958613, "grad_norm": 0.06696069985628128, "learning_rate": 4.244966442953021e-05, "loss": 0.0017, "step": 79 }, { "epoch": 0.4815650865312265, "grad_norm": 0.8552239537239075, "learning_rate": 4.228187919463087e-05, "loss": 0.1052, "step": 80 }, { "epoch": 0.48758465011286684, "grad_norm": 1.8147671222686768, "learning_rate": 4.2114093959731546e-05, "loss": 0.0364, "step": 81 }, { "epoch": 0.49360421369450713, "grad_norm": 0.7592940330505371, "learning_rate": 4.194630872483222e-05, "loss": 0.0373, "step": 82 }, { "epoch": 0.4996237772761475, "grad_norm": 0.7351986765861511, "learning_rate": 4.1778523489932886e-05, "loss": 0.1033, "step": 83 }, { "epoch": 0.5056433408577878, "grad_norm": 0.3439026176929474, "learning_rate": 4.161073825503356e-05, "loss": 0.0166, "step": 84 }, { "epoch": 0.5116629044394282, "grad_norm": 0.41652995347976685, "learning_rate": 4.144295302013423e-05, "loss": 0.0591, "step": 85 }, { "epoch": 0.5176824680210684, "grad_norm": 0.5505680441856384, "learning_rate": 4.1275167785234905e-05, "loss": 0.0719, "step": 86 }, { "epoch": 0.5237020316027088, "grad_norm": 0.5010355114936829, "learning_rate": 4.110738255033557e-05, "loss": 0.0598, "step": 87 }, { "epoch": 0.5297215951843491, "grad_norm": 0.5710484385490417, "learning_rate": 4.0939597315436244e-05, "loss": 0.0354, "step": 88 }, { "epoch": 0.5357411587659895, "grad_norm": 0.815994381904602, "learning_rate": 4.077181208053692e-05, "loss": 0.0583, "step": 89 }, { "epoch": 0.5417607223476298, "grad_norm": 0.5417527556419373, "learning_rate": 4.060402684563759e-05, "loss": 0.0551, "step": 90 }, { "epoch": 0.5477802859292701, "grad_norm": 0.6098296046257019, "learning_rate": 4.0436241610738256e-05, "loss": 0.0675, "step": 91 }, { "epoch": 0.5537998495109104, "grad_norm": 1.0332790613174438, "learning_rate": 4.026845637583892e-05, "loss": 0.0726, "step": 92 }, { "epoch": 0.5598194130925508, "grad_norm": 0.5562874674797058, "learning_rate": 4.01006711409396e-05, "loss": 0.0493, "step": 93 }, { "epoch": 0.5658389766741911, "grad_norm": 0.8887888789176941, "learning_rate": 3.993288590604027e-05, "loss": 0.0924, "step": 94 }, { "epoch": 0.5718585402558315, "grad_norm": 0.67585688829422, "learning_rate": 3.976510067114094e-05, "loss": 0.03, "step": 95 }, { "epoch": 0.5778781038374717, "grad_norm": 3.7685415744781494, "learning_rate": 3.959731543624161e-05, "loss": 0.0489, "step": 96 }, { "epoch": 0.5838976674191121, "grad_norm": 0.8312086462974548, "learning_rate": 3.942953020134229e-05, "loss": 0.071, "step": 97 }, { "epoch": 0.5899172310007524, "grad_norm": 0.6857490539550781, "learning_rate": 3.9261744966442954e-05, "loss": 0.0166, "step": 98 }, { "epoch": 0.5959367945823928, "grad_norm": 0.6559200882911682, "learning_rate": 3.909395973154363e-05, "loss": 0.1012, "step": 99 }, { "epoch": 0.6019563581640331, "grad_norm": 1.4492859840393066, "learning_rate": 3.89261744966443e-05, "loss": 0.076, "step": 100 }, { "epoch": 0.6079759217456734, "grad_norm": 0.7843137383460999, "learning_rate": 3.875838926174497e-05, "loss": 0.0577, "step": 101 }, { "epoch": 0.6139954853273137, "grad_norm": 0.794602632522583, "learning_rate": 3.859060402684564e-05, "loss": 0.0626, "step": 102 }, { "epoch": 0.6200150489089541, "grad_norm": 1.4473546743392944, "learning_rate": 3.8422818791946305e-05, "loss": 0.0356, "step": 103 }, { "epoch": 0.6260346124905944, "grad_norm": 0.4639027416706085, "learning_rate": 3.8255033557046985e-05, "loss": 0.0259, "step": 104 }, { "epoch": 0.6320541760722348, "grad_norm": 1.1497997045516968, "learning_rate": 3.808724832214765e-05, "loss": 0.0516, "step": 105 }, { "epoch": 0.6380737396538751, "grad_norm": 0.327901691198349, "learning_rate": 3.7919463087248324e-05, "loss": 0.0405, "step": 106 }, { "epoch": 0.6440933032355154, "grad_norm": 0.4509243369102478, "learning_rate": 3.775167785234899e-05, "loss": 0.0892, "step": 107 }, { "epoch": 0.6501128668171557, "grad_norm": 0.6975520849227905, "learning_rate": 3.758389261744967e-05, "loss": 0.0978, "step": 108 }, { "epoch": 0.6561324303987961, "grad_norm": 0.6053667664527893, "learning_rate": 3.741610738255034e-05, "loss": 0.0318, "step": 109 }, { "epoch": 0.6621519939804364, "grad_norm": 0.5161236524581909, "learning_rate": 3.724832214765101e-05, "loss": 0.0561, "step": 110 }, { "epoch": 0.6681715575620768, "grad_norm": 0.4180920124053955, "learning_rate": 3.7080536912751676e-05, "loss": 0.0404, "step": 111 }, { "epoch": 0.674191121143717, "grad_norm": 0.4068116843700409, "learning_rate": 3.6912751677852356e-05, "loss": 0.0399, "step": 112 }, { "epoch": 0.6802106847253574, "grad_norm": 0.25368958711624146, "learning_rate": 3.674496644295302e-05, "loss": 0.0374, "step": 113 }, { "epoch": 0.6862302483069977, "grad_norm": 0.4473256766796112, "learning_rate": 3.6577181208053695e-05, "loss": 0.0533, "step": 114 }, { "epoch": 0.6922498118886381, "grad_norm": 0.39927905797958374, "learning_rate": 3.640939597315436e-05, "loss": 0.0367, "step": 115 }, { "epoch": 0.6982693754702785, "grad_norm": 0.5100545883178711, "learning_rate": 3.6241610738255034e-05, "loss": 0.0841, "step": 116 }, { "epoch": 0.7042889390519187, "grad_norm": 1.113686203956604, "learning_rate": 3.607382550335571e-05, "loss": 0.0798, "step": 117 }, { "epoch": 0.710308502633559, "grad_norm": 0.4927540123462677, "learning_rate": 3.5906040268456373e-05, "loss": 0.0201, "step": 118 }, { "epoch": 0.7163280662151994, "grad_norm": 0.2962929904460907, "learning_rate": 3.5738255033557046e-05, "loss": 0.0413, "step": 119 }, { "epoch": 0.7223476297968398, "grad_norm": 0.4307601749897003, "learning_rate": 3.557046979865772e-05, "loss": 0.022, "step": 120 }, { "epoch": 0.7283671933784801, "grad_norm": 0.5823848247528076, "learning_rate": 3.540268456375839e-05, "loss": 0.0357, "step": 121 }, { "epoch": 0.7343867569601203, "grad_norm": 0.3515729010105133, "learning_rate": 3.523489932885906e-05, "loss": 0.016, "step": 122 }, { "epoch": 0.7404063205417607, "grad_norm": 0.6808828115463257, "learning_rate": 3.506711409395974e-05, "loss": 0.0143, "step": 123 }, { "epoch": 0.746425884123401, "grad_norm": 0.7892507910728455, "learning_rate": 3.4899328859060405e-05, "loss": 0.0264, "step": 124 }, { "epoch": 0.7524454477050414, "grad_norm": 1.1977708339691162, "learning_rate": 3.473154362416108e-05, "loss": 0.0229, "step": 125 }, { "epoch": 0.7584650112866818, "grad_norm": 1.4794739484786987, "learning_rate": 3.4563758389261744e-05, "loss": 0.0504, "step": 126 }, { "epoch": 0.764484574868322, "grad_norm": 0.9014606475830078, "learning_rate": 3.439597315436242e-05, "loss": 0.0223, "step": 127 }, { "epoch": 0.7705041384499624, "grad_norm": 0.5018836855888367, "learning_rate": 3.422818791946309e-05, "loss": 0.0359, "step": 128 }, { "epoch": 0.7765237020316027, "grad_norm": 1.3349637985229492, "learning_rate": 3.4060402684563756e-05, "loss": 0.0572, "step": 129 }, { "epoch": 0.782543265613243, "grad_norm": 1.1911202669143677, "learning_rate": 3.389261744966443e-05, "loss": 0.0956, "step": 130 }, { "epoch": 0.7885628291948834, "grad_norm": 2.8993449211120605, "learning_rate": 3.37248322147651e-05, "loss": 0.1212, "step": 131 }, { "epoch": 0.7945823927765236, "grad_norm": 0.6151400208473206, "learning_rate": 3.3557046979865775e-05, "loss": 0.0641, "step": 132 }, { "epoch": 0.800601956358164, "grad_norm": 1.7681182622909546, "learning_rate": 3.338926174496644e-05, "loss": 0.1288, "step": 133 }, { "epoch": 0.8066215199398044, "grad_norm": 1.9313393831253052, "learning_rate": 3.3221476510067115e-05, "loss": 0.122, "step": 134 }, { "epoch": 0.8126410835214447, "grad_norm": 0.7092230916023254, "learning_rate": 3.305369127516779e-05, "loss": 0.0395, "step": 135 }, { "epoch": 0.8186606471030851, "grad_norm": 0.6039671301841736, "learning_rate": 3.288590604026846e-05, "loss": 0.0517, "step": 136 }, { "epoch": 0.8246802106847254, "grad_norm": 0.6897003054618835, "learning_rate": 3.271812080536913e-05, "loss": 0.0507, "step": 137 }, { "epoch": 0.8306997742663657, "grad_norm": 0.05981295928359032, "learning_rate": 3.25503355704698e-05, "loss": 0.0023, "step": 138 }, { "epoch": 0.836719337848006, "grad_norm": 0.48830369114875793, "learning_rate": 3.238255033557047e-05, "loss": 0.0652, "step": 139 }, { "epoch": 0.8427389014296464, "grad_norm": 1.0506463050842285, "learning_rate": 3.221476510067114e-05, "loss": 0.0709, "step": 140 }, { "epoch": 0.8487584650112867, "grad_norm": 0.3859744668006897, "learning_rate": 3.204697986577181e-05, "loss": 0.0396, "step": 141 }, { "epoch": 0.8547780285929271, "grad_norm": 0.768587052822113, "learning_rate": 3.1879194630872485e-05, "loss": 0.0599, "step": 142 }, { "epoch": 0.8607975921745673, "grad_norm": 0.6868757605552673, "learning_rate": 3.171140939597316e-05, "loss": 0.0373, "step": 143 }, { "epoch": 0.8668171557562077, "grad_norm": 0.7010259628295898, "learning_rate": 3.1543624161073825e-05, "loss": 0.0717, "step": 144 }, { "epoch": 0.872836719337848, "grad_norm": 0.5125268697738647, "learning_rate": 3.13758389261745e-05, "loss": 0.0457, "step": 145 }, { "epoch": 0.8788562829194884, "grad_norm": 0.9679777026176453, "learning_rate": 3.120805369127517e-05, "loss": 0.0988, "step": 146 }, { "epoch": 0.8848758465011287, "grad_norm": 0.7588280439376831, "learning_rate": 3.1040268456375844e-05, "loss": 0.0691, "step": 147 }, { "epoch": 0.890895410082769, "grad_norm": 0.4412769079208374, "learning_rate": 3.087248322147651e-05, "loss": 0.0243, "step": 148 }, { "epoch": 0.8969149736644093, "grad_norm": 0.5840623378753662, "learning_rate": 3.070469798657718e-05, "loss": 0.0553, "step": 149 }, { "epoch": 0.9029345372460497, "grad_norm": 0.34683701395988464, "learning_rate": 3.0536912751677856e-05, "loss": 0.0568, "step": 150 }, { "epoch": 0.90895410082769, "grad_norm": 0.6545599102973938, "learning_rate": 3.0369127516778522e-05, "loss": 0.0523, "step": 151 }, { "epoch": 0.9149736644093304, "grad_norm": 0.3024606704711914, "learning_rate": 3.02013422818792e-05, "loss": 0.04, "step": 152 }, { "epoch": 0.9209932279909706, "grad_norm": 0.40984031558036804, "learning_rate": 3.0033557046979865e-05, "loss": 0.027, "step": 153 }, { "epoch": 0.927012791572611, "grad_norm": 0.3794308602809906, "learning_rate": 2.986577181208054e-05, "loss": 0.0927, "step": 154 }, { "epoch": 0.9330323551542513, "grad_norm": 0.6882305145263672, "learning_rate": 2.9697986577181207e-05, "loss": 0.0343, "step": 155 }, { "epoch": 0.9390519187358917, "grad_norm": 0.6028600335121155, "learning_rate": 2.9530201342281884e-05, "loss": 0.0479, "step": 156 }, { "epoch": 0.945071482317532, "grad_norm": 3.9858436584472656, "learning_rate": 2.936241610738255e-05, "loss": 0.0709, "step": 157 }, { "epoch": 0.9510910458991723, "grad_norm": 0.193309485912323, "learning_rate": 2.9194630872483227e-05, "loss": 0.0098, "step": 158 }, { "epoch": 0.9571106094808126, "grad_norm": 0.6534713506698608, "learning_rate": 2.9026845637583893e-05, "loss": 0.0519, "step": 159 }, { "epoch": 0.963130173062453, "grad_norm": 0.27771925926208496, "learning_rate": 2.885906040268457e-05, "loss": 0.0238, "step": 160 }, { "epoch": 0.9691497366440933, "grad_norm": 0.7335049510002136, "learning_rate": 2.8691275167785235e-05, "loss": 0.0677, "step": 161 }, { "epoch": 0.9751693002257337, "grad_norm": 1.1832383871078491, "learning_rate": 2.8523489932885905e-05, "loss": 0.1371, "step": 162 }, { "epoch": 0.9811888638073739, "grad_norm": 0.7754644155502319, "learning_rate": 2.8355704697986578e-05, "loss": 0.0619, "step": 163 }, { "epoch": 0.9872084273890143, "grad_norm": 0.6826126575469971, "learning_rate": 2.8187919463087248e-05, "loss": 0.0682, "step": 164 }, { "epoch": 0.9932279909706546, "grad_norm": 0.9121167659759521, "learning_rate": 2.802013422818792e-05, "loss": 0.0234, "step": 165 }, { "epoch": 0.999247554552295, "grad_norm": 0.6562134623527527, "learning_rate": 2.785234899328859e-05, "loss": 0.0615, "step": 166 }, { "epoch": 1.0052671181339352, "grad_norm": 2.136812925338745, "learning_rate": 2.7684563758389263e-05, "loss": 0.0835, "step": 167 }, { "epoch": 1.0112866817155757, "grad_norm": 0.430277019739151, "learning_rate": 2.7516778523489933e-05, "loss": 0.0205, "step": 168 }, { "epoch": 1.017306245297216, "grad_norm": 0.37437501549720764, "learning_rate": 2.7348993288590606e-05, "loss": 0.0147, "step": 169 }, { "epoch": 1.0233258088788564, "grad_norm": 0.09916182607412338, "learning_rate": 2.7181208053691276e-05, "loss": 0.0166, "step": 170 }, { "epoch": 1.0293453724604966, "grad_norm": 0.22763441503047943, "learning_rate": 2.701342281879195e-05, "loss": 0.0296, "step": 171 }, { "epoch": 1.0353649360421369, "grad_norm": 0.3235589265823364, "learning_rate": 2.6845637583892618e-05, "loss": 0.0214, "step": 172 }, { "epoch": 1.0413844996237773, "grad_norm": 0.09323552995920181, "learning_rate": 2.6677852348993288e-05, "loss": 0.0029, "step": 173 }, { "epoch": 1.0474040632054176, "grad_norm": 0.18400466442108154, "learning_rate": 2.651006711409396e-05, "loss": 0.0227, "step": 174 }, { "epoch": 1.053423626787058, "grad_norm": 0.4601859450340271, "learning_rate": 2.634228187919463e-05, "loss": 0.0247, "step": 175 }, { "epoch": 1.0594431903686983, "grad_norm": 0.06925185769796371, "learning_rate": 2.6174496644295304e-05, "loss": 0.0028, "step": 176 }, { "epoch": 1.0654627539503385, "grad_norm": 0.7103378772735596, "learning_rate": 2.6006711409395973e-05, "loss": 0.0679, "step": 177 }, { "epoch": 1.071482317531979, "grad_norm": 0.2948612868785858, "learning_rate": 2.5838926174496646e-05, "loss": 0.0158, "step": 178 }, { "epoch": 1.0775018811136192, "grad_norm": 0.5460957288742065, "learning_rate": 2.5671140939597316e-05, "loss": 0.0252, "step": 179 }, { "epoch": 1.0835214446952597, "grad_norm": 0.13775992393493652, "learning_rate": 2.550335570469799e-05, "loss": 0.0125, "step": 180 }, { "epoch": 1.0895410082769, "grad_norm": 0.2737879753112793, "learning_rate": 2.533557046979866e-05, "loss": 0.0087, "step": 181 }, { "epoch": 1.0955605718585402, "grad_norm": 0.37196484208106995, "learning_rate": 2.516778523489933e-05, "loss": 0.0579, "step": 182 }, { "epoch": 1.1015801354401806, "grad_norm": 0.3493405282497406, "learning_rate": 2.5e-05, "loss": 0.0126, "step": 183 }, { "epoch": 1.1075996990218209, "grad_norm": 1.0219722986221313, "learning_rate": 2.4832214765100674e-05, "loss": 0.0701, "step": 184 }, { "epoch": 1.1136192626034613, "grad_norm": 0.32175976037979126, "learning_rate": 2.4664429530201344e-05, "loss": 0.012, "step": 185 }, { "epoch": 1.1196388261851016, "grad_norm": 0.33765479922294617, "learning_rate": 2.4496644295302017e-05, "loss": 0.0106, "step": 186 }, { "epoch": 1.1256583897667418, "grad_norm": 0.17531374096870422, "learning_rate": 2.4328859060402687e-05, "loss": 0.0161, "step": 187 }, { "epoch": 1.1316779533483823, "grad_norm": 0.1013503223657608, "learning_rate": 2.416107382550336e-05, "loss": 0.0057, "step": 188 }, { "epoch": 1.1376975169300225, "grad_norm": 0.5186209082603455, "learning_rate": 2.3993288590604026e-05, "loss": 0.0189, "step": 189 }, { "epoch": 1.143717080511663, "grad_norm": 0.577898383140564, "learning_rate": 2.38255033557047e-05, "loss": 0.0315, "step": 190 }, { "epoch": 1.1497366440933032, "grad_norm": 0.2543765604496002, "learning_rate": 2.365771812080537e-05, "loss": 0.0259, "step": 191 }, { "epoch": 1.1557562076749435, "grad_norm": 1.04751718044281, "learning_rate": 2.348993288590604e-05, "loss": 0.0267, "step": 192 }, { "epoch": 1.161775771256584, "grad_norm": 0.30151480436325073, "learning_rate": 2.332214765100671e-05, "loss": 0.016, "step": 193 }, { "epoch": 1.1677953348382242, "grad_norm": 1.1602953672409058, "learning_rate": 2.3154362416107384e-05, "loss": 0.0342, "step": 194 }, { "epoch": 1.1738148984198646, "grad_norm": 0.6510918140411377, "learning_rate": 2.2986577181208054e-05, "loss": 0.0367, "step": 195 }, { "epoch": 1.1798344620015049, "grad_norm": 0.2937709093093872, "learning_rate": 2.2818791946308727e-05, "loss": 0.0124, "step": 196 }, { "epoch": 1.1858540255831453, "grad_norm": 0.3778565526008606, "learning_rate": 2.2651006711409396e-05, "loss": 0.0353, "step": 197 }, { "epoch": 1.1918735891647856, "grad_norm": 0.34342288970947266, "learning_rate": 2.248322147651007e-05, "loss": 0.0228, "step": 198 }, { "epoch": 1.1978931527464258, "grad_norm": 0.25225672125816345, "learning_rate": 2.231543624161074e-05, "loss": 0.0037, "step": 199 }, { "epoch": 1.2039127163280663, "grad_norm": 0.3875395953655243, "learning_rate": 2.2147651006711412e-05, "loss": 0.024, "step": 200 }, { "epoch": 1.2099322799097065, "grad_norm": 0.48843473196029663, "learning_rate": 2.197986577181208e-05, "loss": 0.0411, "step": 201 }, { "epoch": 1.215951843491347, "grad_norm": 0.008358384482562542, "learning_rate": 2.181208053691275e-05, "loss": 0.0002, "step": 202 }, { "epoch": 1.2219714070729872, "grad_norm": 0.0617498978972435, "learning_rate": 2.1644295302013424e-05, "loss": 0.0016, "step": 203 }, { "epoch": 1.2279909706546275, "grad_norm": 0.5839952826499939, "learning_rate": 2.1476510067114094e-05, "loss": 0.0255, "step": 204 }, { "epoch": 1.234010534236268, "grad_norm": 0.6008470058441162, "learning_rate": 2.1308724832214767e-05, "loss": 0.0279, "step": 205 }, { "epoch": 1.2400300978179082, "grad_norm": 0.08057394623756409, "learning_rate": 2.1140939597315437e-05, "loss": 0.014, "step": 206 }, { "epoch": 1.2460496613995486, "grad_norm": 0.8297271728515625, "learning_rate": 2.097315436241611e-05, "loss": 0.0433, "step": 207 }, { "epoch": 1.2520692249811889, "grad_norm": 1.0753511190414429, "learning_rate": 2.080536912751678e-05, "loss": 0.0342, "step": 208 }, { "epoch": 1.2580887885628291, "grad_norm": 0.11652516573667526, "learning_rate": 2.0637583892617452e-05, "loss": 0.0122, "step": 209 }, { "epoch": 1.2641083521444696, "grad_norm": 0.23289084434509277, "learning_rate": 2.0469798657718122e-05, "loss": 0.0229, "step": 210 }, { "epoch": 1.2701279157261098, "grad_norm": 0.5731219053268433, "learning_rate": 2.0302013422818795e-05, "loss": 0.0455, "step": 211 }, { "epoch": 1.2761474793077503, "grad_norm": 0.8601072430610657, "learning_rate": 2.013422818791946e-05, "loss": 0.0294, "step": 212 }, { "epoch": 1.2821670428893905, "grad_norm": 0.9172778129577637, "learning_rate": 1.9966442953020134e-05, "loss": 0.0803, "step": 213 }, { "epoch": 1.2881866064710308, "grad_norm": 0.18378061056137085, "learning_rate": 1.9798657718120804e-05, "loss": 0.0151, "step": 214 }, { "epoch": 1.2942061700526712, "grad_norm": 0.2338120937347412, "learning_rate": 1.9630872483221477e-05, "loss": 0.0214, "step": 215 }, { "epoch": 1.3002257336343115, "grad_norm": 0.09691441804170609, "learning_rate": 1.946308724832215e-05, "loss": 0.0087, "step": 216 }, { "epoch": 1.306245297215952, "grad_norm": 0.1699642539024353, "learning_rate": 1.929530201342282e-05, "loss": 0.0088, "step": 217 }, { "epoch": 1.3122648607975922, "grad_norm": 0.014856619760394096, "learning_rate": 1.9127516778523493e-05, "loss": 0.0003, "step": 218 }, { "epoch": 1.3182844243792324, "grad_norm": 0.17981240153312683, "learning_rate": 1.8959731543624162e-05, "loss": 0.0148, "step": 219 }, { "epoch": 1.324303987960873, "grad_norm": 0.1564723402261734, "learning_rate": 1.8791946308724835e-05, "loss": 0.025, "step": 220 }, { "epoch": 1.3303235515425131, "grad_norm": 0.05128008872270584, "learning_rate": 1.8624161073825505e-05, "loss": 0.0013, "step": 221 }, { "epoch": 1.3363431151241536, "grad_norm": 0.018907951191067696, "learning_rate": 1.8456375838926178e-05, "loss": 0.0006, "step": 222 }, { "epoch": 1.3423626787057938, "grad_norm": 0.047860465943813324, "learning_rate": 1.8288590604026847e-05, "loss": 0.0018, "step": 223 }, { "epoch": 1.348382242287434, "grad_norm": 1.6964343786239624, "learning_rate": 1.8120805369127517e-05, "loss": 0.0834, "step": 224 }, { "epoch": 1.3544018058690745, "grad_norm": 0.09841305017471313, "learning_rate": 1.7953020134228187e-05, "loss": 0.0018, "step": 225 }, { "epoch": 1.3604213694507148, "grad_norm": 0.29318398237228394, "learning_rate": 1.778523489932886e-05, "loss": 0.0292, "step": 226 }, { "epoch": 1.3664409330323553, "grad_norm": 0.6777030229568481, "learning_rate": 1.761744966442953e-05, "loss": 0.0453, "step": 227 }, { "epoch": 1.3724604966139955, "grad_norm": 0.09742780774831772, "learning_rate": 1.7449664429530202e-05, "loss": 0.0022, "step": 228 }, { "epoch": 1.3784800601956357, "grad_norm": 0.21270929276943207, "learning_rate": 1.7281879194630872e-05, "loss": 0.0216, "step": 229 }, { "epoch": 1.3844996237772762, "grad_norm": 0.10257343202829361, "learning_rate": 1.7114093959731545e-05, "loss": 0.0032, "step": 230 }, { "epoch": 1.3905191873589164, "grad_norm": 0.2899154722690582, "learning_rate": 1.6946308724832215e-05, "loss": 0.0336, "step": 231 }, { "epoch": 1.396538750940557, "grad_norm": 0.560697615146637, "learning_rate": 1.6778523489932888e-05, "loss": 0.0167, "step": 232 }, { "epoch": 1.4025583145221971, "grad_norm": 0.15792670845985413, "learning_rate": 1.6610738255033557e-05, "loss": 0.0161, "step": 233 }, { "epoch": 1.4085778781038374, "grad_norm": 0.112309031188488, "learning_rate": 1.644295302013423e-05, "loss": 0.0081, "step": 234 }, { "epoch": 1.4145974416854779, "grad_norm": 0.6623883247375488, "learning_rate": 1.62751677852349e-05, "loss": 0.0679, "step": 235 }, { "epoch": 1.420617005267118, "grad_norm": 0.27897122502326965, "learning_rate": 1.610738255033557e-05, "loss": 0.0162, "step": 236 }, { "epoch": 1.4266365688487586, "grad_norm": 0.08262226730585098, "learning_rate": 1.5939597315436243e-05, "loss": 0.0029, "step": 237 }, { "epoch": 1.4326561324303988, "grad_norm": 0.13499091565608978, "learning_rate": 1.5771812080536912e-05, "loss": 0.0147, "step": 238 }, { "epoch": 1.438675696012039, "grad_norm": 0.2563413977622986, "learning_rate": 1.5604026845637585e-05, "loss": 0.0186, "step": 239 }, { "epoch": 1.4446952595936795, "grad_norm": 0.38309767842292786, "learning_rate": 1.5436241610738255e-05, "loss": 0.0042, "step": 240 }, { "epoch": 1.4507148231753197, "grad_norm": 0.5308915972709656, "learning_rate": 1.5268456375838928e-05, "loss": 0.023, "step": 241 }, { "epoch": 1.4567343867569602, "grad_norm": 0.5418457984924316, "learning_rate": 1.51006711409396e-05, "loss": 0.0271, "step": 242 }, { "epoch": 1.4627539503386005, "grad_norm": 0.16427500545978546, "learning_rate": 1.493288590604027e-05, "loss": 0.0167, "step": 243 }, { "epoch": 1.4687735139202407, "grad_norm": 0.1764906644821167, "learning_rate": 1.4765100671140942e-05, "loss": 0.0041, "step": 244 }, { "epoch": 1.4747930775018812, "grad_norm": 0.028177335858345032, "learning_rate": 1.4597315436241613e-05, "loss": 0.0011, "step": 245 }, { "epoch": 1.4808126410835214, "grad_norm": 0.28984132409095764, "learning_rate": 1.4429530201342285e-05, "loss": 0.0037, "step": 246 }, { "epoch": 1.4868322046651619, "grad_norm": 0.016668178141117096, "learning_rate": 1.4261744966442953e-05, "loss": 0.0007, "step": 247 }, { "epoch": 1.492851768246802, "grad_norm": 0.11294250190258026, "learning_rate": 1.4093959731543624e-05, "loss": 0.0117, "step": 248 }, { "epoch": 1.4988713318284423, "grad_norm": 0.18805146217346191, "learning_rate": 1.3926174496644295e-05, "loss": 0.0152, "step": 249 }, { "epoch": 1.5048908954100828, "grad_norm": 0.15651652216911316, "learning_rate": 1.3758389261744966e-05, "loss": 0.0195, "step": 250 }, { "epoch": 1.510910458991723, "grad_norm": 0.07596039772033691, "learning_rate": 1.3590604026845638e-05, "loss": 0.0035, "step": 251 }, { "epoch": 1.5169300225733635, "grad_norm": 0.5767983198165894, "learning_rate": 1.3422818791946309e-05, "loss": 0.0209, "step": 252 }, { "epoch": 1.5229495861550038, "grad_norm": 0.14054809510707855, "learning_rate": 1.325503355704698e-05, "loss": 0.0156, "step": 253 }, { "epoch": 1.528969149736644, "grad_norm": 0.5480087995529175, "learning_rate": 1.3087248322147652e-05, "loss": 0.0372, "step": 254 }, { "epoch": 1.5349887133182845, "grad_norm": 0.21327197551727295, "learning_rate": 1.2919463087248323e-05, "loss": 0.037, "step": 255 }, { "epoch": 1.5410082768999247, "grad_norm": 0.8947880268096924, "learning_rate": 1.2751677852348994e-05, "loss": 0.0352, "step": 256 }, { "epoch": 1.5470278404815652, "grad_norm": 0.16651660203933716, "learning_rate": 1.2583892617449666e-05, "loss": 0.0062, "step": 257 }, { "epoch": 1.5530474040632054, "grad_norm": 0.18476413190364838, "learning_rate": 1.2416107382550337e-05, "loss": 0.0177, "step": 258 }, { "epoch": 1.5590669676448456, "grad_norm": 0.013061203993856907, "learning_rate": 1.2248322147651008e-05, "loss": 0.0005, "step": 259 }, { "epoch": 1.565086531226486, "grad_norm": 0.31754836440086365, "learning_rate": 1.208053691275168e-05, "loss": 0.0181, "step": 260 }, { "epoch": 1.5711060948081266, "grad_norm": 0.26034584641456604, "learning_rate": 1.191275167785235e-05, "loss": 0.0342, "step": 261 }, { "epoch": 1.5771256583897668, "grad_norm": 0.15222539007663727, "learning_rate": 1.174496644295302e-05, "loss": 0.0081, "step": 262 }, { "epoch": 1.583145221971407, "grad_norm": 0.1855451464653015, "learning_rate": 1.1577181208053692e-05, "loss": 0.0222, "step": 263 }, { "epoch": 1.5891647855530473, "grad_norm": 0.47386428713798523, "learning_rate": 1.1409395973154363e-05, "loss": 0.0484, "step": 264 }, { "epoch": 1.5951843491346878, "grad_norm": 0.05222529545426369, "learning_rate": 1.1241610738255035e-05, "loss": 0.0018, "step": 265 }, { "epoch": 1.6012039127163282, "grad_norm": 0.36145541071891785, "learning_rate": 1.1073825503355706e-05, "loss": 0.0256, "step": 266 }, { "epoch": 1.6072234762979685, "grad_norm": 0.2200651317834854, "learning_rate": 1.0906040268456376e-05, "loss": 0.0048, "step": 267 }, { "epoch": 1.6132430398796087, "grad_norm": 0.2838999330997467, "learning_rate": 1.0738255033557047e-05, "loss": 0.0088, "step": 268 }, { "epoch": 1.619262603461249, "grad_norm": 0.5340823531150818, "learning_rate": 1.0570469798657718e-05, "loss": 0.0054, "step": 269 }, { "epoch": 1.6252821670428894, "grad_norm": 0.27307260036468506, "learning_rate": 1.040268456375839e-05, "loss": 0.014, "step": 270 }, { "epoch": 1.6313017306245299, "grad_norm": 0.694962739944458, "learning_rate": 1.0234899328859061e-05, "loss": 0.0126, "step": 271 }, { "epoch": 1.6373212942061701, "grad_norm": 0.19789136946201324, "learning_rate": 1.006711409395973e-05, "loss": 0.0172, "step": 272 }, { "epoch": 1.6433408577878104, "grad_norm": 0.5267607569694519, "learning_rate": 9.899328859060402e-06, "loss": 0.0408, "step": 273 }, { "epoch": 1.6493604213694506, "grad_norm": 0.015556755475699902, "learning_rate": 9.731543624161075e-06, "loss": 0.0006, "step": 274 }, { "epoch": 1.655379984951091, "grad_norm": 0.5071566104888916, "learning_rate": 9.563758389261746e-06, "loss": 0.0281, "step": 275 }, { "epoch": 1.6613995485327315, "grad_norm": 0.1441573202610016, "learning_rate": 9.395973154362418e-06, "loss": 0.0151, "step": 276 }, { "epoch": 1.6674191121143718, "grad_norm": 0.22274713218212128, "learning_rate": 9.228187919463089e-06, "loss": 0.0174, "step": 277 }, { "epoch": 1.673438675696012, "grad_norm": 1.108049988746643, "learning_rate": 9.060402684563759e-06, "loss": 0.0189, "step": 278 }, { "epoch": 1.6794582392776523, "grad_norm": 0.47223615646362305, "learning_rate": 8.89261744966443e-06, "loss": 0.0261, "step": 279 }, { "epoch": 1.6854778028592927, "grad_norm": 0.11383321136236191, "learning_rate": 8.724832214765101e-06, "loss": 0.0139, "step": 280 }, { "epoch": 1.6914973664409332, "grad_norm": 0.01508291345089674, "learning_rate": 8.557046979865773e-06, "loss": 0.0006, "step": 281 }, { "epoch": 1.6975169300225734, "grad_norm": 0.572291910648346, "learning_rate": 8.389261744966444e-06, "loss": 0.0587, "step": 282 }, { "epoch": 1.7035364936042137, "grad_norm": 0.8027609586715698, "learning_rate": 8.221476510067115e-06, "loss": 0.0252, "step": 283 }, { "epoch": 1.709556057185854, "grad_norm": 0.0062585920095443726, "learning_rate": 8.053691275167785e-06, "loss": 0.0002, "step": 284 }, { "epoch": 1.7155756207674944, "grad_norm": 0.18026615679264069, "learning_rate": 7.885906040268456e-06, "loss": 0.0069, "step": 285 }, { "epoch": 1.7215951843491348, "grad_norm": 0.02148846536874771, "learning_rate": 7.718120805369127e-06, "loss": 0.0006, "step": 286 }, { "epoch": 1.727614747930775, "grad_norm": 0.40165480971336365, "learning_rate": 7.5503355704698e-06, "loss": 0.0483, "step": 287 }, { "epoch": 1.7336343115124153, "grad_norm": 0.34756484627723694, "learning_rate": 7.382550335570471e-06, "loss": 0.0089, "step": 288 }, { "epoch": 1.7396538750940556, "grad_norm": 0.8310803771018982, "learning_rate": 7.214765100671142e-06, "loss": 0.0349, "step": 289 }, { "epoch": 1.745673438675696, "grad_norm": 0.21227827668190002, "learning_rate": 7.046979865771812e-06, "loss": 0.0098, "step": 290 }, { "epoch": 1.7516930022573365, "grad_norm": 0.18423959612846375, "learning_rate": 6.879194630872483e-06, "loss": 0.0196, "step": 291 }, { "epoch": 1.7577125658389767, "grad_norm": 0.13899557292461395, "learning_rate": 6.7114093959731546e-06, "loss": 0.0086, "step": 292 }, { "epoch": 1.763732129420617, "grad_norm": 0.9509983658790588, "learning_rate": 6.543624161073826e-06, "loss": 0.0227, "step": 293 }, { "epoch": 1.7697516930022572, "grad_norm": 0.2806952893733978, "learning_rate": 6.375838926174497e-06, "loss": 0.0308, "step": 294 }, { "epoch": 1.7757712565838977, "grad_norm": 0.2584255337715149, "learning_rate": 6.2080536912751686e-06, "loss": 0.0078, "step": 295 }, { "epoch": 1.7817908201655381, "grad_norm": 0.6496636867523193, "learning_rate": 6.04026845637584e-06, "loss": 0.0764, "step": 296 }, { "epoch": 1.7878103837471784, "grad_norm": 2.131640672683716, "learning_rate": 5.87248322147651e-06, "loss": 0.0485, "step": 297 }, { "epoch": 1.7938299473288186, "grad_norm": 0.25984302163124084, "learning_rate": 5.704697986577182e-06, "loss": 0.0256, "step": 298 }, { "epoch": 1.7998495109104589, "grad_norm": 0.8501216769218445, "learning_rate": 5.536912751677853e-06, "loss": 0.0559, "step": 299 }, { "epoch": 1.8058690744920993, "grad_norm": 0.3276824355125427, "learning_rate": 5.3691275167785235e-06, "loss": 0.0228, "step": 300 }, { "epoch": 1.8118886380737398, "grad_norm": 0.19804896414279938, "learning_rate": 5.201342281879195e-06, "loss": 0.0121, "step": 301 }, { "epoch": 1.81790820165538, "grad_norm": 0.6229518055915833, "learning_rate": 5.033557046979865e-06, "loss": 0.0046, "step": 302 }, { "epoch": 1.8239277652370203, "grad_norm": 0.16715337336063385, "learning_rate": 4.8657718120805375e-06, "loss": 0.0107, "step": 303 }, { "epoch": 1.8299473288186605, "grad_norm": 0.14998656511306763, "learning_rate": 4.697986577181209e-06, "loss": 0.0041, "step": 304 }, { "epoch": 1.835966892400301, "grad_norm": 0.3184771239757538, "learning_rate": 4.530201342281879e-06, "loss": 0.0107, "step": 305 }, { "epoch": 1.8419864559819414, "grad_norm": 0.665398120880127, "learning_rate": 4.362416107382551e-06, "loss": 0.0227, "step": 306 }, { "epoch": 1.8480060195635817, "grad_norm": 0.19727759063243866, "learning_rate": 4.194630872483222e-06, "loss": 0.0122, "step": 307 }, { "epoch": 1.854025583145222, "grad_norm": 0.13394923508167267, "learning_rate": 4.026845637583892e-06, "loss": 0.0053, "step": 308 }, { "epoch": 1.8600451467268622, "grad_norm": 0.17236709594726562, "learning_rate": 3.859060402684564e-06, "loss": 0.0184, "step": 309 }, { "epoch": 1.8660647103085026, "grad_norm": 1.0638315677642822, "learning_rate": 3.6912751677852355e-06, "loss": 0.0311, "step": 310 }, { "epoch": 1.872084273890143, "grad_norm": 0.06651457399129868, "learning_rate": 3.523489932885906e-06, "loss": 0.0023, "step": 311 }, { "epoch": 1.8781038374717833, "grad_norm": 0.19191402196884155, "learning_rate": 3.3557046979865773e-06, "loss": 0.0195, "step": 312 }, { "epoch": 1.8841234010534236, "grad_norm": 0.004882109817117453, "learning_rate": 3.1879194630872486e-06, "loss": 0.0002, "step": 313 }, { "epoch": 1.8901429646350638, "grad_norm": 0.2256098985671997, "learning_rate": 3.02013422818792e-06, "loss": 0.0145, "step": 314 }, { "epoch": 1.8961625282167043, "grad_norm": 0.49490997195243835, "learning_rate": 2.852348993288591e-06, "loss": 0.0378, "step": 315 }, { "epoch": 1.9021820917983447, "grad_norm": 0.2518860101699829, "learning_rate": 2.6845637583892617e-06, "loss": 0.0424, "step": 316 }, { "epoch": 1.908201655379985, "grad_norm": 0.01092343870550394, "learning_rate": 2.5167785234899326e-06, "loss": 0.0003, "step": 317 }, { "epoch": 1.9142212189616252, "grad_norm": 0.45049425959587097, "learning_rate": 2.3489932885906044e-06, "loss": 0.053, "step": 318 }, { "epoch": 1.9202407825432655, "grad_norm": 0.01676754839718342, "learning_rate": 2.1812080536912753e-06, "loss": 0.0007, "step": 319 }, { "epoch": 1.926260346124906, "grad_norm": 0.08567023277282715, "learning_rate": 2.013422818791946e-06, "loss": 0.0022, "step": 320 }, { "epoch": 1.9322799097065464, "grad_norm": 0.243726447224617, "learning_rate": 1.8456375838926177e-06, "loss": 0.022, "step": 321 }, { "epoch": 1.9382994732881866, "grad_norm": 0.20735050737857819, "learning_rate": 1.6778523489932886e-06, "loss": 0.0113, "step": 322 }, { "epoch": 1.9443190368698269, "grad_norm": 0.013643703423440456, "learning_rate": 1.51006711409396e-06, "loss": 0.0005, "step": 323 }, { "epoch": 1.9503386004514671, "grad_norm": 0.14169737696647644, "learning_rate": 1.3422818791946309e-06, "loss": 0.004, "step": 324 }, { "epoch": 1.9563581640331076, "grad_norm": 0.21097888052463531, "learning_rate": 1.1744966442953022e-06, "loss": 0.0163, "step": 325 }, { "epoch": 1.962377727614748, "grad_norm": 0.3481338620185852, "learning_rate": 1.006711409395973e-06, "loss": 0.0328, "step": 326 }, { "epoch": 1.9683972911963883, "grad_norm": 0.639370858669281, "learning_rate": 8.389261744966443e-07, "loss": 0.0385, "step": 327 }, { "epoch": 1.9744168547780285, "grad_norm": 0.01708345301449299, "learning_rate": 6.711409395973154e-07, "loss": 0.0006, "step": 328 }, { "epoch": 1.9804364183596688, "grad_norm": 1.5634732246398926, "learning_rate": 5.033557046979866e-07, "loss": 0.0966, "step": 329 }, { "epoch": 1.9864559819413092, "grad_norm": 0.03818768635392189, "learning_rate": 3.355704697986577e-07, "loss": 0.001, "step": 330 }, { "epoch": 1.9924755455229497, "grad_norm": 1.7485132217407227, "learning_rate": 1.6778523489932886e-07, "loss": 0.0881, "step": 331 }, { "epoch": 1.99849510910459, "grad_norm": 2.490537643432617, "learning_rate": 0.0, "loss": 0.0464, "step": 332 } ], "logging_steps": 1, "max_steps": 332, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.806882647064781e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }