{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.99849510910459,
"eval_steps": 500,
"global_step": 332,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006019563581640331,
"grad_norm": 41.037254333496094,
"learning_rate": 1.4705882352941177e-06,
"loss": 4.2766,
"step": 1
},
{
"epoch": 0.012039127163280662,
"grad_norm": 43.19103240966797,
"learning_rate": 2.9411764705882355e-06,
"loss": 4.2655,
"step": 2
},
{
"epoch": 0.01805869074492099,
"grad_norm": 41.71216583251953,
"learning_rate": 4.411764705882353e-06,
"loss": 4.1574,
"step": 3
},
{
"epoch": 0.024078254326561323,
"grad_norm": 51.17884063720703,
"learning_rate": 5.882352941176471e-06,
"loss": 3.3329,
"step": 4
},
{
"epoch": 0.030097817908201655,
"grad_norm": 28.706247329711914,
"learning_rate": 7.3529411764705884e-06,
"loss": 2.0066,
"step": 5
},
{
"epoch": 0.03611738148984198,
"grad_norm": 19.702205657958984,
"learning_rate": 8.823529411764707e-06,
"loss": 1.2989,
"step": 6
},
{
"epoch": 0.042136945071482315,
"grad_norm": 10.646201133728027,
"learning_rate": 1.0294117647058824e-05,
"loss": 0.4697,
"step": 7
},
{
"epoch": 0.04815650865312265,
"grad_norm": 7.015563488006592,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.167,
"step": 8
},
{
"epoch": 0.05417607223476298,
"grad_norm": 2.405210494995117,
"learning_rate": 1.323529411764706e-05,
"loss": 0.054,
"step": 9
},
{
"epoch": 0.06019563581640331,
"grad_norm": 2.9235267639160156,
"learning_rate": 1.4705882352941177e-05,
"loss": 0.1401,
"step": 10
},
{
"epoch": 0.06621519939804364,
"grad_norm": 3.1382505893707275,
"learning_rate": 1.6176470588235296e-05,
"loss": 0.0757,
"step": 11
},
{
"epoch": 0.07223476297968397,
"grad_norm": 2.7751779556274414,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.089,
"step": 12
},
{
"epoch": 0.0782543265613243,
"grad_norm": 1.7391453981399536,
"learning_rate": 1.9117647058823528e-05,
"loss": 0.11,
"step": 13
},
{
"epoch": 0.08427389014296463,
"grad_norm": 2.010361671447754,
"learning_rate": 2.058823529411765e-05,
"loss": 0.1479,
"step": 14
},
{
"epoch": 0.09029345372460497,
"grad_norm": 3.3061070442199707,
"learning_rate": 2.2058823529411766e-05,
"loss": 0.0665,
"step": 15
},
{
"epoch": 0.0963130173062453,
"grad_norm": 2.8843464851379395,
"learning_rate": 2.3529411764705884e-05,
"loss": 0.074,
"step": 16
},
{
"epoch": 0.10233258088788563,
"grad_norm": 0.8146764039993286,
"learning_rate": 2.5e-05,
"loss": 0.0315,
"step": 17
},
{
"epoch": 0.10835214446952596,
"grad_norm": 2.45939040184021,
"learning_rate": 2.647058823529412e-05,
"loss": 0.0863,
"step": 18
},
{
"epoch": 0.1143717080511663,
"grad_norm": 2.4333105087280273,
"learning_rate": 2.7941176470588236e-05,
"loss": 0.1191,
"step": 19
},
{
"epoch": 0.12039127163280662,
"grad_norm": 1.5534690618515015,
"learning_rate": 2.9411764705882354e-05,
"loss": 0.1213,
"step": 20
},
{
"epoch": 0.12641083521444696,
"grad_norm": 1.2200188636779785,
"learning_rate": 3.0882352941176475e-05,
"loss": 0.0619,
"step": 21
},
{
"epoch": 0.13243039879608728,
"grad_norm": 1.9440735578536987,
"learning_rate": 3.235294117647059e-05,
"loss": 0.1123,
"step": 22
},
{
"epoch": 0.1384499623777276,
"grad_norm": 1.5800230503082275,
"learning_rate": 3.382352941176471e-05,
"loss": 0.0655,
"step": 23
},
{
"epoch": 0.14446952595936793,
"grad_norm": 0.6280108690261841,
"learning_rate": 3.529411764705883e-05,
"loss": 0.0309,
"step": 24
},
{
"epoch": 0.1504890895410083,
"grad_norm": 1.1837276220321655,
"learning_rate": 3.6764705882352945e-05,
"loss": 0.082,
"step": 25
},
{
"epoch": 0.1565086531226486,
"grad_norm": 3.0979809761047363,
"learning_rate": 3.8235294117647055e-05,
"loss": 0.0754,
"step": 26
},
{
"epoch": 0.16252821670428894,
"grad_norm": 0.919219434261322,
"learning_rate": 3.970588235294117e-05,
"loss": 0.0652,
"step": 27
},
{
"epoch": 0.16854778028592926,
"grad_norm": 1.2674806118011475,
"learning_rate": 4.11764705882353e-05,
"loss": 0.0524,
"step": 28
},
{
"epoch": 0.1745673438675696,
"grad_norm": 1.4973307847976685,
"learning_rate": 4.2647058823529415e-05,
"loss": 0.0739,
"step": 29
},
{
"epoch": 0.18058690744920994,
"grad_norm": 1.3600691556930542,
"learning_rate": 4.411764705882353e-05,
"loss": 0.0932,
"step": 30
},
{
"epoch": 0.18660647103085026,
"grad_norm": 0.6800034046173096,
"learning_rate": 4.558823529411765e-05,
"loss": 0.0525,
"step": 31
},
{
"epoch": 0.1926260346124906,
"grad_norm": 0.27061381936073303,
"learning_rate": 4.705882352941177e-05,
"loss": 0.0183,
"step": 32
},
{
"epoch": 0.1986455981941309,
"grad_norm": 0.5821884870529175,
"learning_rate": 4.8529411764705885e-05,
"loss": 0.0342,
"step": 33
},
{
"epoch": 0.20466516177577126,
"grad_norm": 1.5963926315307617,
"learning_rate": 5e-05,
"loss": 0.086,
"step": 34
},
{
"epoch": 0.2106847253574116,
"grad_norm": 1.2303105592727661,
"learning_rate": 4.983221476510067e-05,
"loss": 0.1283,
"step": 35
},
{
"epoch": 0.21670428893905191,
"grad_norm": 0.7925997376441956,
"learning_rate": 4.966442953020135e-05,
"loss": 0.0465,
"step": 36
},
{
"epoch": 0.22272385252069224,
"grad_norm": 0.44674405455589294,
"learning_rate": 4.9496644295302015e-05,
"loss": 0.0208,
"step": 37
},
{
"epoch": 0.2287434161023326,
"grad_norm": 1.129119873046875,
"learning_rate": 4.932885906040269e-05,
"loss": 0.1034,
"step": 38
},
{
"epoch": 0.23476297968397292,
"grad_norm": 0.747196614742279,
"learning_rate": 4.9161073825503354e-05,
"loss": 0.1117,
"step": 39
},
{
"epoch": 0.24078254326561324,
"grad_norm": 1.0140711069107056,
"learning_rate": 4.8993288590604034e-05,
"loss": 0.0713,
"step": 40
},
{
"epoch": 0.24680210684725357,
"grad_norm": 0.9150713086128235,
"learning_rate": 4.88255033557047e-05,
"loss": 0.1045,
"step": 41
},
{
"epoch": 0.2528216704288939,
"grad_norm": 0.7237759232521057,
"learning_rate": 4.865771812080537e-05,
"loss": 0.0399,
"step": 42
},
{
"epoch": 0.2588412340105342,
"grad_norm": 0.4736149311065674,
"learning_rate": 4.848993288590604e-05,
"loss": 0.0283,
"step": 43
},
{
"epoch": 0.26486079759217457,
"grad_norm": 0.8596872091293335,
"learning_rate": 4.832214765100672e-05,
"loss": 0.0516,
"step": 44
},
{
"epoch": 0.2708803611738149,
"grad_norm": 0.8274044394493103,
"learning_rate": 4.8154362416107385e-05,
"loss": 0.0866,
"step": 45
},
{
"epoch": 0.2768999247554552,
"grad_norm": 1.1380550861358643,
"learning_rate": 4.798657718120805e-05,
"loss": 0.0628,
"step": 46
},
{
"epoch": 0.28291948833709557,
"grad_norm": 1.1349643468856812,
"learning_rate": 4.7818791946308725e-05,
"loss": 0.0997,
"step": 47
},
{
"epoch": 0.28893905191873587,
"grad_norm": 1.2396087646484375,
"learning_rate": 4.76510067114094e-05,
"loss": 0.0668,
"step": 48
},
{
"epoch": 0.2949586155003762,
"grad_norm": 0.6159345507621765,
"learning_rate": 4.748322147651007e-05,
"loss": 0.0454,
"step": 49
},
{
"epoch": 0.3009781790820166,
"grad_norm": 0.9823417663574219,
"learning_rate": 4.731543624161074e-05,
"loss": 0.0358,
"step": 50
},
{
"epoch": 0.30699774266365687,
"grad_norm": 1.3460859060287476,
"learning_rate": 4.714765100671141e-05,
"loss": 0.1146,
"step": 51
},
{
"epoch": 0.3130173062452972,
"grad_norm": 0.8716734647750854,
"learning_rate": 4.697986577181208e-05,
"loss": 0.0996,
"step": 52
},
{
"epoch": 0.3190368698269376,
"grad_norm": 0.8868650794029236,
"learning_rate": 4.6812080536912756e-05,
"loss": 0.0607,
"step": 53
},
{
"epoch": 0.32505643340857787,
"grad_norm": 0.5762543678283691,
"learning_rate": 4.664429530201342e-05,
"loss": 0.0603,
"step": 54
},
{
"epoch": 0.3310759969902182,
"grad_norm": 0.5473377704620361,
"learning_rate": 4.6476510067114095e-05,
"loss": 0.031,
"step": 55
},
{
"epoch": 0.3370955605718585,
"grad_norm": 0.4517374634742737,
"learning_rate": 4.630872483221477e-05,
"loss": 0.0318,
"step": 56
},
{
"epoch": 0.3431151241534989,
"grad_norm": 1.007686734199524,
"learning_rate": 4.6140939597315434e-05,
"loss": 0.0596,
"step": 57
},
{
"epoch": 0.3491346877351392,
"grad_norm": 0.5532180666923523,
"learning_rate": 4.597315436241611e-05,
"loss": 0.0917,
"step": 58
},
{
"epoch": 0.3551542513167795,
"grad_norm": 0.6608918309211731,
"learning_rate": 4.580536912751678e-05,
"loss": 0.0768,
"step": 59
},
{
"epoch": 0.3611738148984199,
"grad_norm": 0.9971833229064941,
"learning_rate": 4.5637583892617453e-05,
"loss": 0.0614,
"step": 60
},
{
"epoch": 0.3671933784800602,
"grad_norm": 0.4296749532222748,
"learning_rate": 4.546979865771812e-05,
"loss": 0.0503,
"step": 61
},
{
"epoch": 0.3732129420617005,
"grad_norm": 0.5677506923675537,
"learning_rate": 4.530201342281879e-05,
"loss": 0.0644,
"step": 62
},
{
"epoch": 0.3792325056433409,
"grad_norm": 0.6360709071159363,
"learning_rate": 4.5134228187919466e-05,
"loss": 0.0989,
"step": 63
},
{
"epoch": 0.3852520692249812,
"grad_norm": 1.348215937614441,
"learning_rate": 4.496644295302014e-05,
"loss": 0.0967,
"step": 64
},
{
"epoch": 0.3912716328066215,
"grad_norm": 0.6315649747848511,
"learning_rate": 4.4798657718120805e-05,
"loss": 0.0489,
"step": 65
},
{
"epoch": 0.3972911963882618,
"grad_norm": 0.6150538921356201,
"learning_rate": 4.463087248322148e-05,
"loss": 0.0634,
"step": 66
},
{
"epoch": 0.4033107599699022,
"grad_norm": 0.3067854344844818,
"learning_rate": 4.446308724832215e-05,
"loss": 0.0181,
"step": 67
},
{
"epoch": 0.40933032355154253,
"grad_norm": 0.7488537430763245,
"learning_rate": 4.4295302013422824e-05,
"loss": 0.0847,
"step": 68
},
{
"epoch": 0.4153498871331828,
"grad_norm": 0.769372284412384,
"learning_rate": 4.412751677852349e-05,
"loss": 0.037,
"step": 69
},
{
"epoch": 0.4213694507148232,
"grad_norm": 0.9909029006958008,
"learning_rate": 4.395973154362416e-05,
"loss": 0.0417,
"step": 70
},
{
"epoch": 0.42738901429646353,
"grad_norm": 0.7407757043838501,
"learning_rate": 4.3791946308724836e-05,
"loss": 0.051,
"step": 71
},
{
"epoch": 0.43340857787810383,
"grad_norm": 1.149268388748169,
"learning_rate": 4.36241610738255e-05,
"loss": 0.0409,
"step": 72
},
{
"epoch": 0.4394281414597442,
"grad_norm": 0.9848851561546326,
"learning_rate": 4.3456375838926176e-05,
"loss": 0.0149,
"step": 73
},
{
"epoch": 0.4454477050413845,
"grad_norm": 1.4406760931015015,
"learning_rate": 4.328859060402685e-05,
"loss": 0.0762,
"step": 74
},
{
"epoch": 0.45146726862302483,
"grad_norm": 1.003056526184082,
"learning_rate": 4.312080536912752e-05,
"loss": 0.0865,
"step": 75
},
{
"epoch": 0.4574868322046652,
"grad_norm": 1.0864567756652832,
"learning_rate": 4.295302013422819e-05,
"loss": 0.0535,
"step": 76
},
{
"epoch": 0.4635063957863055,
"grad_norm": 1.5504230260849,
"learning_rate": 4.278523489932886e-05,
"loss": 0.0679,
"step": 77
},
{
"epoch": 0.46952595936794583,
"grad_norm": 1.389381766319275,
"learning_rate": 4.2617449664429534e-05,
"loss": 0.1011,
"step": 78
},
{
"epoch": 0.47554552294958613,
"grad_norm": 0.06696069985628128,
"learning_rate": 4.244966442953021e-05,
"loss": 0.0017,
"step": 79
},
{
"epoch": 0.4815650865312265,
"grad_norm": 0.8552239537239075,
"learning_rate": 4.228187919463087e-05,
"loss": 0.1052,
"step": 80
},
{
"epoch": 0.48758465011286684,
"grad_norm": 1.8147671222686768,
"learning_rate": 4.2114093959731546e-05,
"loss": 0.0364,
"step": 81
},
{
"epoch": 0.49360421369450713,
"grad_norm": 0.7592940330505371,
"learning_rate": 4.194630872483222e-05,
"loss": 0.0373,
"step": 82
},
{
"epoch": 0.4996237772761475,
"grad_norm": 0.7351986765861511,
"learning_rate": 4.1778523489932886e-05,
"loss": 0.1033,
"step": 83
},
{
"epoch": 0.5056433408577878,
"grad_norm": 0.3439026176929474,
"learning_rate": 4.161073825503356e-05,
"loss": 0.0166,
"step": 84
},
{
"epoch": 0.5116629044394282,
"grad_norm": 0.41652995347976685,
"learning_rate": 4.144295302013423e-05,
"loss": 0.0591,
"step": 85
},
{
"epoch": 0.5176824680210684,
"grad_norm": 0.5505680441856384,
"learning_rate": 4.1275167785234905e-05,
"loss": 0.0719,
"step": 86
},
{
"epoch": 0.5237020316027088,
"grad_norm": 0.5010355114936829,
"learning_rate": 4.110738255033557e-05,
"loss": 0.0598,
"step": 87
},
{
"epoch": 0.5297215951843491,
"grad_norm": 0.5710484385490417,
"learning_rate": 4.0939597315436244e-05,
"loss": 0.0354,
"step": 88
},
{
"epoch": 0.5357411587659895,
"grad_norm": 0.815994381904602,
"learning_rate": 4.077181208053692e-05,
"loss": 0.0583,
"step": 89
},
{
"epoch": 0.5417607223476298,
"grad_norm": 0.5417527556419373,
"learning_rate": 4.060402684563759e-05,
"loss": 0.0551,
"step": 90
},
{
"epoch": 0.5477802859292701,
"grad_norm": 0.6098296046257019,
"learning_rate": 4.0436241610738256e-05,
"loss": 0.0675,
"step": 91
},
{
"epoch": 0.5537998495109104,
"grad_norm": 1.0332790613174438,
"learning_rate": 4.026845637583892e-05,
"loss": 0.0726,
"step": 92
},
{
"epoch": 0.5598194130925508,
"grad_norm": 0.5562874674797058,
"learning_rate": 4.01006711409396e-05,
"loss": 0.0493,
"step": 93
},
{
"epoch": 0.5658389766741911,
"grad_norm": 0.8887888789176941,
"learning_rate": 3.993288590604027e-05,
"loss": 0.0924,
"step": 94
},
{
"epoch": 0.5718585402558315,
"grad_norm": 0.67585688829422,
"learning_rate": 3.976510067114094e-05,
"loss": 0.03,
"step": 95
},
{
"epoch": 0.5778781038374717,
"grad_norm": 3.7685415744781494,
"learning_rate": 3.959731543624161e-05,
"loss": 0.0489,
"step": 96
},
{
"epoch": 0.5838976674191121,
"grad_norm": 0.8312086462974548,
"learning_rate": 3.942953020134229e-05,
"loss": 0.071,
"step": 97
},
{
"epoch": 0.5899172310007524,
"grad_norm": 0.6857490539550781,
"learning_rate": 3.9261744966442954e-05,
"loss": 0.0166,
"step": 98
},
{
"epoch": 0.5959367945823928,
"grad_norm": 0.6559200882911682,
"learning_rate": 3.909395973154363e-05,
"loss": 0.1012,
"step": 99
},
{
"epoch": 0.6019563581640331,
"grad_norm": 1.4492859840393066,
"learning_rate": 3.89261744966443e-05,
"loss": 0.076,
"step": 100
},
{
"epoch": 0.6079759217456734,
"grad_norm": 0.7843137383460999,
"learning_rate": 3.875838926174497e-05,
"loss": 0.0577,
"step": 101
},
{
"epoch": 0.6139954853273137,
"grad_norm": 0.794602632522583,
"learning_rate": 3.859060402684564e-05,
"loss": 0.0626,
"step": 102
},
{
"epoch": 0.6200150489089541,
"grad_norm": 1.4473546743392944,
"learning_rate": 3.8422818791946305e-05,
"loss": 0.0356,
"step": 103
},
{
"epoch": 0.6260346124905944,
"grad_norm": 0.4639027416706085,
"learning_rate": 3.8255033557046985e-05,
"loss": 0.0259,
"step": 104
},
{
"epoch": 0.6320541760722348,
"grad_norm": 1.1497997045516968,
"learning_rate": 3.808724832214765e-05,
"loss": 0.0516,
"step": 105
},
{
"epoch": 0.6380737396538751,
"grad_norm": 0.327901691198349,
"learning_rate": 3.7919463087248324e-05,
"loss": 0.0405,
"step": 106
},
{
"epoch": 0.6440933032355154,
"grad_norm": 0.4509243369102478,
"learning_rate": 3.775167785234899e-05,
"loss": 0.0892,
"step": 107
},
{
"epoch": 0.6501128668171557,
"grad_norm": 0.6975520849227905,
"learning_rate": 3.758389261744967e-05,
"loss": 0.0978,
"step": 108
},
{
"epoch": 0.6561324303987961,
"grad_norm": 0.6053667664527893,
"learning_rate": 3.741610738255034e-05,
"loss": 0.0318,
"step": 109
},
{
"epoch": 0.6621519939804364,
"grad_norm": 0.5161236524581909,
"learning_rate": 3.724832214765101e-05,
"loss": 0.0561,
"step": 110
},
{
"epoch": 0.6681715575620768,
"grad_norm": 0.4180920124053955,
"learning_rate": 3.7080536912751676e-05,
"loss": 0.0404,
"step": 111
},
{
"epoch": 0.674191121143717,
"grad_norm": 0.4068116843700409,
"learning_rate": 3.6912751677852356e-05,
"loss": 0.0399,
"step": 112
},
{
"epoch": 0.6802106847253574,
"grad_norm": 0.25368958711624146,
"learning_rate": 3.674496644295302e-05,
"loss": 0.0374,
"step": 113
},
{
"epoch": 0.6862302483069977,
"grad_norm": 0.4473256766796112,
"learning_rate": 3.6577181208053695e-05,
"loss": 0.0533,
"step": 114
},
{
"epoch": 0.6922498118886381,
"grad_norm": 0.39927905797958374,
"learning_rate": 3.640939597315436e-05,
"loss": 0.0367,
"step": 115
},
{
"epoch": 0.6982693754702785,
"grad_norm": 0.5100545883178711,
"learning_rate": 3.6241610738255034e-05,
"loss": 0.0841,
"step": 116
},
{
"epoch": 0.7042889390519187,
"grad_norm": 1.113686203956604,
"learning_rate": 3.607382550335571e-05,
"loss": 0.0798,
"step": 117
},
{
"epoch": 0.710308502633559,
"grad_norm": 0.4927540123462677,
"learning_rate": 3.5906040268456373e-05,
"loss": 0.0201,
"step": 118
},
{
"epoch": 0.7163280662151994,
"grad_norm": 0.2962929904460907,
"learning_rate": 3.5738255033557046e-05,
"loss": 0.0413,
"step": 119
},
{
"epoch": 0.7223476297968398,
"grad_norm": 0.4307601749897003,
"learning_rate": 3.557046979865772e-05,
"loss": 0.022,
"step": 120
},
{
"epoch": 0.7283671933784801,
"grad_norm": 0.5823848247528076,
"learning_rate": 3.540268456375839e-05,
"loss": 0.0357,
"step": 121
},
{
"epoch": 0.7343867569601203,
"grad_norm": 0.3515729010105133,
"learning_rate": 3.523489932885906e-05,
"loss": 0.016,
"step": 122
},
{
"epoch": 0.7404063205417607,
"grad_norm": 0.6808828115463257,
"learning_rate": 3.506711409395974e-05,
"loss": 0.0143,
"step": 123
},
{
"epoch": 0.746425884123401,
"grad_norm": 0.7892507910728455,
"learning_rate": 3.4899328859060405e-05,
"loss": 0.0264,
"step": 124
},
{
"epoch": 0.7524454477050414,
"grad_norm": 1.1977708339691162,
"learning_rate": 3.473154362416108e-05,
"loss": 0.0229,
"step": 125
},
{
"epoch": 0.7584650112866818,
"grad_norm": 1.4794739484786987,
"learning_rate": 3.4563758389261744e-05,
"loss": 0.0504,
"step": 126
},
{
"epoch": 0.764484574868322,
"grad_norm": 0.9014606475830078,
"learning_rate": 3.439597315436242e-05,
"loss": 0.0223,
"step": 127
},
{
"epoch": 0.7705041384499624,
"grad_norm": 0.5018836855888367,
"learning_rate": 3.422818791946309e-05,
"loss": 0.0359,
"step": 128
},
{
"epoch": 0.7765237020316027,
"grad_norm": 1.3349637985229492,
"learning_rate": 3.4060402684563756e-05,
"loss": 0.0572,
"step": 129
},
{
"epoch": 0.782543265613243,
"grad_norm": 1.1911202669143677,
"learning_rate": 3.389261744966443e-05,
"loss": 0.0956,
"step": 130
},
{
"epoch": 0.7885628291948834,
"grad_norm": 2.8993449211120605,
"learning_rate": 3.37248322147651e-05,
"loss": 0.1212,
"step": 131
},
{
"epoch": 0.7945823927765236,
"grad_norm": 0.6151400208473206,
"learning_rate": 3.3557046979865775e-05,
"loss": 0.0641,
"step": 132
},
{
"epoch": 0.800601956358164,
"grad_norm": 1.7681182622909546,
"learning_rate": 3.338926174496644e-05,
"loss": 0.1288,
"step": 133
},
{
"epoch": 0.8066215199398044,
"grad_norm": 1.9313393831253052,
"learning_rate": 3.3221476510067115e-05,
"loss": 0.122,
"step": 134
},
{
"epoch": 0.8126410835214447,
"grad_norm": 0.7092230916023254,
"learning_rate": 3.305369127516779e-05,
"loss": 0.0395,
"step": 135
},
{
"epoch": 0.8186606471030851,
"grad_norm": 0.6039671301841736,
"learning_rate": 3.288590604026846e-05,
"loss": 0.0517,
"step": 136
},
{
"epoch": 0.8246802106847254,
"grad_norm": 0.6897003054618835,
"learning_rate": 3.271812080536913e-05,
"loss": 0.0507,
"step": 137
},
{
"epoch": 0.8306997742663657,
"grad_norm": 0.05981295928359032,
"learning_rate": 3.25503355704698e-05,
"loss": 0.0023,
"step": 138
},
{
"epoch": 0.836719337848006,
"grad_norm": 0.48830369114875793,
"learning_rate": 3.238255033557047e-05,
"loss": 0.0652,
"step": 139
},
{
"epoch": 0.8427389014296464,
"grad_norm": 1.0506463050842285,
"learning_rate": 3.221476510067114e-05,
"loss": 0.0709,
"step": 140
},
{
"epoch": 0.8487584650112867,
"grad_norm": 0.3859744668006897,
"learning_rate": 3.204697986577181e-05,
"loss": 0.0396,
"step": 141
},
{
"epoch": 0.8547780285929271,
"grad_norm": 0.768587052822113,
"learning_rate": 3.1879194630872485e-05,
"loss": 0.0599,
"step": 142
},
{
"epoch": 0.8607975921745673,
"grad_norm": 0.6868757605552673,
"learning_rate": 3.171140939597316e-05,
"loss": 0.0373,
"step": 143
},
{
"epoch": 0.8668171557562077,
"grad_norm": 0.7010259628295898,
"learning_rate": 3.1543624161073825e-05,
"loss": 0.0717,
"step": 144
},
{
"epoch": 0.872836719337848,
"grad_norm": 0.5125268697738647,
"learning_rate": 3.13758389261745e-05,
"loss": 0.0457,
"step": 145
},
{
"epoch": 0.8788562829194884,
"grad_norm": 0.9679777026176453,
"learning_rate": 3.120805369127517e-05,
"loss": 0.0988,
"step": 146
},
{
"epoch": 0.8848758465011287,
"grad_norm": 0.7588280439376831,
"learning_rate": 3.1040268456375844e-05,
"loss": 0.0691,
"step": 147
},
{
"epoch": 0.890895410082769,
"grad_norm": 0.4412769079208374,
"learning_rate": 3.087248322147651e-05,
"loss": 0.0243,
"step": 148
},
{
"epoch": 0.8969149736644093,
"grad_norm": 0.5840623378753662,
"learning_rate": 3.070469798657718e-05,
"loss": 0.0553,
"step": 149
},
{
"epoch": 0.9029345372460497,
"grad_norm": 0.34683701395988464,
"learning_rate": 3.0536912751677856e-05,
"loss": 0.0568,
"step": 150
},
{
"epoch": 0.90895410082769,
"grad_norm": 0.6545599102973938,
"learning_rate": 3.0369127516778522e-05,
"loss": 0.0523,
"step": 151
},
{
"epoch": 0.9149736644093304,
"grad_norm": 0.3024606704711914,
"learning_rate": 3.02013422818792e-05,
"loss": 0.04,
"step": 152
},
{
"epoch": 0.9209932279909706,
"grad_norm": 0.40984031558036804,
"learning_rate": 3.0033557046979865e-05,
"loss": 0.027,
"step": 153
},
{
"epoch": 0.927012791572611,
"grad_norm": 0.3794308602809906,
"learning_rate": 2.986577181208054e-05,
"loss": 0.0927,
"step": 154
},
{
"epoch": 0.9330323551542513,
"grad_norm": 0.6882305145263672,
"learning_rate": 2.9697986577181207e-05,
"loss": 0.0343,
"step": 155
},
{
"epoch": 0.9390519187358917,
"grad_norm": 0.6028600335121155,
"learning_rate": 2.9530201342281884e-05,
"loss": 0.0479,
"step": 156
},
{
"epoch": 0.945071482317532,
"grad_norm": 3.9858436584472656,
"learning_rate": 2.936241610738255e-05,
"loss": 0.0709,
"step": 157
},
{
"epoch": 0.9510910458991723,
"grad_norm": 0.193309485912323,
"learning_rate": 2.9194630872483227e-05,
"loss": 0.0098,
"step": 158
},
{
"epoch": 0.9571106094808126,
"grad_norm": 0.6534713506698608,
"learning_rate": 2.9026845637583893e-05,
"loss": 0.0519,
"step": 159
},
{
"epoch": 0.963130173062453,
"grad_norm": 0.27771925926208496,
"learning_rate": 2.885906040268457e-05,
"loss": 0.0238,
"step": 160
},
{
"epoch": 0.9691497366440933,
"grad_norm": 0.7335049510002136,
"learning_rate": 2.8691275167785235e-05,
"loss": 0.0677,
"step": 161
},
{
"epoch": 0.9751693002257337,
"grad_norm": 1.1832383871078491,
"learning_rate": 2.8523489932885905e-05,
"loss": 0.1371,
"step": 162
},
{
"epoch": 0.9811888638073739,
"grad_norm": 0.7754644155502319,
"learning_rate": 2.8355704697986578e-05,
"loss": 0.0619,
"step": 163
},
{
"epoch": 0.9872084273890143,
"grad_norm": 0.6826126575469971,
"learning_rate": 2.8187919463087248e-05,
"loss": 0.0682,
"step": 164
},
{
"epoch": 0.9932279909706546,
"grad_norm": 0.9121167659759521,
"learning_rate": 2.802013422818792e-05,
"loss": 0.0234,
"step": 165
},
{
"epoch": 0.999247554552295,
"grad_norm": 0.6562134623527527,
"learning_rate": 2.785234899328859e-05,
"loss": 0.0615,
"step": 166
},
{
"epoch": 1.0052671181339352,
"grad_norm": 2.136812925338745,
"learning_rate": 2.7684563758389263e-05,
"loss": 0.0835,
"step": 167
},
{
"epoch": 1.0112866817155757,
"grad_norm": 0.430277019739151,
"learning_rate": 2.7516778523489933e-05,
"loss": 0.0205,
"step": 168
},
{
"epoch": 1.017306245297216,
"grad_norm": 0.37437501549720764,
"learning_rate": 2.7348993288590606e-05,
"loss": 0.0147,
"step": 169
},
{
"epoch": 1.0233258088788564,
"grad_norm": 0.09916182607412338,
"learning_rate": 2.7181208053691276e-05,
"loss": 0.0166,
"step": 170
},
{
"epoch": 1.0293453724604966,
"grad_norm": 0.22763441503047943,
"learning_rate": 2.701342281879195e-05,
"loss": 0.0296,
"step": 171
},
{
"epoch": 1.0353649360421369,
"grad_norm": 0.3235589265823364,
"learning_rate": 2.6845637583892618e-05,
"loss": 0.0214,
"step": 172
},
{
"epoch": 1.0413844996237773,
"grad_norm": 0.09323552995920181,
"learning_rate": 2.6677852348993288e-05,
"loss": 0.0029,
"step": 173
},
{
"epoch": 1.0474040632054176,
"grad_norm": 0.18400466442108154,
"learning_rate": 2.651006711409396e-05,
"loss": 0.0227,
"step": 174
},
{
"epoch": 1.053423626787058,
"grad_norm": 0.4601859450340271,
"learning_rate": 2.634228187919463e-05,
"loss": 0.0247,
"step": 175
},
{
"epoch": 1.0594431903686983,
"grad_norm": 0.06925185769796371,
"learning_rate": 2.6174496644295304e-05,
"loss": 0.0028,
"step": 176
},
{
"epoch": 1.0654627539503385,
"grad_norm": 0.7103378772735596,
"learning_rate": 2.6006711409395973e-05,
"loss": 0.0679,
"step": 177
},
{
"epoch": 1.071482317531979,
"grad_norm": 0.2948612868785858,
"learning_rate": 2.5838926174496646e-05,
"loss": 0.0158,
"step": 178
},
{
"epoch": 1.0775018811136192,
"grad_norm": 0.5460957288742065,
"learning_rate": 2.5671140939597316e-05,
"loss": 0.0252,
"step": 179
},
{
"epoch": 1.0835214446952597,
"grad_norm": 0.13775992393493652,
"learning_rate": 2.550335570469799e-05,
"loss": 0.0125,
"step": 180
},
{
"epoch": 1.0895410082769,
"grad_norm": 0.2737879753112793,
"learning_rate": 2.533557046979866e-05,
"loss": 0.0087,
"step": 181
},
{
"epoch": 1.0955605718585402,
"grad_norm": 0.37196484208106995,
"learning_rate": 2.516778523489933e-05,
"loss": 0.0579,
"step": 182
},
{
"epoch": 1.1015801354401806,
"grad_norm": 0.3493405282497406,
"learning_rate": 2.5e-05,
"loss": 0.0126,
"step": 183
},
{
"epoch": 1.1075996990218209,
"grad_norm": 1.0219722986221313,
"learning_rate": 2.4832214765100674e-05,
"loss": 0.0701,
"step": 184
},
{
"epoch": 1.1136192626034613,
"grad_norm": 0.32175976037979126,
"learning_rate": 2.4664429530201344e-05,
"loss": 0.012,
"step": 185
},
{
"epoch": 1.1196388261851016,
"grad_norm": 0.33765479922294617,
"learning_rate": 2.4496644295302017e-05,
"loss": 0.0106,
"step": 186
},
{
"epoch": 1.1256583897667418,
"grad_norm": 0.17531374096870422,
"learning_rate": 2.4328859060402687e-05,
"loss": 0.0161,
"step": 187
},
{
"epoch": 1.1316779533483823,
"grad_norm": 0.1013503223657608,
"learning_rate": 2.416107382550336e-05,
"loss": 0.0057,
"step": 188
},
{
"epoch": 1.1376975169300225,
"grad_norm": 0.5186209082603455,
"learning_rate": 2.3993288590604026e-05,
"loss": 0.0189,
"step": 189
},
{
"epoch": 1.143717080511663,
"grad_norm": 0.577898383140564,
"learning_rate": 2.38255033557047e-05,
"loss": 0.0315,
"step": 190
},
{
"epoch": 1.1497366440933032,
"grad_norm": 0.2543765604496002,
"learning_rate": 2.365771812080537e-05,
"loss": 0.0259,
"step": 191
},
{
"epoch": 1.1557562076749435,
"grad_norm": 1.04751718044281,
"learning_rate": 2.348993288590604e-05,
"loss": 0.0267,
"step": 192
},
{
"epoch": 1.161775771256584,
"grad_norm": 0.30151480436325073,
"learning_rate": 2.332214765100671e-05,
"loss": 0.016,
"step": 193
},
{
"epoch": 1.1677953348382242,
"grad_norm": 1.1602953672409058,
"learning_rate": 2.3154362416107384e-05,
"loss": 0.0342,
"step": 194
},
{
"epoch": 1.1738148984198646,
"grad_norm": 0.6510918140411377,
"learning_rate": 2.2986577181208054e-05,
"loss": 0.0367,
"step": 195
},
{
"epoch": 1.1798344620015049,
"grad_norm": 0.2937709093093872,
"learning_rate": 2.2818791946308727e-05,
"loss": 0.0124,
"step": 196
},
{
"epoch": 1.1858540255831453,
"grad_norm": 0.3778565526008606,
"learning_rate": 2.2651006711409396e-05,
"loss": 0.0353,
"step": 197
},
{
"epoch": 1.1918735891647856,
"grad_norm": 0.34342288970947266,
"learning_rate": 2.248322147651007e-05,
"loss": 0.0228,
"step": 198
},
{
"epoch": 1.1978931527464258,
"grad_norm": 0.25225672125816345,
"learning_rate": 2.231543624161074e-05,
"loss": 0.0037,
"step": 199
},
{
"epoch": 1.2039127163280663,
"grad_norm": 0.3875395953655243,
"learning_rate": 2.2147651006711412e-05,
"loss": 0.024,
"step": 200
},
{
"epoch": 1.2099322799097065,
"grad_norm": 0.48843473196029663,
"learning_rate": 2.197986577181208e-05,
"loss": 0.0411,
"step": 201
},
{
"epoch": 1.215951843491347,
"grad_norm": 0.008358384482562542,
"learning_rate": 2.181208053691275e-05,
"loss": 0.0002,
"step": 202
},
{
"epoch": 1.2219714070729872,
"grad_norm": 0.0617498978972435,
"learning_rate": 2.1644295302013424e-05,
"loss": 0.0016,
"step": 203
},
{
"epoch": 1.2279909706546275,
"grad_norm": 0.5839952826499939,
"learning_rate": 2.1476510067114094e-05,
"loss": 0.0255,
"step": 204
},
{
"epoch": 1.234010534236268,
"grad_norm": 0.6008470058441162,
"learning_rate": 2.1308724832214767e-05,
"loss": 0.0279,
"step": 205
},
{
"epoch": 1.2400300978179082,
"grad_norm": 0.08057394623756409,
"learning_rate": 2.1140939597315437e-05,
"loss": 0.014,
"step": 206
},
{
"epoch": 1.2460496613995486,
"grad_norm": 0.8297271728515625,
"learning_rate": 2.097315436241611e-05,
"loss": 0.0433,
"step": 207
},
{
"epoch": 1.2520692249811889,
"grad_norm": 1.0753511190414429,
"learning_rate": 2.080536912751678e-05,
"loss": 0.0342,
"step": 208
},
{
"epoch": 1.2580887885628291,
"grad_norm": 0.11652516573667526,
"learning_rate": 2.0637583892617452e-05,
"loss": 0.0122,
"step": 209
},
{
"epoch": 1.2641083521444696,
"grad_norm": 0.23289084434509277,
"learning_rate": 2.0469798657718122e-05,
"loss": 0.0229,
"step": 210
},
{
"epoch": 1.2701279157261098,
"grad_norm": 0.5731219053268433,
"learning_rate": 2.0302013422818795e-05,
"loss": 0.0455,
"step": 211
},
{
"epoch": 1.2761474793077503,
"grad_norm": 0.8601072430610657,
"learning_rate": 2.013422818791946e-05,
"loss": 0.0294,
"step": 212
},
{
"epoch": 1.2821670428893905,
"grad_norm": 0.9172778129577637,
"learning_rate": 1.9966442953020134e-05,
"loss": 0.0803,
"step": 213
},
{
"epoch": 1.2881866064710308,
"grad_norm": 0.18378061056137085,
"learning_rate": 1.9798657718120804e-05,
"loss": 0.0151,
"step": 214
},
{
"epoch": 1.2942061700526712,
"grad_norm": 0.2338120937347412,
"learning_rate": 1.9630872483221477e-05,
"loss": 0.0214,
"step": 215
},
{
"epoch": 1.3002257336343115,
"grad_norm": 0.09691441804170609,
"learning_rate": 1.946308724832215e-05,
"loss": 0.0087,
"step": 216
},
{
"epoch": 1.306245297215952,
"grad_norm": 0.1699642539024353,
"learning_rate": 1.929530201342282e-05,
"loss": 0.0088,
"step": 217
},
{
"epoch": 1.3122648607975922,
"grad_norm": 0.014856619760394096,
"learning_rate": 1.9127516778523493e-05,
"loss": 0.0003,
"step": 218
},
{
"epoch": 1.3182844243792324,
"grad_norm": 0.17981240153312683,
"learning_rate": 1.8959731543624162e-05,
"loss": 0.0148,
"step": 219
},
{
"epoch": 1.324303987960873,
"grad_norm": 0.1564723402261734,
"learning_rate": 1.8791946308724835e-05,
"loss": 0.025,
"step": 220
},
{
"epoch": 1.3303235515425131,
"grad_norm": 0.05128008872270584,
"learning_rate": 1.8624161073825505e-05,
"loss": 0.0013,
"step": 221
},
{
"epoch": 1.3363431151241536,
"grad_norm": 0.018907951191067696,
"learning_rate": 1.8456375838926178e-05,
"loss": 0.0006,
"step": 222
},
{
"epoch": 1.3423626787057938,
"grad_norm": 0.047860465943813324,
"learning_rate": 1.8288590604026847e-05,
"loss": 0.0018,
"step": 223
},
{
"epoch": 1.348382242287434,
"grad_norm": 1.6964343786239624,
"learning_rate": 1.8120805369127517e-05,
"loss": 0.0834,
"step": 224
},
{
"epoch": 1.3544018058690745,
"grad_norm": 0.09841305017471313,
"learning_rate": 1.7953020134228187e-05,
"loss": 0.0018,
"step": 225
},
{
"epoch": 1.3604213694507148,
"grad_norm": 0.29318398237228394,
"learning_rate": 1.778523489932886e-05,
"loss": 0.0292,
"step": 226
},
{
"epoch": 1.3664409330323553,
"grad_norm": 0.6777030229568481,
"learning_rate": 1.761744966442953e-05,
"loss": 0.0453,
"step": 227
},
{
"epoch": 1.3724604966139955,
"grad_norm": 0.09742780774831772,
"learning_rate": 1.7449664429530202e-05,
"loss": 0.0022,
"step": 228
},
{
"epoch": 1.3784800601956357,
"grad_norm": 0.21270929276943207,
"learning_rate": 1.7281879194630872e-05,
"loss": 0.0216,
"step": 229
},
{
"epoch": 1.3844996237772762,
"grad_norm": 0.10257343202829361,
"learning_rate": 1.7114093959731545e-05,
"loss": 0.0032,
"step": 230
},
{
"epoch": 1.3905191873589164,
"grad_norm": 0.2899154722690582,
"learning_rate": 1.6946308724832215e-05,
"loss": 0.0336,
"step": 231
},
{
"epoch": 1.396538750940557,
"grad_norm": 0.560697615146637,
"learning_rate": 1.6778523489932888e-05,
"loss": 0.0167,
"step": 232
},
{
"epoch": 1.4025583145221971,
"grad_norm": 0.15792670845985413,
"learning_rate": 1.6610738255033557e-05,
"loss": 0.0161,
"step": 233
},
{
"epoch": 1.4085778781038374,
"grad_norm": 0.112309031188488,
"learning_rate": 1.644295302013423e-05,
"loss": 0.0081,
"step": 234
},
{
"epoch": 1.4145974416854779,
"grad_norm": 0.6623883247375488,
"learning_rate": 1.62751677852349e-05,
"loss": 0.0679,
"step": 235
},
{
"epoch": 1.420617005267118,
"grad_norm": 0.27897122502326965,
"learning_rate": 1.610738255033557e-05,
"loss": 0.0162,
"step": 236
},
{
"epoch": 1.4266365688487586,
"grad_norm": 0.08262226730585098,
"learning_rate": 1.5939597315436243e-05,
"loss": 0.0029,
"step": 237
},
{
"epoch": 1.4326561324303988,
"grad_norm": 0.13499091565608978,
"learning_rate": 1.5771812080536912e-05,
"loss": 0.0147,
"step": 238
},
{
"epoch": 1.438675696012039,
"grad_norm": 0.2563413977622986,
"learning_rate": 1.5604026845637585e-05,
"loss": 0.0186,
"step": 239
},
{
"epoch": 1.4446952595936795,
"grad_norm": 0.38309767842292786,
"learning_rate": 1.5436241610738255e-05,
"loss": 0.0042,
"step": 240
},
{
"epoch": 1.4507148231753197,
"grad_norm": 0.5308915972709656,
"learning_rate": 1.5268456375838928e-05,
"loss": 0.023,
"step": 241
},
{
"epoch": 1.4567343867569602,
"grad_norm": 0.5418457984924316,
"learning_rate": 1.51006711409396e-05,
"loss": 0.0271,
"step": 242
},
{
"epoch": 1.4627539503386005,
"grad_norm": 0.16427500545978546,
"learning_rate": 1.493288590604027e-05,
"loss": 0.0167,
"step": 243
},
{
"epoch": 1.4687735139202407,
"grad_norm": 0.1764906644821167,
"learning_rate": 1.4765100671140942e-05,
"loss": 0.0041,
"step": 244
},
{
"epoch": 1.4747930775018812,
"grad_norm": 0.028177335858345032,
"learning_rate": 1.4597315436241613e-05,
"loss": 0.0011,
"step": 245
},
{
"epoch": 1.4808126410835214,
"grad_norm": 0.28984132409095764,
"learning_rate": 1.4429530201342285e-05,
"loss": 0.0037,
"step": 246
},
{
"epoch": 1.4868322046651619,
"grad_norm": 0.016668178141117096,
"learning_rate": 1.4261744966442953e-05,
"loss": 0.0007,
"step": 247
},
{
"epoch": 1.492851768246802,
"grad_norm": 0.11294250190258026,
"learning_rate": 1.4093959731543624e-05,
"loss": 0.0117,
"step": 248
},
{
"epoch": 1.4988713318284423,
"grad_norm": 0.18805146217346191,
"learning_rate": 1.3926174496644295e-05,
"loss": 0.0152,
"step": 249
},
{
"epoch": 1.5048908954100828,
"grad_norm": 0.15651652216911316,
"learning_rate": 1.3758389261744966e-05,
"loss": 0.0195,
"step": 250
},
{
"epoch": 1.510910458991723,
"grad_norm": 0.07596039772033691,
"learning_rate": 1.3590604026845638e-05,
"loss": 0.0035,
"step": 251
},
{
"epoch": 1.5169300225733635,
"grad_norm": 0.5767983198165894,
"learning_rate": 1.3422818791946309e-05,
"loss": 0.0209,
"step": 252
},
{
"epoch": 1.5229495861550038,
"grad_norm": 0.14054809510707855,
"learning_rate": 1.325503355704698e-05,
"loss": 0.0156,
"step": 253
},
{
"epoch": 1.528969149736644,
"grad_norm": 0.5480087995529175,
"learning_rate": 1.3087248322147652e-05,
"loss": 0.0372,
"step": 254
},
{
"epoch": 1.5349887133182845,
"grad_norm": 0.21327197551727295,
"learning_rate": 1.2919463087248323e-05,
"loss": 0.037,
"step": 255
},
{
"epoch": 1.5410082768999247,
"grad_norm": 0.8947880268096924,
"learning_rate": 1.2751677852348994e-05,
"loss": 0.0352,
"step": 256
},
{
"epoch": 1.5470278404815652,
"grad_norm": 0.16651660203933716,
"learning_rate": 1.2583892617449666e-05,
"loss": 0.0062,
"step": 257
},
{
"epoch": 1.5530474040632054,
"grad_norm": 0.18476413190364838,
"learning_rate": 1.2416107382550337e-05,
"loss": 0.0177,
"step": 258
},
{
"epoch": 1.5590669676448456,
"grad_norm": 0.013061203993856907,
"learning_rate": 1.2248322147651008e-05,
"loss": 0.0005,
"step": 259
},
{
"epoch": 1.565086531226486,
"grad_norm": 0.31754836440086365,
"learning_rate": 1.208053691275168e-05,
"loss": 0.0181,
"step": 260
},
{
"epoch": 1.5711060948081266,
"grad_norm": 0.26034584641456604,
"learning_rate": 1.191275167785235e-05,
"loss": 0.0342,
"step": 261
},
{
"epoch": 1.5771256583897668,
"grad_norm": 0.15222539007663727,
"learning_rate": 1.174496644295302e-05,
"loss": 0.0081,
"step": 262
},
{
"epoch": 1.583145221971407,
"grad_norm": 0.1855451464653015,
"learning_rate": 1.1577181208053692e-05,
"loss": 0.0222,
"step": 263
},
{
"epoch": 1.5891647855530473,
"grad_norm": 0.47386428713798523,
"learning_rate": 1.1409395973154363e-05,
"loss": 0.0484,
"step": 264
},
{
"epoch": 1.5951843491346878,
"grad_norm": 0.05222529545426369,
"learning_rate": 1.1241610738255035e-05,
"loss": 0.0018,
"step": 265
},
{
"epoch": 1.6012039127163282,
"grad_norm": 0.36145541071891785,
"learning_rate": 1.1073825503355706e-05,
"loss": 0.0256,
"step": 266
},
{
"epoch": 1.6072234762979685,
"grad_norm": 0.2200651317834854,
"learning_rate": 1.0906040268456376e-05,
"loss": 0.0048,
"step": 267
},
{
"epoch": 1.6132430398796087,
"grad_norm": 0.2838999330997467,
"learning_rate": 1.0738255033557047e-05,
"loss": 0.0088,
"step": 268
},
{
"epoch": 1.619262603461249,
"grad_norm": 0.5340823531150818,
"learning_rate": 1.0570469798657718e-05,
"loss": 0.0054,
"step": 269
},
{
"epoch": 1.6252821670428894,
"grad_norm": 0.27307260036468506,
"learning_rate": 1.040268456375839e-05,
"loss": 0.014,
"step": 270
},
{
"epoch": 1.6313017306245299,
"grad_norm": 0.694962739944458,
"learning_rate": 1.0234899328859061e-05,
"loss": 0.0126,
"step": 271
},
{
"epoch": 1.6373212942061701,
"grad_norm": 0.19789136946201324,
"learning_rate": 1.006711409395973e-05,
"loss": 0.0172,
"step": 272
},
{
"epoch": 1.6433408577878104,
"grad_norm": 0.5267607569694519,
"learning_rate": 9.899328859060402e-06,
"loss": 0.0408,
"step": 273
},
{
"epoch": 1.6493604213694506,
"grad_norm": 0.015556755475699902,
"learning_rate": 9.731543624161075e-06,
"loss": 0.0006,
"step": 274
},
{
"epoch": 1.655379984951091,
"grad_norm": 0.5071566104888916,
"learning_rate": 9.563758389261746e-06,
"loss": 0.0281,
"step": 275
},
{
"epoch": 1.6613995485327315,
"grad_norm": 0.1441573202610016,
"learning_rate": 9.395973154362418e-06,
"loss": 0.0151,
"step": 276
},
{
"epoch": 1.6674191121143718,
"grad_norm": 0.22274713218212128,
"learning_rate": 9.228187919463089e-06,
"loss": 0.0174,
"step": 277
},
{
"epoch": 1.673438675696012,
"grad_norm": 1.108049988746643,
"learning_rate": 9.060402684563759e-06,
"loss": 0.0189,
"step": 278
},
{
"epoch": 1.6794582392776523,
"grad_norm": 0.47223615646362305,
"learning_rate": 8.89261744966443e-06,
"loss": 0.0261,
"step": 279
},
{
"epoch": 1.6854778028592927,
"grad_norm": 0.11383321136236191,
"learning_rate": 8.724832214765101e-06,
"loss": 0.0139,
"step": 280
},
{
"epoch": 1.6914973664409332,
"grad_norm": 0.01508291345089674,
"learning_rate": 8.557046979865773e-06,
"loss": 0.0006,
"step": 281
},
{
"epoch": 1.6975169300225734,
"grad_norm": 0.572291910648346,
"learning_rate": 8.389261744966444e-06,
"loss": 0.0587,
"step": 282
},
{
"epoch": 1.7035364936042137,
"grad_norm": 0.8027609586715698,
"learning_rate": 8.221476510067115e-06,
"loss": 0.0252,
"step": 283
},
{
"epoch": 1.709556057185854,
"grad_norm": 0.0062585920095443726,
"learning_rate": 8.053691275167785e-06,
"loss": 0.0002,
"step": 284
},
{
"epoch": 1.7155756207674944,
"grad_norm": 0.18026615679264069,
"learning_rate": 7.885906040268456e-06,
"loss": 0.0069,
"step": 285
},
{
"epoch": 1.7215951843491348,
"grad_norm": 0.02148846536874771,
"learning_rate": 7.718120805369127e-06,
"loss": 0.0006,
"step": 286
},
{
"epoch": 1.727614747930775,
"grad_norm": 0.40165480971336365,
"learning_rate": 7.5503355704698e-06,
"loss": 0.0483,
"step": 287
},
{
"epoch": 1.7336343115124153,
"grad_norm": 0.34756484627723694,
"learning_rate": 7.382550335570471e-06,
"loss": 0.0089,
"step": 288
},
{
"epoch": 1.7396538750940556,
"grad_norm": 0.8310803771018982,
"learning_rate": 7.214765100671142e-06,
"loss": 0.0349,
"step": 289
},
{
"epoch": 1.745673438675696,
"grad_norm": 0.21227827668190002,
"learning_rate": 7.046979865771812e-06,
"loss": 0.0098,
"step": 290
},
{
"epoch": 1.7516930022573365,
"grad_norm": 0.18423959612846375,
"learning_rate": 6.879194630872483e-06,
"loss": 0.0196,
"step": 291
},
{
"epoch": 1.7577125658389767,
"grad_norm": 0.13899557292461395,
"learning_rate": 6.7114093959731546e-06,
"loss": 0.0086,
"step": 292
},
{
"epoch": 1.763732129420617,
"grad_norm": 0.9509983658790588,
"learning_rate": 6.543624161073826e-06,
"loss": 0.0227,
"step": 293
},
{
"epoch": 1.7697516930022572,
"grad_norm": 0.2806952893733978,
"learning_rate": 6.375838926174497e-06,
"loss": 0.0308,
"step": 294
},
{
"epoch": 1.7757712565838977,
"grad_norm": 0.2584255337715149,
"learning_rate": 6.2080536912751686e-06,
"loss": 0.0078,
"step": 295
},
{
"epoch": 1.7817908201655381,
"grad_norm": 0.6496636867523193,
"learning_rate": 6.04026845637584e-06,
"loss": 0.0764,
"step": 296
},
{
"epoch": 1.7878103837471784,
"grad_norm": 2.131640672683716,
"learning_rate": 5.87248322147651e-06,
"loss": 0.0485,
"step": 297
},
{
"epoch": 1.7938299473288186,
"grad_norm": 0.25984302163124084,
"learning_rate": 5.704697986577182e-06,
"loss": 0.0256,
"step": 298
},
{
"epoch": 1.7998495109104589,
"grad_norm": 0.8501216769218445,
"learning_rate": 5.536912751677853e-06,
"loss": 0.0559,
"step": 299
},
{
"epoch": 1.8058690744920993,
"grad_norm": 0.3276824355125427,
"learning_rate": 5.3691275167785235e-06,
"loss": 0.0228,
"step": 300
},
{
"epoch": 1.8118886380737398,
"grad_norm": 0.19804896414279938,
"learning_rate": 5.201342281879195e-06,
"loss": 0.0121,
"step": 301
},
{
"epoch": 1.81790820165538,
"grad_norm": 0.6229518055915833,
"learning_rate": 5.033557046979865e-06,
"loss": 0.0046,
"step": 302
},
{
"epoch": 1.8239277652370203,
"grad_norm": 0.16715337336063385,
"learning_rate": 4.8657718120805375e-06,
"loss": 0.0107,
"step": 303
},
{
"epoch": 1.8299473288186605,
"grad_norm": 0.14998656511306763,
"learning_rate": 4.697986577181209e-06,
"loss": 0.0041,
"step": 304
},
{
"epoch": 1.835966892400301,
"grad_norm": 0.3184771239757538,
"learning_rate": 4.530201342281879e-06,
"loss": 0.0107,
"step": 305
},
{
"epoch": 1.8419864559819414,
"grad_norm": 0.665398120880127,
"learning_rate": 4.362416107382551e-06,
"loss": 0.0227,
"step": 306
},
{
"epoch": 1.8480060195635817,
"grad_norm": 0.19727759063243866,
"learning_rate": 4.194630872483222e-06,
"loss": 0.0122,
"step": 307
},
{
"epoch": 1.854025583145222,
"grad_norm": 0.13394923508167267,
"learning_rate": 4.026845637583892e-06,
"loss": 0.0053,
"step": 308
},
{
"epoch": 1.8600451467268622,
"grad_norm": 0.17236709594726562,
"learning_rate": 3.859060402684564e-06,
"loss": 0.0184,
"step": 309
},
{
"epoch": 1.8660647103085026,
"grad_norm": 1.0638315677642822,
"learning_rate": 3.6912751677852355e-06,
"loss": 0.0311,
"step": 310
},
{
"epoch": 1.872084273890143,
"grad_norm": 0.06651457399129868,
"learning_rate": 3.523489932885906e-06,
"loss": 0.0023,
"step": 311
},
{
"epoch": 1.8781038374717833,
"grad_norm": 0.19191402196884155,
"learning_rate": 3.3557046979865773e-06,
"loss": 0.0195,
"step": 312
},
{
"epoch": 1.8841234010534236,
"grad_norm": 0.004882109817117453,
"learning_rate": 3.1879194630872486e-06,
"loss": 0.0002,
"step": 313
},
{
"epoch": 1.8901429646350638,
"grad_norm": 0.2256098985671997,
"learning_rate": 3.02013422818792e-06,
"loss": 0.0145,
"step": 314
},
{
"epoch": 1.8961625282167043,
"grad_norm": 0.49490997195243835,
"learning_rate": 2.852348993288591e-06,
"loss": 0.0378,
"step": 315
},
{
"epoch": 1.9021820917983447,
"grad_norm": 0.2518860101699829,
"learning_rate": 2.6845637583892617e-06,
"loss": 0.0424,
"step": 316
},
{
"epoch": 1.908201655379985,
"grad_norm": 0.01092343870550394,
"learning_rate": 2.5167785234899326e-06,
"loss": 0.0003,
"step": 317
},
{
"epoch": 1.9142212189616252,
"grad_norm": 0.45049425959587097,
"learning_rate": 2.3489932885906044e-06,
"loss": 0.053,
"step": 318
},
{
"epoch": 1.9202407825432655,
"grad_norm": 0.01676754839718342,
"learning_rate": 2.1812080536912753e-06,
"loss": 0.0007,
"step": 319
},
{
"epoch": 1.926260346124906,
"grad_norm": 0.08567023277282715,
"learning_rate": 2.013422818791946e-06,
"loss": 0.0022,
"step": 320
},
{
"epoch": 1.9322799097065464,
"grad_norm": 0.243726447224617,
"learning_rate": 1.8456375838926177e-06,
"loss": 0.022,
"step": 321
},
{
"epoch": 1.9382994732881866,
"grad_norm": 0.20735050737857819,
"learning_rate": 1.6778523489932886e-06,
"loss": 0.0113,
"step": 322
},
{
"epoch": 1.9443190368698269,
"grad_norm": 0.013643703423440456,
"learning_rate": 1.51006711409396e-06,
"loss": 0.0005,
"step": 323
},
{
"epoch": 1.9503386004514671,
"grad_norm": 0.14169737696647644,
"learning_rate": 1.3422818791946309e-06,
"loss": 0.004,
"step": 324
},
{
"epoch": 1.9563581640331076,
"grad_norm": 0.21097888052463531,
"learning_rate": 1.1744966442953022e-06,
"loss": 0.0163,
"step": 325
},
{
"epoch": 1.962377727614748,
"grad_norm": 0.3481338620185852,
"learning_rate": 1.006711409395973e-06,
"loss": 0.0328,
"step": 326
},
{
"epoch": 1.9683972911963883,
"grad_norm": 0.639370858669281,
"learning_rate": 8.389261744966443e-07,
"loss": 0.0385,
"step": 327
},
{
"epoch": 1.9744168547780285,
"grad_norm": 0.01708345301449299,
"learning_rate": 6.711409395973154e-07,
"loss": 0.0006,
"step": 328
},
{
"epoch": 1.9804364183596688,
"grad_norm": 1.5634732246398926,
"learning_rate": 5.033557046979866e-07,
"loss": 0.0966,
"step": 329
},
{
"epoch": 1.9864559819413092,
"grad_norm": 0.03818768635392189,
"learning_rate": 3.355704697986577e-07,
"loss": 0.001,
"step": 330
},
{
"epoch": 1.9924755455229497,
"grad_norm": 1.7485132217407227,
"learning_rate": 1.6778523489932886e-07,
"loss": 0.0881,
"step": 331
},
{
"epoch": 1.99849510910459,
"grad_norm": 2.490537643432617,
"learning_rate": 0.0,
"loss": 0.0464,
"step": 332
}
],
"logging_steps": 1,
"max_steps": 332,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.806882647064781e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}