Training in progress, step 186, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +662 -3

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b77cad872e4bcc0e7430f7ae5e5dd49663565ee18e3d3f00e8e2a69232aaefe9
 size 80013120

 version https://git-lfs.github.com/spec/v1
+oid sha256:f7183e64a109a343469dca147a8c0f81c155762d2008823f579ca9d3c894683e
 size 80013120

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:07e5a97fc87827d4ac283ef085c485e13587e49c599f0bc37667e3ec5c9fd3b5
 size 41119636

 version https://git-lfs.github.com/spec/v1
+oid sha256:05740b4cc52050b284c0a5ae3bb1a0c79e4e11d33baa793c0b7886b2990dd202
 size 41119636

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e43e7b805b054b95bdd6a42492a7a566a708443ec3c2c635b33190dba7252c59
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:74d5b0d31a7bfc657111f3d8a8c89bd9f54c57945ce1f937d44749b81c417e07
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b893c9226071507bd706a0d6a7a997c6693067f1b6d62a3307e8595b10559486
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:51627bede7e359d4449d36ee1f729a3d0065d65146d217ca0847d4a1da7e2115
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.25016812373907193,
   "eval_steps": 93,
-  "global_step": 93,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -666,6 +666,665 @@
       "eval_samples_per_second": 14.493,
       "eval_steps_per_second": 7.293,
       "step": 93
     }
   ],
   "logging_steps": 1,
@@ -685,7 +1344,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.029473590194995e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.5003362474781439,
   "eval_steps": 93,
+  "global_step": 186,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 14.493,
       "eval_steps_per_second": 7.293,
       "step": 93
+    },
+    {
+      "epoch": 0.2528581035642233,
+      "grad_norm": 5.78995418548584,
+      "learning_rate": 0.00017458486592061704,
+      "loss": 0.9346,
+      "step": 94
+    },
+    {
+      "epoch": 0.25554808338937457,
+      "grad_norm": 3.517900228500366,
+      "learning_rate": 0.00017400398070435293,
+      "loss": 0.3506,
+      "step": 95
+    },
+    {
+      "epoch": 0.2582380632145259,
+      "grad_norm": 5.804417610168457,
+      "learning_rate": 0.00017341752189883983,
+      "loss": 0.4959,
+      "step": 96
+    },
+    {
+      "epoch": 0.2609280430396772,
+      "grad_norm": 8.148117065429688,
+      "learning_rate": 0.00017282553367305975,
+      "loss": 0.9842,
+      "step": 97
+    },
+    {
+      "epoch": 0.2636180228648285,
+      "grad_norm": 9.511378288269043,
+      "learning_rate": 0.0001722280606124415,
+      "loss": 0.7143,
+      "step": 98
+    },
+    {
+      "epoch": 0.26630800268997984,
+      "grad_norm": 6.079991340637207,
+      "learning_rate": 0.00017162514771550255,
+      "loss": 0.2979,
+      "step": 99
+    },
+    {
+      "epoch": 0.26899798251513113,
+      "grad_norm": 6.114333152770996,
+      "learning_rate": 0.00017101684039046036,
+      "loss": 0.5812,
+      "step": 100
+    },
+    {
+      "epoch": 0.2716879623402825,
+      "grad_norm": 4.91884183883667,
+      "learning_rate": 0.0001704031844518121,
+      "loss": 1.8317,
+      "step": 101
+    },
+    {
+      "epoch": 0.27437794216543376,
+      "grad_norm": 5.735188007354736,
+      "learning_rate": 0.0001697842261168843,
+      "loss": 2.3345,
+      "step": 102
+    },
+    {
+      "epoch": 0.27706792199058505,
+      "grad_norm": 5.317649841308594,
+      "learning_rate": 0.0001691600120023521,
+      "loss": 2.0851,
+      "step": 103
+    },
+    {
+      "epoch": 0.2797579018157364,
+      "grad_norm": 7.778799057006836,
+      "learning_rate": 0.00016853058912072802,
+      "loss": 1.1674,
+      "step": 104
+    },
+    {
+      "epoch": 0.2824478816408877,
+      "grad_norm": 4.196416854858398,
+      "learning_rate": 0.00016789600487682156,
+      "loss": 1.5939,
+      "step": 105
+    },
+    {
+      "epoch": 0.285137861466039,
+      "grad_norm": 4.3927741050720215,
+      "learning_rate": 0.0001672563070641688,
+      "loss": 1.4615,
+      "step": 106
+    },
+    {
+      "epoch": 0.2878278412911903,
+      "grad_norm": 4.284142017364502,
+      "learning_rate": 0.0001666115438614328,
+      "loss": 1.9508,
+      "step": 107
+    },
+    {
+      "epoch": 0.2905178211163416,
+      "grad_norm": 5.4508867263793945,
+      "learning_rate": 0.00016596176382877506,
+      "loss": 1.3256,
+      "step": 108
+    },
+    {
+      "epoch": 0.29320780094149296,
+      "grad_norm": 11.987678527832031,
+      "learning_rate": 0.00016530701590419824,
+      "loss": 0.9202,
+      "step": 109
+    },
+    {
+      "epoch": 0.29589778076664425,
+      "grad_norm": 5.667636394500732,
+      "learning_rate": 0.00016464734939986036,
+      "loss": 1.3247,
+      "step": 110
+    },
+    {
+      "epoch": 0.29858776059179554,
+      "grad_norm": 3.8087687492370605,
+      "learning_rate": 0.00016398281399836097,
+      "loss": 0.9626,
+      "step": 111
+    },
+    {
+      "epoch": 0.3012777404169469,
+      "grad_norm": 5.772204875946045,
+      "learning_rate": 0.00016331345974899923,
+      "loss": 1.3912,
+      "step": 112
+    },
+    {
+      "epoch": 0.30396772024209817,
+      "grad_norm": 3.2174160480499268,
+      "learning_rate": 0.00016263933706400451,
+      "loss": 1.0545,
+      "step": 113
+    },
+    {
+      "epoch": 0.3066577000672495,
+      "grad_norm": 3.539743423461914,
+      "learning_rate": 0.00016196049671473954,
+      "loss": 0.9489,
+      "step": 114
+    },
+    {
+      "epoch": 0.3093476798924008,
+      "grad_norm": 3.6935033798217773,
+      "learning_rate": 0.0001612769898278766,
+      "loss": 1.0005,
+      "step": 115
+    },
+    {
+      "epoch": 0.3120376597175521,
+      "grad_norm": 3.477961301803589,
+      "learning_rate": 0.00016058886788154712,
+      "loss": 0.6155,
+      "step": 116
+    },
+    {
+      "epoch": 0.31472763954270344,
+      "grad_norm": 3.9399242401123047,
+      "learning_rate": 0.00015989618270146423,
+      "loss": 0.7689,
+      "step": 117
+    },
+    {
+      "epoch": 0.31741761936785473,
+      "grad_norm": 4.4496846199035645,
+      "learning_rate": 0.0001591989864570199,
+      "loss": 1.0174,
+      "step": 118
+    },
+    {
+      "epoch": 0.3201075991930061,
+      "grad_norm": 4.519758224487305,
+      "learning_rate": 0.00015849733165735556,
+      "loss": 0.9051,
+      "step": 119
+    },
+    {
+      "epoch": 0.32279757901815737,
+      "grad_norm": 3.636235237121582,
+      "learning_rate": 0.00015779127114740757,
+      "loss": 0.5993,
+      "step": 120
+    },
+    {
+      "epoch": 0.32548755884330866,
+      "grad_norm": 2.2947537899017334,
+      "learning_rate": 0.0001570808581039271,
+      "loss": 0.23,
+      "step": 121
+    },
+    {
+      "epoch": 0.32817753866846,
+      "grad_norm": 3.0490782260894775,
+      "learning_rate": 0.00015636614603147512,
+      "loss": 0.5818,
+      "step": 122
+    },
+    {
+      "epoch": 0.3308675184936113,
+      "grad_norm": 3.2933220863342285,
+      "learning_rate": 0.0001556471887583929,
+      "loss": 0.6548,
+      "step": 123
+    },
+    {
+      "epoch": 0.33355749831876264,
+      "grad_norm": 4.488528251647949,
+      "learning_rate": 0.0001549240404327477,
+      "loss": 0.9628,
+      "step": 124
+    },
+    {
+      "epoch": 0.3362474781439139,
+      "grad_norm": 4.679425239562988,
+      "learning_rate": 0.00015419675551825475,
+      "loss": 0.4106,
+      "step": 125
+    },
+    {
+      "epoch": 0.3389374579690652,
+      "grad_norm": 4.400868892669678,
+      "learning_rate": 0.0001534653887901754,
+      "loss": 0.3852,
+      "step": 126
+    },
+    {
+      "epoch": 0.34162743779421656,
+      "grad_norm": 4.978918552398682,
+      "learning_rate": 0.00015272999533119162,
+      "loss": 0.8162,
+      "step": 127
+    },
+    {
+      "epoch": 0.34431741761936785,
+      "grad_norm": 5.046586990356445,
+      "learning_rate": 0.00015199063052725745,
+      "loss": 0.649,
+      "step": 128
+    },
+    {
+      "epoch": 0.34700739744451914,
+      "grad_norm": 7.412467956542969,
+      "learning_rate": 0.0001512473500634277,
+      "loss": 0.6579,
+      "step": 129
+    },
+    {
+      "epoch": 0.3496973772696705,
+      "grad_norm": 3.8262441158294678,
+      "learning_rate": 0.00015050020991966406,
+      "loss": 0.4359,
+      "step": 130
+    },
+    {
+      "epoch": 0.3523873570948218,
+      "grad_norm": 5.179169654846191,
+      "learning_rate": 0.0001497492663666189,
+      "loss": 0.5676,
+      "step": 131
+    },
+    {
+      "epoch": 0.3550773369199731,
+      "grad_norm": 5.74229097366333,
+      "learning_rate": 0.00014899457596139729,
+      "loss": 0.3635,
+      "step": 132
+    },
+    {
+      "epoch": 0.3577673167451244,
+      "grad_norm": 7.098540782928467,
+      "learning_rate": 0.00014823619554329745,
+      "loss": 0.996,
+      "step": 133
+    },
+    {
+      "epoch": 0.3604572965702757,
+      "grad_norm": 4.635382652282715,
+      "learning_rate": 0.00014747418222952995,
+      "loss": 0.7149,
+      "step": 134
+    },
+    {
+      "epoch": 0.36314727639542704,
+      "grad_norm": 3.750243663787842,
+      "learning_rate": 0.0001467085934109158,
+      "loss": 0.3169,
+      "step": 135
+    },
+    {
+      "epoch": 0.36583725622057833,
+      "grad_norm": 4.545015811920166,
+      "learning_rate": 0.00014593948674756417,
+      "loss": 0.5511,
+      "step": 136
+    },
+    {
+      "epoch": 0.3685272360457297,
+      "grad_norm": 5.990297794342041,
+      "learning_rate": 0.0001451669201645298,
+      "loss": 0.766,
+      "step": 137
+    },
+    {
+      "epoch": 0.37121721587088097,
+      "grad_norm": 3.692354679107666,
+      "learning_rate": 0.00014439095184745024,
+      "loss": 0.4151,
+      "step": 138
+    },
+    {
+      "epoch": 0.37390719569603226,
+      "grad_norm": 3.4247729778289795,
+      "learning_rate": 0.00014361164023816376,
+      "loss": 0.466,
+      "step": 139
+    },
+    {
+      "epoch": 0.3765971755211836,
+      "grad_norm": 3.962257146835327,
+      "learning_rate": 0.00014282904403030772,
+      "loss": 0.4263,
+      "step": 140
+    },
+    {
+      "epoch": 0.3792871553463349,
+      "grad_norm": 5.7197771072387695,
+      "learning_rate": 0.00014204322216489814,
+      "loss": 0.4988,
+      "step": 141
+    },
+    {
+      "epoch": 0.38197713517148624,
+      "grad_norm": 5.587864398956299,
+      "learning_rate": 0.00014125423382589048,
+      "loss": 0.6946,
+      "step": 142
+    },
+    {
+      "epoch": 0.3846671149966375,
+      "grad_norm": 11.981307029724121,
+      "learning_rate": 0.00014046213843572236,
+      "loss": 0.7456,
+      "step": 143
+    },
+    {
+      "epoch": 0.3873570948217888,
+      "grad_norm": 6.747979164123535,
+      "learning_rate": 0.00013966699565083802,
+      "loss": 1.2804,
+      "step": 144
+    },
+    {
+      "epoch": 0.39004707464694016,
+      "grad_norm": 4.663575649261475,
+      "learning_rate": 0.0001388688653571954,
+      "loss": 0.5548,
+      "step": 145
+    },
+    {
+      "epoch": 0.39273705447209145,
+      "grad_norm": 5.274585247039795,
+      "learning_rate": 0.00013806780766575588,
+      "loss": 0.6681,
+      "step": 146
+    },
+    {
+      "epoch": 0.3954270342972428,
+      "grad_norm": 13.038918495178223,
+      "learning_rate": 0.00013726388290795697,
+      "loss": 1.082,
+      "step": 147
+    },
+    {
+      "epoch": 0.3981170141223941,
+      "grad_norm": 7.035642623901367,
+      "learning_rate": 0.00013645715163116846,
+      "loss": 0.3975,
+      "step": 148
+    },
+    {
+      "epoch": 0.4008069939475454,
+      "grad_norm": 5.065128326416016,
+      "learning_rate": 0.00013564767459413237,
+      "loss": 0.2747,
+      "step": 149
+    },
+    {
+      "epoch": 0.4034969737726967,
+      "grad_norm": 4.475830554962158,
+      "learning_rate": 0.0001348355127623869,
+      "loss": 0.2169,
+      "step": 150
+    },
+    {
+      "epoch": 0.406186953597848,
+      "grad_norm": 4.0652031898498535,
+      "learning_rate": 0.00013402072730367475,
+      "loss": 1.7546,
+      "step": 151
+    },
+    {
+      "epoch": 0.4088769334229993,
+      "grad_norm": 4.62870454788208,
+      "learning_rate": 0.0001332033795833364,
+      "loss": 1.5081,
+      "step": 152
+    },
+    {
+      "epoch": 0.41156691324815065,
+      "grad_norm": 3.8758082389831543,
+      "learning_rate": 0.0001323835311596884,
+      "loss": 1.371,
+      "step": 153
+    },
+    {
+      "epoch": 0.41425689307330194,
+      "grad_norm": 4.078228950500488,
+      "learning_rate": 0.00013156124377938699,
+      "loss": 1.5507,
+      "step": 154
+    },
+    {
+      "epoch": 0.4169468728984533,
+      "grad_norm": 3.6525630950927734,
+      "learning_rate": 0.0001307365793727778,
+      "loss": 1.1093,
+      "step": 155
+    },
+    {
+      "epoch": 0.41963685272360457,
+      "grad_norm": 4.3088202476501465,
+      "learning_rate": 0.00012990960004923154,
+      "loss": 1.6154,
+      "step": 156
+    },
+    {
+      "epoch": 0.42232683254875586,
+      "grad_norm": 4.335425853729248,
+      "learning_rate": 0.00012908036809246623,
+      "loss": 1.4037,
+      "step": 157
+    },
+    {
+      "epoch": 0.4250168123739072,
+      "grad_norm": 3.7850985527038574,
+      "learning_rate": 0.00012824894595585637,
+      "loss": 1.1471,
+      "step": 158
+    },
+    {
+      "epoch": 0.4277067921990585,
+      "grad_norm": 4.085525035858154,
+      "learning_rate": 0.00012741539625772918,
+      "loss": 1.2586,
+      "step": 159
+    },
+    {
+      "epoch": 0.43039677202420984,
+      "grad_norm": 3.4970481395721436,
+      "learning_rate": 0.0001265797817766486,
+      "loss": 1.1133,
+      "step": 160
+    },
+    {
+      "epoch": 0.43308675184936113,
+      "grad_norm": 4.015367031097412,
+      "learning_rate": 0.0001257421654466872,
+      "loss": 0.71,
+      "step": 161
+    },
+    {
+      "epoch": 0.4357767316745124,
+      "grad_norm": 3.805530071258545,
+      "learning_rate": 0.00012490261035268612,
+      "loss": 1.4369,
+      "step": 162
+    },
+    {
+      "epoch": 0.43846671149966376,
+      "grad_norm": 4.442086696624756,
+      "learning_rate": 0.00012406117972550414,
+      "loss": 1.1577,
+      "step": 163
+    },
+    {
+      "epoch": 0.44115669132481505,
+      "grad_norm": 3.171997308731079,
+      "learning_rate": 0.00012321793693725509,
+      "loss": 0.667,
+      "step": 164
+    },
+    {
+      "epoch": 0.4438466711499664,
+      "grad_norm": 4.185075759887695,
+      "learning_rate": 0.0001223729454965354,
+      "loss": 0.7278,
+      "step": 165
+    },
+    {
+      "epoch": 0.4465366509751177,
+      "grad_norm": 3.8975086212158203,
+      "learning_rate": 0.00012152626904364067,
+      "loss": 0.9939,
+      "step": 166
+    },
+    {
+      "epoch": 0.449226630800269,
+      "grad_norm": 3.1474146842956543,
+      "learning_rate": 0.00012067797134577275,
+      "loss": 0.7392,
+      "step": 167
+    },
+    {
+      "epoch": 0.4519166106254203,
+      "grad_norm": 3.5632522106170654,
+      "learning_rate": 0.00011982811629223709,
+      "loss": 0.8636,
+      "step": 168
+    },
+    {
+      "epoch": 0.4546065904505716,
+      "grad_norm": 2.6525533199310303,
+      "learning_rate": 0.00011897676788963101,
+      "loss": 0.3818,
+      "step": 169
+    },
+    {
+      "epoch": 0.45729657027572296,
+      "grad_norm": 3.889469861984253,
+      "learning_rate": 0.0001181239902570229,
+      "loss": 0.5985,
+      "step": 170
+    },
+    {
+      "epoch": 0.45998655010087425,
+      "grad_norm": 3.6286370754241943,
+      "learning_rate": 0.00011726984762112328,
+      "loss": 0.8639,
+      "step": 171
+    },
+    {
+      "epoch": 0.46267652992602554,
+      "grad_norm": 2.5282163619995117,
+      "learning_rate": 0.0001164144043114475,
+      "loss": 0.3303,
+      "step": 172
+    },
+    {
+      "epoch": 0.4653665097511769,
+      "grad_norm": 4.00683069229126,
+      "learning_rate": 0.00011555772475547084,
+      "loss": 0.414,
+      "step": 173
+    },
+    {
+      "epoch": 0.46805648957632817,
+      "grad_norm": 5.255921363830566,
+      "learning_rate": 0.00011469987347377602,
+      "loss": 0.8622,
+      "step": 174
+    },
+    {
+      "epoch": 0.47074646940147946,
+      "grad_norm": 4.201918601989746,
+      "learning_rate": 0.00011384091507519403,
+      "loss": 0.8862,
+      "step": 175
+    },
+    {
+      "epoch": 0.4734364492266308,
+      "grad_norm": 4.199880599975586,
+      "learning_rate": 0.00011298091425193806,
+      "loss": 0.4554,
+      "step": 176
+    },
+    {
+      "epoch": 0.4761264290517821,
+      "grad_norm": 3.6669838428497314,
+      "learning_rate": 0.00011211993577473121,
+      "loss": 0.343,
+      "step": 177
+    },
+    {
+      "epoch": 0.47881640887693344,
+      "grad_norm": 4.186169147491455,
+      "learning_rate": 0.00011125804448792831,
+      "loss": 0.8039,
+      "step": 178
+    },
+    {
+      "epoch": 0.48150638870208473,
+      "grad_norm": 4.209519386291504,
+      "learning_rate": 0.00011039530530463218,
+      "loss": 0.3221,
+      "step": 179
+    },
+    {
+      "epoch": 0.484196368527236,
+      "grad_norm": 2.875875234603882,
+      "learning_rate": 0.00010953178320180475,
+      "loss": 0.2874,
+      "step": 180
+    },
+    {
+      "epoch": 0.48688634835238737,
+      "grad_norm": 4.24071741104126,
+      "learning_rate": 0.00010866754321537338,
+      "loss": 0.5502,
+      "step": 181
+    },
+    {
+      "epoch": 0.48957632817753866,
+      "grad_norm": 4.230165481567383,
+      "learning_rate": 0.0001078026504353325,
+      "loss": 0.5396,
+      "step": 182
+    },
+    {
+      "epoch": 0.49226630800269,
+      "grad_norm": 4.387772560119629,
+      "learning_rate": 0.0001069371700008416,
+      "loss": 0.5987,
+      "step": 183
+    },
+    {
+      "epoch": 0.4949562878278413,
+      "grad_norm": 4.988356113433838,
+      "learning_rate": 0.00010607116709531918,
+      "loss": 0.6046,
+      "step": 184
+    },
+    {
+      "epoch": 0.4976462676529926,
+      "grad_norm": 4.388515472412109,
+      "learning_rate": 0.00010520470694153353,
+      "loss": 0.595,
+      "step": 185
+    },
+    {
+      "epoch": 0.5003362474781439,
+      "grad_norm": 4.310067653656006,
+      "learning_rate": 0.00010433785479669038,
+      "loss": 0.5557,
+      "step": 186
+    },
+    {
+      "epoch": 0.5003362474781439,
+      "eval_loss": 0.8278390765190125,
+      "eval_runtime": 10.698,
+      "eval_samples_per_second": 14.676,
+      "eval_steps_per_second": 7.385,
+      "step": 186
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 6.05894718038999e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null