{
"best_metric": 0.9818174242973328,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.020891001201232568,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00010445500600616284,
"grad_norm": 1.0160599946975708,
"learning_rate": 1.007e-05,
"loss": 0.9026,
"step": 1
},
{
"epoch": 0.00010445500600616284,
"eval_loss": 1.3163843154907227,
"eval_runtime": 118.417,
"eval_samples_per_second": 34.041,
"eval_steps_per_second": 8.512,
"step": 1
},
{
"epoch": 0.00020891001201232568,
"grad_norm": 1.103493571281433,
"learning_rate": 2.014e-05,
"loss": 0.9891,
"step": 2
},
{
"epoch": 0.0003133650180184885,
"grad_norm": 1.031664490699768,
"learning_rate": 3.0209999999999997e-05,
"loss": 1.0485,
"step": 3
},
{
"epoch": 0.00041782002402465136,
"grad_norm": 0.985599160194397,
"learning_rate": 4.028e-05,
"loss": 1.1374,
"step": 4
},
{
"epoch": 0.0005222750300308142,
"grad_norm": 0.8242583274841309,
"learning_rate": 5.035e-05,
"loss": 1.0288,
"step": 5
},
{
"epoch": 0.000626730036036977,
"grad_norm": 0.8407261967658997,
"learning_rate": 6.0419999999999994e-05,
"loss": 1.0153,
"step": 6
},
{
"epoch": 0.0007311850420431399,
"grad_norm": 1.0834583044052124,
"learning_rate": 7.049e-05,
"loss": 0.9715,
"step": 7
},
{
"epoch": 0.0008356400480493027,
"grad_norm": 1.1035923957824707,
"learning_rate": 8.056e-05,
"loss": 1.0755,
"step": 8
},
{
"epoch": 0.0009400950540554656,
"grad_norm": 0.9618456363677979,
"learning_rate": 9.062999999999999e-05,
"loss": 0.9921,
"step": 9
},
{
"epoch": 0.0010445500600616284,
"grad_norm": 1.0877008438110352,
"learning_rate": 0.0001007,
"loss": 1.1217,
"step": 10
},
{
"epoch": 0.0011490050660677912,
"grad_norm": 0.9684674739837646,
"learning_rate": 0.00010017,
"loss": 1.0438,
"step": 11
},
{
"epoch": 0.001253460072073954,
"grad_norm": 0.8440430164337158,
"learning_rate": 9.963999999999999e-05,
"loss": 1.0471,
"step": 12
},
{
"epoch": 0.001357915078080117,
"grad_norm": 0.7783815860748291,
"learning_rate": 9.910999999999999e-05,
"loss": 0.9841,
"step": 13
},
{
"epoch": 0.0014623700840862798,
"grad_norm": 0.8125823140144348,
"learning_rate": 9.858e-05,
"loss": 1.0053,
"step": 14
},
{
"epoch": 0.0015668250900924426,
"grad_norm": 0.8336841464042664,
"learning_rate": 9.805e-05,
"loss": 0.9672,
"step": 15
},
{
"epoch": 0.0016712800960986054,
"grad_norm": 0.9024210572242737,
"learning_rate": 9.752e-05,
"loss": 1.1344,
"step": 16
},
{
"epoch": 0.0017757351021047683,
"grad_norm": 0.9698878526687622,
"learning_rate": 9.698999999999999e-05,
"loss": 1.1192,
"step": 17
},
{
"epoch": 0.0018801901081109311,
"grad_norm": 0.9430877566337585,
"learning_rate": 9.646e-05,
"loss": 1.0096,
"step": 18
},
{
"epoch": 0.001984645114117094,
"grad_norm": 0.9249778985977173,
"learning_rate": 9.593e-05,
"loss": 1.0335,
"step": 19
},
{
"epoch": 0.002089100120123257,
"grad_norm": 1.2025758028030396,
"learning_rate": 9.539999999999999e-05,
"loss": 1.2275,
"step": 20
},
{
"epoch": 0.0021935551261294197,
"grad_norm": 1.0294830799102783,
"learning_rate": 9.487e-05,
"loss": 1.1516,
"step": 21
},
{
"epoch": 0.0022980101321355825,
"grad_norm": 1.0079749822616577,
"learning_rate": 9.434e-05,
"loss": 1.0904,
"step": 22
},
{
"epoch": 0.0024024651381417453,
"grad_norm": 1.0860754251480103,
"learning_rate": 9.381e-05,
"loss": 1.1063,
"step": 23
},
{
"epoch": 0.002506920144147908,
"grad_norm": 1.0929911136627197,
"learning_rate": 9.327999999999999e-05,
"loss": 1.213,
"step": 24
},
{
"epoch": 0.002611375150154071,
"grad_norm": 1.0362168550491333,
"learning_rate": 9.274999999999999e-05,
"loss": 1.105,
"step": 25
},
{
"epoch": 0.002715830156160234,
"grad_norm": 1.035015344619751,
"learning_rate": 9.222e-05,
"loss": 1.0202,
"step": 26
},
{
"epoch": 0.0028202851621663967,
"grad_norm": 1.1152434349060059,
"learning_rate": 9.169e-05,
"loss": 1.0801,
"step": 27
},
{
"epoch": 0.0029247401681725595,
"grad_norm": 1.1516571044921875,
"learning_rate": 9.116e-05,
"loss": 1.0891,
"step": 28
},
{
"epoch": 0.0030291951741787224,
"grad_norm": 1.0675947666168213,
"learning_rate": 9.062999999999999e-05,
"loss": 1.0438,
"step": 29
},
{
"epoch": 0.0031336501801848852,
"grad_norm": 1.0566611289978027,
"learning_rate": 9.01e-05,
"loss": 1.0598,
"step": 30
},
{
"epoch": 0.003238105186191048,
"grad_norm": 1.0381075143814087,
"learning_rate": 8.957e-05,
"loss": 0.9869,
"step": 31
},
{
"epoch": 0.003342560192197211,
"grad_norm": 1.0372414588928223,
"learning_rate": 8.903999999999999e-05,
"loss": 1.0221,
"step": 32
},
{
"epoch": 0.0034470151982033737,
"grad_norm": 1.1146482229232788,
"learning_rate": 8.850999999999999e-05,
"loss": 1.1111,
"step": 33
},
{
"epoch": 0.0035514702042095366,
"grad_norm": 1.117113471031189,
"learning_rate": 8.798e-05,
"loss": 1.0337,
"step": 34
},
{
"epoch": 0.0036559252102156994,
"grad_norm": 1.1380937099456787,
"learning_rate": 8.745e-05,
"loss": 1.0539,
"step": 35
},
{
"epoch": 0.0037603802162218623,
"grad_norm": 1.125671148300171,
"learning_rate": 8.692e-05,
"loss": 1.2324,
"step": 36
},
{
"epoch": 0.003864835222228025,
"grad_norm": 1.178640604019165,
"learning_rate": 8.638999999999999e-05,
"loss": 1.0558,
"step": 37
},
{
"epoch": 0.003969290228234188,
"grad_norm": 1.1600550413131714,
"learning_rate": 8.586e-05,
"loss": 1.1477,
"step": 38
},
{
"epoch": 0.004073745234240351,
"grad_norm": 1.1267294883728027,
"learning_rate": 8.533e-05,
"loss": 1.1102,
"step": 39
},
{
"epoch": 0.004178200240246514,
"grad_norm": 1.148314118385315,
"learning_rate": 8.479999999999999e-05,
"loss": 1.1249,
"step": 40
},
{
"epoch": 0.004282655246252677,
"grad_norm": 1.7030447721481323,
"learning_rate": 8.427e-05,
"loss": 1.3342,
"step": 41
},
{
"epoch": 0.004387110252258839,
"grad_norm": 1.1302878856658936,
"learning_rate": 8.374e-05,
"loss": 1.0549,
"step": 42
},
{
"epoch": 0.004491565258265003,
"grad_norm": 1.2263422012329102,
"learning_rate": 8.321e-05,
"loss": 1.049,
"step": 43
},
{
"epoch": 0.004596020264271165,
"grad_norm": 1.1995285749435425,
"learning_rate": 8.268e-05,
"loss": 1.0055,
"step": 44
},
{
"epoch": 0.004700475270277328,
"grad_norm": 1.2773244380950928,
"learning_rate": 8.214999999999999e-05,
"loss": 1.0983,
"step": 45
},
{
"epoch": 0.004804930276283491,
"grad_norm": 1.3492332696914673,
"learning_rate": 8.162e-05,
"loss": 1.181,
"step": 46
},
{
"epoch": 0.004909385282289654,
"grad_norm": 1.35885751247406,
"learning_rate": 8.108999999999998e-05,
"loss": 1.0918,
"step": 47
},
{
"epoch": 0.005013840288295816,
"grad_norm": 1.250424861907959,
"learning_rate": 8.056e-05,
"loss": 0.9498,
"step": 48
},
{
"epoch": 0.00511829529430198,
"grad_norm": 1.5273371934890747,
"learning_rate": 8.003e-05,
"loss": 1.0708,
"step": 49
},
{
"epoch": 0.005222750300308142,
"grad_norm": 1.8503930568695068,
"learning_rate": 7.95e-05,
"loss": 1.3104,
"step": 50
},
{
"epoch": 0.005222750300308142,
"eval_loss": 1.0883654356002808,
"eval_runtime": 120.0706,
"eval_samples_per_second": 33.572,
"eval_steps_per_second": 8.395,
"step": 50
},
{
"epoch": 0.005327205306314305,
"grad_norm": 0.9896413087844849,
"learning_rate": 7.897e-05,
"loss": 0.9202,
"step": 51
},
{
"epoch": 0.005431660312320468,
"grad_norm": 0.840713381767273,
"learning_rate": 7.843999999999999e-05,
"loss": 0.8256,
"step": 52
},
{
"epoch": 0.005536115318326631,
"grad_norm": 0.7416518330574036,
"learning_rate": 7.790999999999999e-05,
"loss": 0.9071,
"step": 53
},
{
"epoch": 0.005640570324332793,
"grad_norm": 0.7955224514007568,
"learning_rate": 7.738e-05,
"loss": 0.9046,
"step": 54
},
{
"epoch": 0.005745025330338957,
"grad_norm": 0.7123813629150391,
"learning_rate": 7.685e-05,
"loss": 1.0314,
"step": 55
},
{
"epoch": 0.005849480336345119,
"grad_norm": 0.683822751045227,
"learning_rate": 7.632e-05,
"loss": 0.9354,
"step": 56
},
{
"epoch": 0.005953935342351282,
"grad_norm": 0.6209269165992737,
"learning_rate": 7.578999999999999e-05,
"loss": 0.8914,
"step": 57
},
{
"epoch": 0.006058390348357445,
"grad_norm": 0.6532514691352844,
"learning_rate": 7.526e-05,
"loss": 1.0181,
"step": 58
},
{
"epoch": 0.006162845354363608,
"grad_norm": 0.6706631183624268,
"learning_rate": 7.473e-05,
"loss": 0.9697,
"step": 59
},
{
"epoch": 0.0062673003603697704,
"grad_norm": 0.6528756022453308,
"learning_rate": 7.419999999999999e-05,
"loss": 0.9479,
"step": 60
},
{
"epoch": 0.006371755366375934,
"grad_norm": 0.7368625998497009,
"learning_rate": 7.367e-05,
"loss": 0.9429,
"step": 61
},
{
"epoch": 0.006476210372382096,
"grad_norm": 0.7886870503425598,
"learning_rate": 7.314e-05,
"loss": 1.0517,
"step": 62
},
{
"epoch": 0.006580665378388259,
"grad_norm": 0.7552511692047119,
"learning_rate": 7.261e-05,
"loss": 0.997,
"step": 63
},
{
"epoch": 0.006685120384394422,
"grad_norm": 0.7769532799720764,
"learning_rate": 7.208e-05,
"loss": 0.9554,
"step": 64
},
{
"epoch": 0.006789575390400585,
"grad_norm": 0.8453531265258789,
"learning_rate": 7.154999999999999e-05,
"loss": 1.0108,
"step": 65
},
{
"epoch": 0.0068940303964067475,
"grad_norm": 0.8387408256530762,
"learning_rate": 7.102e-05,
"loss": 0.9538,
"step": 66
},
{
"epoch": 0.006998485402412911,
"grad_norm": 0.8454548120498657,
"learning_rate": 7.049e-05,
"loss": 0.9305,
"step": 67
},
{
"epoch": 0.007102940408419073,
"grad_norm": 0.9299591779708862,
"learning_rate": 6.996e-05,
"loss": 1.1564,
"step": 68
},
{
"epoch": 0.0072073954144252364,
"grad_norm": 0.863427460193634,
"learning_rate": 6.943e-05,
"loss": 0.9635,
"step": 69
},
{
"epoch": 0.007311850420431399,
"grad_norm": 0.9572794437408447,
"learning_rate": 6.89e-05,
"loss": 1.1278,
"step": 70
},
{
"epoch": 0.007416305426437562,
"grad_norm": 0.9274687767028809,
"learning_rate": 6.837e-05,
"loss": 1.0153,
"step": 71
},
{
"epoch": 0.0075207604324437245,
"grad_norm": 0.8995688557624817,
"learning_rate": 6.784e-05,
"loss": 1.0095,
"step": 72
},
{
"epoch": 0.007625215438449888,
"grad_norm": 0.9216225743293762,
"learning_rate": 6.730999999999999e-05,
"loss": 1.004,
"step": 73
},
{
"epoch": 0.00772967044445605,
"grad_norm": 0.8909146785736084,
"learning_rate": 6.678e-05,
"loss": 0.9789,
"step": 74
},
{
"epoch": 0.007834125450462213,
"grad_norm": 0.8936184048652649,
"learning_rate": 6.625e-05,
"loss": 0.9622,
"step": 75
},
{
"epoch": 0.007938580456468376,
"grad_norm": 0.9004867672920227,
"learning_rate": 6.572e-05,
"loss": 0.9201,
"step": 76
},
{
"epoch": 0.008043035462474538,
"grad_norm": 1.025423288345337,
"learning_rate": 6.519e-05,
"loss": 1.1964,
"step": 77
},
{
"epoch": 0.008147490468480702,
"grad_norm": 1.002456784248352,
"learning_rate": 6.466e-05,
"loss": 1.1274,
"step": 78
},
{
"epoch": 0.008251945474486865,
"grad_norm": 0.967106819152832,
"learning_rate": 6.413e-05,
"loss": 0.9247,
"step": 79
},
{
"epoch": 0.008356400480493027,
"grad_norm": 1.0033572912216187,
"learning_rate": 6.359999999999999e-05,
"loss": 1.0381,
"step": 80
},
{
"epoch": 0.00846085548649919,
"grad_norm": 0.9540228843688965,
"learning_rate": 6.306999999999999e-05,
"loss": 0.8963,
"step": 81
},
{
"epoch": 0.008565310492505354,
"grad_norm": 1.1677919626235962,
"learning_rate": 6.254000000000001e-05,
"loss": 1.1372,
"step": 82
},
{
"epoch": 0.008669765498511516,
"grad_norm": 1.0950039625167847,
"learning_rate": 6.201e-05,
"loss": 1.052,
"step": 83
},
{
"epoch": 0.008774220504517679,
"grad_norm": 1.028153657913208,
"learning_rate": 6.148e-05,
"loss": 0.9451,
"step": 84
},
{
"epoch": 0.008878675510523841,
"grad_norm": 1.1274486780166626,
"learning_rate": 6.095e-05,
"loss": 1.0042,
"step": 85
},
{
"epoch": 0.008983130516530005,
"grad_norm": 1.1423695087432861,
"learning_rate": 6.0419999999999994e-05,
"loss": 1.094,
"step": 86
},
{
"epoch": 0.009087585522536168,
"grad_norm": 1.1429065465927124,
"learning_rate": 5.988999999999999e-05,
"loss": 0.9644,
"step": 87
},
{
"epoch": 0.00919204052854233,
"grad_norm": 1.2021771669387817,
"learning_rate": 5.9359999999999994e-05,
"loss": 1.1807,
"step": 88
},
{
"epoch": 0.009296495534548492,
"grad_norm": 1.1174052953720093,
"learning_rate": 5.8830000000000004e-05,
"loss": 1.029,
"step": 89
},
{
"epoch": 0.009400950540554657,
"grad_norm": 1.2131744623184204,
"learning_rate": 5.83e-05,
"loss": 1.2473,
"step": 90
},
{
"epoch": 0.009505405546560819,
"grad_norm": 1.1659351587295532,
"learning_rate": 5.777e-05,
"loss": 1.1075,
"step": 91
},
{
"epoch": 0.009609860552566981,
"grad_norm": 1.155617594718933,
"learning_rate": 5.7239999999999994e-05,
"loss": 0.9338,
"step": 92
},
{
"epoch": 0.009714315558573145,
"grad_norm": 1.1732633113861084,
"learning_rate": 5.671e-05,
"loss": 1.1125,
"step": 93
},
{
"epoch": 0.009818770564579308,
"grad_norm": 1.1406437158584595,
"learning_rate": 5.6179999999999994e-05,
"loss": 1.0323,
"step": 94
},
{
"epoch": 0.00992322557058547,
"grad_norm": 1.25766122341156,
"learning_rate": 5.5650000000000004e-05,
"loss": 1.153,
"step": 95
},
{
"epoch": 0.010027680576591633,
"grad_norm": 1.3154778480529785,
"learning_rate": 5.512e-05,
"loss": 1.1242,
"step": 96
},
{
"epoch": 0.010132135582597797,
"grad_norm": 1.355385184288025,
"learning_rate": 5.459e-05,
"loss": 1.1835,
"step": 97
},
{
"epoch": 0.01023659058860396,
"grad_norm": 1.3438916206359863,
"learning_rate": 5.406e-05,
"loss": 1.0795,
"step": 98
},
{
"epoch": 0.010341045594610122,
"grad_norm": 1.2769006490707397,
"learning_rate": 5.353e-05,
"loss": 0.9322,
"step": 99
},
{
"epoch": 0.010445500600616284,
"grad_norm": 1.896607518196106,
"learning_rate": 5.2999999999999994e-05,
"loss": 1.1321,
"step": 100
},
{
"epoch": 0.010445500600616284,
"eval_loss": 1.0444438457489014,
"eval_runtime": 118.2357,
"eval_samples_per_second": 34.093,
"eval_steps_per_second": 8.525,
"step": 100
},
{
"epoch": 0.010549955606622448,
"grad_norm": 0.6772998571395874,
"learning_rate": 5.246999999999999e-05,
"loss": 0.8867,
"step": 101
},
{
"epoch": 0.01065441061262861,
"grad_norm": 0.6309265494346619,
"learning_rate": 5.194e-05,
"loss": 0.9269,
"step": 102
},
{
"epoch": 0.010758865618634773,
"grad_norm": 0.6723343729972839,
"learning_rate": 5.141e-05,
"loss": 0.9639,
"step": 103
},
{
"epoch": 0.010863320624640935,
"grad_norm": 0.6599306464195251,
"learning_rate": 5.088e-05,
"loss": 0.9483,
"step": 104
},
{
"epoch": 0.0109677756306471,
"grad_norm": 0.5985355973243713,
"learning_rate": 5.035e-05,
"loss": 0.9826,
"step": 105
},
{
"epoch": 0.011072230636653262,
"grad_norm": 0.6056426763534546,
"learning_rate": 4.9819999999999994e-05,
"loss": 0.87,
"step": 106
},
{
"epoch": 0.011176685642659424,
"grad_norm": 0.6577640771865845,
"learning_rate": 4.929e-05,
"loss": 0.8896,
"step": 107
},
{
"epoch": 0.011281140648665587,
"grad_norm": 0.6197834014892578,
"learning_rate": 4.876e-05,
"loss": 0.9857,
"step": 108
},
{
"epoch": 0.011385595654671751,
"grad_norm": 0.6561485528945923,
"learning_rate": 4.823e-05,
"loss": 1.0036,
"step": 109
},
{
"epoch": 0.011490050660677913,
"grad_norm": 0.6277485489845276,
"learning_rate": 4.7699999999999994e-05,
"loss": 0.9196,
"step": 110
},
{
"epoch": 0.011594505666684076,
"grad_norm": 0.6193849444389343,
"learning_rate": 4.717e-05,
"loss": 0.8803,
"step": 111
},
{
"epoch": 0.011698960672690238,
"grad_norm": 0.64503014087677,
"learning_rate": 4.6639999999999994e-05,
"loss": 1.0019,
"step": 112
},
{
"epoch": 0.011803415678696402,
"grad_norm": 0.686529278755188,
"learning_rate": 4.611e-05,
"loss": 0.9412,
"step": 113
},
{
"epoch": 0.011907870684702565,
"grad_norm": 0.7062692642211914,
"learning_rate": 4.558e-05,
"loss": 1.0139,
"step": 114
},
{
"epoch": 0.012012325690708727,
"grad_norm": 0.7408269643783569,
"learning_rate": 4.505e-05,
"loss": 0.9398,
"step": 115
},
{
"epoch": 0.01211678069671489,
"grad_norm": 0.8046457767486572,
"learning_rate": 4.4519999999999994e-05,
"loss": 1.0817,
"step": 116
},
{
"epoch": 0.012221235702721054,
"grad_norm": 0.8560929894447327,
"learning_rate": 4.399e-05,
"loss": 0.9393,
"step": 117
},
{
"epoch": 0.012325690708727216,
"grad_norm": 0.8270806074142456,
"learning_rate": 4.346e-05,
"loss": 1.029,
"step": 118
},
{
"epoch": 0.012430145714733378,
"grad_norm": 0.8439892530441284,
"learning_rate": 4.293e-05,
"loss": 1.0061,
"step": 119
},
{
"epoch": 0.012534600720739541,
"grad_norm": 0.9163686037063599,
"learning_rate": 4.2399999999999994e-05,
"loss": 1.1759,
"step": 120
},
{
"epoch": 0.012639055726745705,
"grad_norm": 0.9552029371261597,
"learning_rate": 4.187e-05,
"loss": 0.9827,
"step": 121
},
{
"epoch": 0.012743510732751867,
"grad_norm": 0.9216101169586182,
"learning_rate": 4.134e-05,
"loss": 1.0798,
"step": 122
},
{
"epoch": 0.01284796573875803,
"grad_norm": 0.9589611887931824,
"learning_rate": 4.081e-05,
"loss": 1.077,
"step": 123
},
{
"epoch": 0.012952420744764192,
"grad_norm": 0.9211677312850952,
"learning_rate": 4.028e-05,
"loss": 1.0484,
"step": 124
},
{
"epoch": 0.013056875750770356,
"grad_norm": 0.8966543078422546,
"learning_rate": 3.975e-05,
"loss": 0.9896,
"step": 125
},
{
"epoch": 0.013161330756776519,
"grad_norm": 0.9282961487770081,
"learning_rate": 3.9219999999999994e-05,
"loss": 1.0094,
"step": 126
},
{
"epoch": 0.013265785762782681,
"grad_norm": 1.004485011100769,
"learning_rate": 3.869e-05,
"loss": 1.1737,
"step": 127
},
{
"epoch": 0.013370240768788844,
"grad_norm": 0.9591395854949951,
"learning_rate": 3.816e-05,
"loss": 1.0858,
"step": 128
},
{
"epoch": 0.013474695774795008,
"grad_norm": 0.9005763530731201,
"learning_rate": 3.763e-05,
"loss": 1.0078,
"step": 129
},
{
"epoch": 0.01357915078080117,
"grad_norm": 0.9479995965957642,
"learning_rate": 3.7099999999999994e-05,
"loss": 1.0498,
"step": 130
},
{
"epoch": 0.013683605786807333,
"grad_norm": 1.0200867652893066,
"learning_rate": 3.657e-05,
"loss": 1.0824,
"step": 131
},
{
"epoch": 0.013788060792813495,
"grad_norm": 0.9186935424804688,
"learning_rate": 3.604e-05,
"loss": 0.9936,
"step": 132
},
{
"epoch": 0.013892515798819659,
"grad_norm": 0.9905325770378113,
"learning_rate": 3.551e-05,
"loss": 1.0225,
"step": 133
},
{
"epoch": 0.013996970804825822,
"grad_norm": 1.0167120695114136,
"learning_rate": 3.498e-05,
"loss": 1.1188,
"step": 134
},
{
"epoch": 0.014101425810831984,
"grad_norm": 0.9497846961021423,
"learning_rate": 3.445e-05,
"loss": 0.9271,
"step": 135
},
{
"epoch": 0.014205880816838146,
"grad_norm": 1.0277209281921387,
"learning_rate": 3.392e-05,
"loss": 1.0421,
"step": 136
},
{
"epoch": 0.01431033582284431,
"grad_norm": 0.9843363761901855,
"learning_rate": 3.339e-05,
"loss": 0.982,
"step": 137
},
{
"epoch": 0.014414790828850473,
"grad_norm": 1.0494071245193481,
"learning_rate": 3.286e-05,
"loss": 0.9804,
"step": 138
},
{
"epoch": 0.014519245834856635,
"grad_norm": 1.0694974660873413,
"learning_rate": 3.233e-05,
"loss": 0.9624,
"step": 139
},
{
"epoch": 0.014623700840862798,
"grad_norm": 1.0880765914916992,
"learning_rate": 3.1799999999999994e-05,
"loss": 0.9901,
"step": 140
},
{
"epoch": 0.014728155846868962,
"grad_norm": 1.053983211517334,
"learning_rate": 3.1270000000000004e-05,
"loss": 0.9555,
"step": 141
},
{
"epoch": 0.014832610852875124,
"grad_norm": 1.0926487445831299,
"learning_rate": 3.074e-05,
"loss": 1.03,
"step": 142
},
{
"epoch": 0.014937065858881287,
"grad_norm": 1.1903960704803467,
"learning_rate": 3.0209999999999997e-05,
"loss": 1.0765,
"step": 143
},
{
"epoch": 0.015041520864887449,
"grad_norm": 1.2311145067214966,
"learning_rate": 2.9679999999999997e-05,
"loss": 1.0678,
"step": 144
},
{
"epoch": 0.015145975870893613,
"grad_norm": 1.1940836906433105,
"learning_rate": 2.915e-05,
"loss": 1.0461,
"step": 145
},
{
"epoch": 0.015250430876899776,
"grad_norm": 1.228232979774475,
"learning_rate": 2.8619999999999997e-05,
"loss": 0.9819,
"step": 146
},
{
"epoch": 0.015354885882905938,
"grad_norm": 1.2038990259170532,
"learning_rate": 2.8089999999999997e-05,
"loss": 0.9445,
"step": 147
},
{
"epoch": 0.0154593408889121,
"grad_norm": 1.2821253538131714,
"learning_rate": 2.756e-05,
"loss": 1.1162,
"step": 148
},
{
"epoch": 0.015563795894918265,
"grad_norm": 1.437116265296936,
"learning_rate": 2.703e-05,
"loss": 1.0603,
"step": 149
},
{
"epoch": 0.015668250900924427,
"grad_norm": 1.6678568124771118,
"learning_rate": 2.6499999999999997e-05,
"loss": 1.0682,
"step": 150
},
{
"epoch": 0.015668250900924427,
"eval_loss": 0.9961364269256592,
"eval_runtime": 118.6077,
"eval_samples_per_second": 33.986,
"eval_steps_per_second": 8.499,
"step": 150
},
{
"epoch": 0.01577270590693059,
"grad_norm": 0.5151348114013672,
"learning_rate": 2.597e-05,
"loss": 0.7635,
"step": 151
},
{
"epoch": 0.015877160912936752,
"grad_norm": 0.5203879475593567,
"learning_rate": 2.544e-05,
"loss": 0.7112,
"step": 152
},
{
"epoch": 0.015981615918942916,
"grad_norm": 0.5102455019950867,
"learning_rate": 2.4909999999999997e-05,
"loss": 0.8134,
"step": 153
},
{
"epoch": 0.016086070924949077,
"grad_norm": 0.5462666153907776,
"learning_rate": 2.438e-05,
"loss": 0.925,
"step": 154
},
{
"epoch": 0.01619052593095524,
"grad_norm": 0.5957190990447998,
"learning_rate": 2.3849999999999997e-05,
"loss": 0.9079,
"step": 155
},
{
"epoch": 0.016294980936961405,
"grad_norm": 0.6015512347221375,
"learning_rate": 2.3319999999999997e-05,
"loss": 0.9956,
"step": 156
},
{
"epoch": 0.016399435942967566,
"grad_norm": 0.5997916460037231,
"learning_rate": 2.279e-05,
"loss": 0.9413,
"step": 157
},
{
"epoch": 0.01650389094897373,
"grad_norm": 0.5999729037284851,
"learning_rate": 2.2259999999999997e-05,
"loss": 0.8335,
"step": 158
},
{
"epoch": 0.016608345954979894,
"grad_norm": 0.6232542991638184,
"learning_rate": 2.173e-05,
"loss": 0.9134,
"step": 159
},
{
"epoch": 0.016712800960986054,
"grad_norm": 0.607313334941864,
"learning_rate": 2.1199999999999997e-05,
"loss": 0.874,
"step": 160
},
{
"epoch": 0.01681725596699222,
"grad_norm": 0.6412212252616882,
"learning_rate": 2.067e-05,
"loss": 0.9721,
"step": 161
},
{
"epoch": 0.01692171097299838,
"grad_norm": 0.650705099105835,
"learning_rate": 2.014e-05,
"loss": 0.9523,
"step": 162
},
{
"epoch": 0.017026165979004543,
"grad_norm": 0.6729899644851685,
"learning_rate": 1.9609999999999997e-05,
"loss": 0.9684,
"step": 163
},
{
"epoch": 0.017130620985010708,
"grad_norm": 0.6449539065361023,
"learning_rate": 1.908e-05,
"loss": 0.808,
"step": 164
},
{
"epoch": 0.01723507599101687,
"grad_norm": 0.6991842985153198,
"learning_rate": 1.8549999999999997e-05,
"loss": 0.9929,
"step": 165
},
{
"epoch": 0.017339530997023032,
"grad_norm": 0.7484295964241028,
"learning_rate": 1.802e-05,
"loss": 0.9746,
"step": 166
},
{
"epoch": 0.017443986003029197,
"grad_norm": 0.7161227464675903,
"learning_rate": 1.749e-05,
"loss": 0.9454,
"step": 167
},
{
"epoch": 0.017548441009035357,
"grad_norm": 0.7815462946891785,
"learning_rate": 1.696e-05,
"loss": 1.0301,
"step": 168
},
{
"epoch": 0.01765289601504152,
"grad_norm": 0.8647356033325195,
"learning_rate": 1.643e-05,
"loss": 1.0621,
"step": 169
},
{
"epoch": 0.017757351021047682,
"grad_norm": 0.9504815340042114,
"learning_rate": 1.5899999999999997e-05,
"loss": 1.0426,
"step": 170
},
{
"epoch": 0.017861806027053846,
"grad_norm": 0.8482909202575684,
"learning_rate": 1.537e-05,
"loss": 0.9898,
"step": 171
},
{
"epoch": 0.01796626103306001,
"grad_norm": 0.8360997438430786,
"learning_rate": 1.4839999999999999e-05,
"loss": 0.9783,
"step": 172
},
{
"epoch": 0.01807071603906617,
"grad_norm": 0.9085504412651062,
"learning_rate": 1.4309999999999999e-05,
"loss": 0.9865,
"step": 173
},
{
"epoch": 0.018175171045072335,
"grad_norm": 0.8988630771636963,
"learning_rate": 1.378e-05,
"loss": 1.0591,
"step": 174
},
{
"epoch": 0.0182796260510785,
"grad_norm": 0.8486796617507935,
"learning_rate": 1.3249999999999999e-05,
"loss": 0.9894,
"step": 175
},
{
"epoch": 0.01838408105708466,
"grad_norm": 0.8764381408691406,
"learning_rate": 1.272e-05,
"loss": 0.9253,
"step": 176
},
{
"epoch": 0.018488536063090824,
"grad_norm": 0.9448692798614502,
"learning_rate": 1.219e-05,
"loss": 1.0425,
"step": 177
},
{
"epoch": 0.018592991069096985,
"grad_norm": 0.9180240631103516,
"learning_rate": 1.1659999999999998e-05,
"loss": 0.9328,
"step": 178
},
{
"epoch": 0.01869744607510315,
"grad_norm": 0.9340706467628479,
"learning_rate": 1.1129999999999998e-05,
"loss": 1.0015,
"step": 179
},
{
"epoch": 0.018801901081109313,
"grad_norm": 0.8770861029624939,
"learning_rate": 1.0599999999999998e-05,
"loss": 0.9812,
"step": 180
},
{
"epoch": 0.018906356087115474,
"grad_norm": 0.9975367188453674,
"learning_rate": 1.007e-05,
"loss": 1.0984,
"step": 181
},
{
"epoch": 0.019010811093121638,
"grad_norm": 0.9696022868156433,
"learning_rate": 9.54e-06,
"loss": 0.9888,
"step": 182
},
{
"epoch": 0.019115266099127802,
"grad_norm": 0.8801543116569519,
"learning_rate": 9.01e-06,
"loss": 0.8672,
"step": 183
},
{
"epoch": 0.019219721105133963,
"grad_norm": 0.9794437885284424,
"learning_rate": 8.48e-06,
"loss": 1.0545,
"step": 184
},
{
"epoch": 0.019324176111140127,
"grad_norm": 0.9684680700302124,
"learning_rate": 7.949999999999998e-06,
"loss": 0.9275,
"step": 185
},
{
"epoch": 0.01942863111714629,
"grad_norm": 0.956508219242096,
"learning_rate": 7.419999999999999e-06,
"loss": 0.9679,
"step": 186
},
{
"epoch": 0.01953308612315245,
"grad_norm": 1.0241084098815918,
"learning_rate": 6.89e-06,
"loss": 1.0333,
"step": 187
},
{
"epoch": 0.019637541129158616,
"grad_norm": 1.13876211643219,
"learning_rate": 6.36e-06,
"loss": 1.189,
"step": 188
},
{
"epoch": 0.019741996135164776,
"grad_norm": 1.0502783060073853,
"learning_rate": 5.829999999999999e-06,
"loss": 1.0062,
"step": 189
},
{
"epoch": 0.01984645114117094,
"grad_norm": 1.0701584815979004,
"learning_rate": 5.299999999999999e-06,
"loss": 0.9934,
"step": 190
},
{
"epoch": 0.019950906147177105,
"grad_norm": 1.1496695280075073,
"learning_rate": 4.77e-06,
"loss": 1.0933,
"step": 191
},
{
"epoch": 0.020055361153183265,
"grad_norm": 1.1266313791275024,
"learning_rate": 4.24e-06,
"loss": 1.0909,
"step": 192
},
{
"epoch": 0.02015981615918943,
"grad_norm": 1.1178048849105835,
"learning_rate": 3.7099999999999996e-06,
"loss": 0.9263,
"step": 193
},
{
"epoch": 0.020264271165195594,
"grad_norm": 1.1649036407470703,
"learning_rate": 3.18e-06,
"loss": 1.0362,
"step": 194
},
{
"epoch": 0.020368726171201754,
"grad_norm": 1.1672587394714355,
"learning_rate": 2.6499999999999996e-06,
"loss": 1.0083,
"step": 195
},
{
"epoch": 0.02047318117720792,
"grad_norm": 1.1676815748214722,
"learning_rate": 2.12e-06,
"loss": 0.9924,
"step": 196
},
{
"epoch": 0.02057763618321408,
"grad_norm": 1.3110767602920532,
"learning_rate": 1.59e-06,
"loss": 0.9344,
"step": 197
},
{
"epoch": 0.020682091189220243,
"grad_norm": 1.4102957248687744,
"learning_rate": 1.06e-06,
"loss": 1.1094,
"step": 198
},
{
"epoch": 0.020786546195226407,
"grad_norm": 1.5473552942276,
"learning_rate": 5.3e-07,
"loss": 1.0472,
"step": 199
},
{
"epoch": 0.020891001201232568,
"grad_norm": 2.232775926589966,
"learning_rate": 0.0,
"loss": 1.3862,
"step": 200
},
{
"epoch": 0.020891001201232568,
"eval_loss": 0.9818174242973328,
"eval_runtime": 118.3422,
"eval_samples_per_second": 34.062,
"eval_steps_per_second": 8.518,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.06657392623616e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}