End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +780 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: apache-2.0
 base_model: Qwen/Qwen2.5-7B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: hp_ablations_qwen_scheduler_cosine_warmup0.10_dcftv1.2
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # hp_ablations_qwen_scheduler_cosine_warmup0.10_dcftv1.2
-This model is a fine-tuned version of [Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6357

 base_model: Qwen/Qwen2.5-7B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: hp_ablations_qwen_scheduler_cosine_warmup0.10_dcftv1.2
 # hp_ablations_qwen_scheduler_cosine_warmup0.10_dcftv1.2
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B) on the mlfoundations-dev/oh-dcft-v1.2_no-curation_gpt-4o-mini dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6357

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9956108266276518,
+    "eval_loss": 0.6357362270355225,
+    "eval_runtime": 342.5337,
+    "eval_samples_per_second": 26.885,
+    "eval_steps_per_second": 0.42,
+    "total_flos": 2144987064041472.0,
+    "train_loss": 0.6275725323899867,
+    "train_runtime": 54929.7845,
+    "train_samples_per_second": 9.555,
+    "train_steps_per_second": 0.019
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9956108266276518,
+    "eval_loss": 0.6357362270355225,
+    "eval_runtime": 342.5337,
+    "eval_samples_per_second": 26.885,
+    "eval_steps_per_second": 0.42
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9956108266276518,
+    "total_flos": 2144987064041472.0,
+    "train_loss": 0.6275725323899867,
+    "train_runtime": 54929.7845,
+    "train_samples_per_second": 9.555,
+    "train_steps_per_second": 0.019
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,780 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9956108266276518,
+  "eval_steps": 500,
+  "global_step": 1023,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.029261155815654718,
+      "grad_norm": 13.94059376383442,
+      "learning_rate": 4.854368932038835e-07,
+      "loss": 0.8889,
+      "step": 10
+    },
+    {
+      "epoch": 0.058522311631309436,
+      "grad_norm": 9.399275788852298,
+      "learning_rate": 9.70873786407767e-07,
+      "loss": 0.8672,
+      "step": 20
+    },
+    {
+      "epoch": 0.08778346744696415,
+      "grad_norm": 1.6907766019894934,
+      "learning_rate": 1.4563106796116506e-06,
+      "loss": 0.7912,
+      "step": 30
+    },
+    {
+      "epoch": 0.11704462326261887,
+      "grad_norm": 1.2860426174509976,
+      "learning_rate": 1.941747572815534e-06,
+      "loss": 0.7565,
+      "step": 40
+    },
+    {
+      "epoch": 0.14630577907827358,
+      "grad_norm": 1.3375853087497869,
+      "learning_rate": 2.427184466019418e-06,
+      "loss": 0.7353,
+      "step": 50
+    },
+    {
+      "epoch": 0.1755669348939283,
+      "grad_norm": 2.907056106049806,
+      "learning_rate": 2.912621359223301e-06,
+      "loss": 0.7201,
+      "step": 60
+    },
+    {
+      "epoch": 0.20482809070958302,
+      "grad_norm": 1.1476046831775073,
+      "learning_rate": 3.398058252427185e-06,
+      "loss": 0.7089,
+      "step": 70
+    },
+    {
+      "epoch": 0.23408924652523774,
+      "grad_norm": 1.1135226365031068,
+      "learning_rate": 3.883495145631068e-06,
+      "loss": 0.699,
+      "step": 80
+    },
+    {
+      "epoch": 0.26335040234089246,
+      "grad_norm": 1.146206569363389,
+      "learning_rate": 4.368932038834952e-06,
+      "loss": 0.6889,
+      "step": 90
+    },
+    {
+      "epoch": 0.29261155815654716,
+      "grad_norm": 0.9140774724299752,
+      "learning_rate": 4.854368932038836e-06,
+      "loss": 0.6734,
+      "step": 100
+    },
+    {
+      "epoch": 0.3218727139722019,
+      "grad_norm": 0.588701958865582,
+      "learning_rate": 4.999285817005508e-06,
+      "loss": 0.6756,
+      "step": 110
+    },
+    {
+      "epoch": 0.3511338697878566,
+      "grad_norm": 0.4082554657073974,
+      "learning_rate": 4.995788760114507e-06,
+      "loss": 0.6771,
+      "step": 120
+    },
+    {
+      "epoch": 0.38039502560351135,
+      "grad_norm": 0.4058528373173876,
+      "learning_rate": 4.9893817251717034e-06,
+      "loss": 0.6593,
+      "step": 130
+    },
+    {
+      "epoch": 0.40965618141916604,
+      "grad_norm": 0.4247857730723825,
+      "learning_rate": 4.98007218249324e-06,
+      "loss": 0.659,
+      "step": 140
+    },
+    {
+      "epoch": 0.4389173372348208,
+      "grad_norm": 0.36161516478754074,
+      "learning_rate": 4.96787098658944e-06,
+      "loss": 0.6537,
+      "step": 150
+    },
+    {
+      "epoch": 0.4681784930504755,
+      "grad_norm": 0.40515744300029327,
+      "learning_rate": 4.952792363508943e-06,
+      "loss": 0.6551,
+      "step": 160
+    },
+    {
+      "epoch": 0.49743964886613024,
+      "grad_norm": 0.3651631470091895,
+      "learning_rate": 4.934853894251754e-06,
+      "loss": 0.6597,
+      "step": 170
+    },
+    {
+      "epoch": 0.5267008046817849,
+      "grad_norm": 0.33671351155261603,
+      "learning_rate": 4.914076494270601e-06,
+      "loss": 0.659,
+      "step": 180
+    },
+    {
+      "epoch": 0.5559619604974396,
+      "grad_norm": 0.36601574444406737,
+      "learning_rate": 4.890484389084437e-06,
+      "loss": 0.6472,
+      "step": 190
+    },
+    {
+      "epoch": 0.5852231163130943,
+      "grad_norm": 0.35099231151506444,
+      "learning_rate": 4.864105086032581e-06,
+      "loss": 0.6481,
+      "step": 200
+    },
+    {
+      "epoch": 0.6144842721287491,
+      "grad_norm": 0.3432304898955751,
+      "learning_rate": 4.834969342202383e-06,
+      "loss": 0.6545,
+      "step": 210
+    },
+    {
+      "epoch": 0.6437454279444038,
+      "grad_norm": 0.3544675295601586,
+      "learning_rate": 4.803111128567838e-06,
+      "loss": 0.6422,
+      "step": 220
+    },
+    {
+      "epoch": 0.6730065837600585,
+      "grad_norm": 0.3638869764486878,
+      "learning_rate": 4.768567590380951e-06,
+      "loss": 0.642,
+      "step": 230
+    },
+    {
+      "epoch": 0.7022677395757132,
+      "grad_norm": 0.33982788262633873,
+      "learning_rate": 4.731379003862036e-06,
+      "loss": 0.6488,
+      "step": 240
+    },
+    {
+      "epoch": 0.731528895391368,
+      "grad_norm": 0.33830424532637865,
+      "learning_rate": 4.6915887292394395e-06,
+      "loss": 0.647,
+      "step": 250
+    },
+    {
+      "epoch": 0.7607900512070227,
+      "grad_norm": 0.3487388233248073,
+      "learning_rate": 4.64924316019346e-06,
+      "loss": 0.6495,
+      "step": 260
+    },
+    {
+      "epoch": 0.7900512070226774,
+      "grad_norm": 0.38614924762564196,
+      "learning_rate": 4.604391669763386e-06,
+      "loss": 0.6517,
+      "step": 270
+    },
+    {
+      "epoch": 0.8193123628383321,
+      "grad_norm": 0.3473530362547047,
+      "learning_rate": 4.55708655278075e-06,
+      "loss": 0.6504,
+      "step": 280
+    },
+    {
+      "epoch": 0.8485735186539868,
+      "grad_norm": 0.3337564100743828,
+      "learning_rate": 4.507382964895885e-06,
+      "loss": 0.6425,
+      "step": 290
+    },
+    {
+      "epoch": 0.8778346744696416,
+      "grad_norm": 0.34801291927165795,
+      "learning_rate": 4.455338858268903e-06,
+      "loss": 0.6342,
+      "step": 300
+    },
+    {
+      "epoch": 0.9070958302852963,
+      "grad_norm": 0.3493046699878889,
+      "learning_rate": 4.401014914000078e-06,
+      "loss": 0.6473,
+      "step": 310
+    },
+    {
+      "epoch": 0.936356986100951,
+      "grad_norm": 0.33072375208045185,
+      "learning_rate": 4.344474471378389e-06,
+      "loss": 0.6392,
+      "step": 320
+    },
+    {
+      "epoch": 0.9656181419166057,
+      "grad_norm": 0.37834684345351405,
+      "learning_rate": 4.285783454030748e-06,
+      "loss": 0.6424,
+      "step": 330
+    },
+    {
+      "epoch": 0.9948792977322605,
+      "grad_norm": 0.3525861974780877,
+      "learning_rate": 4.225010293057994e-06,
+      "loss": 0.6383,
+      "step": 340
+    },
+    {
+      "epoch": 0.9978054133138259,
+      "eval_loss": 0.6428639888763428,
+      "eval_runtime": 344.3081,
+      "eval_samples_per_second": 26.746,
+      "eval_steps_per_second": 0.418,
+      "step": 341
+    },
+    {
+      "epoch": 1.025237746891002,
+      "grad_norm": 0.3653650833935323,
+      "learning_rate": 4.1622258472472955e-06,
+      "loss": 0.6573,
+      "step": 350
+    },
+    {
+      "epoch": 1.054498902706657,
+      "grad_norm": 0.3784080463365135,
+      "learning_rate": 4.097503320453971e-06,
+      "loss": 0.612,
+      "step": 360
+    },
+    {
+      "epoch": 1.0837600585223117,
+      "grad_norm": 0.3829697605686294,
+      "learning_rate": 4.030918176249072e-06,
+      "loss": 0.6153,
+      "step": 370
+    },
+    {
+      "epoch": 1.1130212143379663,
+      "grad_norm": 0.3137051948696285,
+      "learning_rate": 3.962548049932232e-06,
+      "loss": 0.6125,
+      "step": 380
+    },
+    {
+      "epoch": 1.142282370153621,
+      "grad_norm": 0.3929486566480964,
+      "learning_rate": 3.892472658012379e-06,
+      "loss": 0.6177,
+      "step": 390
+    },
+    {
+      "epoch": 1.1715435259692757,
+      "grad_norm": 0.35216535521228837,
+      "learning_rate": 3.820773705261854e-06,
+      "loss": 0.6193,
+      "step": 400
+    },
+    {
+      "epoch": 1.2008046817849305,
+      "grad_norm": 0.35034638383878064,
+      "learning_rate": 3.747534789452304e-06,
+      "loss": 0.619,
+      "step": 410
+    },
+    {
+      "epoch": 1.2300658376005853,
+      "grad_norm": 0.349728048184571,
+      "learning_rate": 3.6728413038834132e-06,
+      "loss": 0.6068,
+      "step": 420
+    },
+    {
+      "epoch": 1.2593269934162399,
+      "grad_norm": 0.32360580145766005,
+      "learning_rate": 3.5967803378181387e-06,
+      "loss": 0.6171,
+      "step": 430
+    },
+    {
+      "epoch": 1.2885881492318947,
+      "grad_norm": 0.39293541402530385,
+      "learning_rate": 3.519440574940529e-06,
+      "loss": 0.6093,
+      "step": 440
+    },
+    {
+      "epoch": 1.3178493050475493,
+      "grad_norm": 0.3414483236947501,
+      "learning_rate": 3.4409121899545087e-06,
+      "loss": 0.6133,
+      "step": 450
+    },
+    {
+      "epoch": 1.347110460863204,
+      "grad_norm": 0.3278844037107844,
+      "learning_rate": 3.3612867434442135e-06,
+      "loss": 0.6106,
+      "step": 460
+    },
+    {
+      "epoch": 1.3763716166788589,
+      "grad_norm": 0.321882966456194,
+      "learning_rate": 3.2806570751184406e-06,
+      "loss": 0.608,
+      "step": 470
+    },
+    {
+      "epoch": 1.4056327724945135,
+      "grad_norm": 0.3313301079174284,
+      "learning_rate": 3.1991171955637036e-06,
+      "loss": 0.6085,
+      "step": 480
+    },
+    {
+      "epoch": 1.4348939283101683,
+      "grad_norm": 0.3618931122135215,
+      "learning_rate": 3.1167621766320932e-06,
+      "loss": 0.615,
+      "step": 490
+    },
+    {
+      "epoch": 1.464155084125823,
+      "grad_norm": 0.3379032463009361,
+      "learning_rate": 3.0336880405917496e-06,
+      "loss": 0.6033,
+      "step": 500
+    },
+    {
+      "epoch": 1.4934162399414777,
+      "grad_norm": 0.33139858469368133,
+      "learning_rate": 2.949991648169196e-06,
+      "loss": 0.6113,
+      "step": 510
+    },
+    {
+      "epoch": 1.5226773957571325,
+      "grad_norm": 0.3347833206243534,
+      "learning_rate": 2.8657705856140596e-06,
+      "loss": 0.6136,
+      "step": 520
+    },
+    {
+      "epoch": 1.5519385515727873,
+      "grad_norm": 0.355429846245808,
+      "learning_rate": 2.7811230509178745e-06,
+      "loss": 0.6123,
+      "step": 530
+    },
+    {
+      "epoch": 1.5811997073884418,
+      "grad_norm": 0.32961205810710803,
+      "learning_rate": 2.696147739319613e-06,
+      "loss": 0.6188,
+      "step": 540
+    },
+    {
+      "epoch": 1.6104608632040964,
+      "grad_norm": 0.34292268734676556,
+      "learning_rate": 2.6109437282314535e-06,
+      "loss": 0.6096,
+      "step": 550
+    },
+    {
+      "epoch": 1.6397220190197512,
+      "grad_norm": 0.3361710529625224,
+      "learning_rate": 2.5256103617189504e-06,
+      "loss": 0.6113,
+      "step": 560
+    },
+    {
+      "epoch": 1.668983174835406,
+      "grad_norm": 0.3287644537344635,
+      "learning_rate": 2.440247134670294e-06,
+      "loss": 0.6018,
+      "step": 570
+    },
+    {
+      "epoch": 1.6982443306510606,
+      "grad_norm": 0.3350539367772555,
+      "learning_rate": 2.354953576789727e-06,
+      "loss": 0.6115,
+      "step": 580
+    },
+    {
+      "epoch": 1.7275054864667154,
+      "grad_norm": 0.3231456537602999,
+      "learning_rate": 2.269829136550355e-06,
+      "loss": 0.6111,
+      "step": 590
+    },
+    {
+      "epoch": 1.7567666422823702,
+      "grad_norm": 0.3397466635497397,
+      "learning_rate": 2.1849730652416825e-06,
+      "loss": 0.6081,
+      "step": 600
+    },
+    {
+      "epoch": 1.7860277980980248,
+      "grad_norm": 0.35598143646367475,
+      "learning_rate": 2.1004843012470437e-06,
+      "loss": 0.6106,
+      "step": 610
+    },
+    {
+      "epoch": 1.8152889539136796,
+      "grad_norm": 0.31311913029998645,
+      "learning_rate": 2.016461354685876e-06,
+      "loss": 0.6109,
+      "step": 620
+    },
+    {
+      "epoch": 1.8445501097293344,
+      "grad_norm": 0.3177142604141687,
+      "learning_rate": 1.9330021925553253e-06,
+      "loss": 0.6101,
+      "step": 630
+    },
+    {
+      "epoch": 1.873811265544989,
+      "grad_norm": 0.3195475865970179,
+      "learning_rate": 1.8502041245051114e-06,
+      "loss": 0.6047,
+      "step": 640
+    },
+    {
+      "epoch": 1.9030724213606436,
+      "grad_norm": 0.34485831325499466,
+      "learning_rate": 1.7681636893788302e-06,
+      "loss": 0.6106,
+      "step": 650
+    },
+    {
+      "epoch": 1.9323335771762986,
+      "grad_norm": 0.3101316144954113,
+      "learning_rate": 1.6869765426539759e-06,
+      "loss": 0.611,
+      "step": 660
+    },
+    {
+      "epoch": 1.9615947329919532,
+      "grad_norm": 0.3256937223780742,
+      "learning_rate": 1.6067373449119387e-06,
+      "loss": 0.5998,
+      "step": 670
+    },
+    {
+      "epoch": 1.9908558888076078,
+      "grad_norm": 0.3162290387753267,
+      "learning_rate": 1.5275396514679986e-06,
+      "loss": 0.6143,
+      "step": 680
+    },
+    {
+      "epoch": 1.9967081199707388,
+      "eval_loss": 0.6351743936538696,
+      "eval_runtime": 344.6425,
+      "eval_samples_per_second": 26.72,
+      "eval_steps_per_second": 0.418,
+      "step": 682
+    },
+    {
+      "epoch": 2.0212143379663496,
+      "grad_norm": 0.32018427402743455,
+      "learning_rate": 1.4494758032900119e-06,
+      "loss": 0.6336,
+      "step": 690
+    },
+    {
+      "epoch": 2.050475493782004,
+      "grad_norm": 0.31577974435085937,
+      "learning_rate": 1.372636819332976e-06,
+      "loss": 0.5845,
+      "step": 700
+    },
+    {
+      "epoch": 2.0797366495976592,
+      "grad_norm": 0.3029171910952351,
+      "learning_rate": 1.2971122904149944e-06,
+      "loss": 0.5867,
+      "step": 710
+    },
+    {
+      "epoch": 2.108997805413314,
+      "grad_norm": 0.33511772012389357,
+      "learning_rate": 1.2229902747583972e-06,
+      "loss": 0.5876,
+      "step": 720
+    },
+    {
+      "epoch": 2.1382589612289684,
+      "grad_norm": 0.3156015487656,
+      "learning_rate": 1.1503571953177884e-06,
+      "loss": 0.5893,
+      "step": 730
+    },
+    {
+      "epoch": 2.1675201170446234,
+      "grad_norm": 0.3371950741889132,
+      "learning_rate": 1.0792977390147474e-06,
+      "loss": 0.5941,
+      "step": 740
+    },
+    {
+      "epoch": 2.196781272860278,
+      "grad_norm": 0.32897346750056816,
+      "learning_rate": 1.009894757996668e-06,
+      "loss": 0.5928,
+      "step": 750
+    },
+    {
+      "epoch": 2.2260424286759326,
+      "grad_norm": 0.3394392762287294,
+      "learning_rate": 9.422291730348565e-07,
+      "loss": 0.5899,
+      "step": 760
+    },
+    {
+      "epoch": 2.255303584491587,
+      "grad_norm": 0.3115602039131131,
+      "learning_rate": 8.763798791745413e-07,
+      "loss": 0.5882,
+      "step": 770
+    },
+    {
+      "epoch": 2.284564740307242,
+      "grad_norm": 0.3137752524034506,
+      "learning_rate": 8.12423653746767e-07,
+      "loss": 0.5875,
+      "step": 780
+    },
+    {
+      "epoch": 2.313825896122897,
+      "grad_norm": 0.3017797829362983,
+      "learning_rate": 7.504350668494725e-07,
+      "loss": 0.5855,
+      "step": 790
+    },
+    {
+      "epoch": 2.3430870519385514,
+      "grad_norm": 0.3114255491758221,
+      "learning_rate": 6.904863944020834e-07,
+      "loss": 0.5932,
+      "step": 800
+    },
+    {
+      "epoch": 2.3723482077542064,
+      "grad_norm": 0.3074179184847908,
+      "learning_rate": 6.326475338750288e-07,
+      "loss": 0.5882,
+      "step": 810
+    },
+    {
+      "epoch": 2.401609363569861,
+      "grad_norm": 0.31539346295499116,
+      "learning_rate": 5.769859227924154e-07,
+      "loss": 0.5942,
+      "step": 820
+    },
+    {
+      "epoch": 2.4308705193855156,
+      "grad_norm": 0.29321748526986546,
+      "learning_rate": 5.235664601028911e-07,
+      "loss": 0.5866,
+      "step": 830
+    },
+    {
+      "epoch": 2.4601316752011706,
+      "grad_norm": 0.3034123122737905,
+      "learning_rate": 4.7245143051037475e-07,
+      "loss": 0.5868,
+      "step": 840
+    },
+    {
+      "epoch": 2.489392831016825,
+      "grad_norm": 0.31024627211609695,
+      "learning_rate": 4.2370043185287397e-07,
+      "loss": 0.5886,
+      "step": 850
+    },
+    {
+      "epoch": 2.5186539868324798,
+      "grad_norm": 0.3154876519991477,
+      "learning_rate": 3.773703056140779e-07,
+      "loss": 0.5818,
+      "step": 860
+    },
+    {
+      "epoch": 2.547915142648135,
+      "grad_norm": 0.3074925297268889,
+      "learning_rate": 3.335150706487256e-07,
+      "loss": 0.5842,
+      "step": 870
+    },
+    {
+      "epoch": 2.5771762984637894,
+      "grad_norm": 0.2945650517625053,
+      "learning_rate": 2.921858601990396e-07,
+      "loss": 0.5882,
+      "step": 880
+    },
+    {
+      "epoch": 2.606437454279444,
+      "grad_norm": 0.2997163431719521,
+      "learning_rate": 2.5343086227565037e-07,
+      "loss": 0.5877,
+      "step": 890
+    },
+    {
+      "epoch": 2.6356986100950985,
+      "grad_norm": 0.306350940960521,
+      "learning_rate": 2.172952634725345e-07,
+      "loss": 0.5922,
+      "step": 900
+    },
+    {
+      "epoch": 2.6649597659107536,
+      "grad_norm": 0.30476358150124866,
+      "learning_rate": 1.838211962814679e-07,
+      "loss": 0.5925,
+      "step": 910
+    },
+    {
+      "epoch": 2.694220921726408,
+      "grad_norm": 0.31398774450844835,
+      "learning_rate": 1.530476899674202e-07,
+      "loss": 0.5865,
+      "step": 920
+    },
+    {
+      "epoch": 2.723482077542063,
+      "grad_norm": 0.3168655235083733,
+      "learning_rate": 1.2501062506218354e-07,
+      "loss": 0.5898,
+      "step": 930
+    },
+    {
+      "epoch": 2.7527432333577178,
+      "grad_norm": 0.3105039434391971,
+      "learning_rate": 9.974269152927235e-08,
+      "loss": 0.5888,
+      "step": 940
+    },
+    {
+      "epoch": 2.7820043891733723,
+      "grad_norm": 0.2957515945421737,
+      "learning_rate": 7.727335064889264e-08,
+      "loss": 0.5874,
+      "step": 950
+    },
+    {
+      "epoch": 2.811265544989027,
+      "grad_norm": 0.30706809152212006,
+      "learning_rate": 5.762880066740473e-08,
+      "loss": 0.5914,
+      "step": 960
+    },
+    {
+      "epoch": 2.840526700804682,
+      "grad_norm": 0.29567229572647563,
+      "learning_rate": 4.083194625134013e-08,
+      "loss": 0.587,
+      "step": 970
+    },
+    {
+      "epoch": 2.8697878566203365,
+      "grad_norm": 0.3006938272467725,
+      "learning_rate": 2.690237178158528e-08,
+      "loss": 0.5871,
+      "step": 980
+    },
+    {
+      "epoch": 2.899049012435991,
+      "grad_norm": 0.29892548674551433,
+      "learning_rate": 1.5856318518868986e-08,
+      "loss": 0.5796,
+      "step": 990
+    },
+    {
+      "epoch": 2.928310168251646,
+      "grad_norm": 0.30298562726883044,
+      "learning_rate": 7.70666566718009e-09,
+      "loss": 0.5875,
+      "step": 1000
+    },
+    {
+      "epoch": 2.9575713240673007,
+      "grad_norm": 0.29958579696478904,
+      "learning_rate": 2.462915357190343e-09,
+      "loss": 0.5789,
+      "step": 1010
+    },
+    {
+      "epoch": 2.9868324798829553,
+      "grad_norm": 0.3013888809338584,
+      "learning_rate": 1.311815671958816e-10,
+      "loss": 0.5908,
+      "step": 1020
+    },
+    {
+      "epoch": 2.9956108266276518,
+      "eval_loss": 0.6357362270355225,
+      "eval_runtime": 343.5975,
+      "eval_samples_per_second": 26.802,
+      "eval_steps_per_second": 0.419,
+      "step": 1023
+    },
+    {
+      "epoch": 2.9956108266276518,
+      "step": 1023,
+      "total_flos": 2144987064041472.0,
+      "train_loss": 0.6275725323899867,
+      "train_runtime": 54929.7845,
+      "train_samples_per_second": 9.555,
+      "train_steps_per_second": 0.019
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1023,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2144987064041472.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed