{ "best_metric": 18.4285, "best_model_checkpoint": "./runtime/tFINE-base-300m-samsum/checkpoint-345", "epoch": 3.995656894679696, "eval_steps": 500, "global_step": 460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04343105320304017, "grad_norm": 6.462469577789307, "learning_rate": 2.173913043478261e-05, "loss": 5.2862, "step": 5 }, { "epoch": 0.08686210640608034, "grad_norm": 4.075676918029785, "learning_rate": 4.347826086956522e-05, "loss": 4.0973, "step": 10 }, { "epoch": 0.13029315960912052, "grad_norm": 2.891948938369751, "learning_rate": 6.521739130434783e-05, "loss": 3.2568, "step": 15 }, { "epoch": 0.1737242128121607, "grad_norm": 1.456381916999817, "learning_rate": 8.695652173913044e-05, "loss": 2.6852, "step": 20 }, { "epoch": 0.21715526601520088, "grad_norm": 1.6086758375167847, "learning_rate": 9.999483191807244e-05, "loss": 2.4395, "step": 25 }, { "epoch": 0.26058631921824105, "grad_norm": 1.1808384656906128, "learning_rate": 9.993670326516924e-05, "loss": 2.3317, "step": 30 }, { "epoch": 0.30401737242128124, "grad_norm": 1.2403559684753418, "learning_rate": 9.981406120397172e-05, "loss": 2.2786, "step": 35 }, { "epoch": 0.3474484256243214, "grad_norm": 1.1080374717712402, "learning_rate": 9.962706417620413e-05, "loss": 2.2042, "step": 40 }, { "epoch": 0.39087947882736157, "grad_norm": 1.1210006475448608, "learning_rate": 9.93759537640057e-05, "loss": 2.1659, "step": 45 }, { "epoch": 0.43431053203040176, "grad_norm": 1.0776363611221313, "learning_rate": 9.90610543778299e-05, "loss": 2.1975, "step": 50 }, { "epoch": 0.4777415852334419, "grad_norm": 1.2220784425735474, "learning_rate": 9.868277283733726e-05, "loss": 2.1266, "step": 55 }, { "epoch": 0.5211726384364821, "grad_norm": 1.5034546852111816, "learning_rate": 9.824159784582368e-05, "loss": 2.1108, "step": 60 }, { "epoch": 0.5646036916395223, "grad_norm": 1.4478706121444702, "learning_rate": 9.773809935886287e-05, "loss": 2.0641, "step": 65 }, { "epoch": 0.6080347448425625, "grad_norm": 1.1443687677383423, "learning_rate": 9.717292784797854e-05, "loss": 2.0728, "step": 70 }, { "epoch": 0.6514657980456026, "grad_norm": 1.1472234725952148, "learning_rate": 9.654681346029808e-05, "loss": 2.0482, "step": 75 }, { "epoch": 0.6948968512486428, "grad_norm": 1.109851360321045, "learning_rate": 9.586056507527266e-05, "loss": 2.0456, "step": 80 }, { "epoch": 0.738327904451683, "grad_norm": 1.109726905822754, "learning_rate": 9.5115069259683e-05, "loss": 2.0477, "step": 85 }, { "epoch": 0.7817589576547231, "grad_norm": 1.1185649633407593, "learning_rate": 9.43112891222806e-05, "loss": 2.0252, "step": 90 }, { "epoch": 0.8251900108577633, "grad_norm": 1.1537866592407227, "learning_rate": 9.345026306954386e-05, "loss": 2.0184, "step": 95 }, { "epoch": 0.8686210640608035, "grad_norm": 1.1786285638809204, "learning_rate": 9.253310346415714e-05, "loss": 1.9877, "step": 100 }, { "epoch": 0.9120521172638436, "grad_norm": 1.202744722366333, "learning_rate": 9.156099518794534e-05, "loss": 1.9814, "step": 105 }, { "epoch": 0.9554831704668838, "grad_norm": 1.3231650590896606, "learning_rate": 9.053519411112075e-05, "loss": 1.9585, "step": 110 }, { "epoch": 0.998914223669924, "grad_norm": 1.330356240272522, "learning_rate": 8.945702546981969e-05, "loss": 1.9528, "step": 115 }, { "epoch": 0.998914223669924, "eval_gen_len": 29.333333333333332, "eval_loss": 1.9189409017562866, "eval_rouge1": 40.093, "eval_rouge2": 18.2018, "eval_rougeL": 33.9749, "eval_rougeLsum": 36.9071, 
"eval_runtime": 64.3388, "eval_samples_per_second": 4.663, "eval_steps_per_second": 0.295, "step": 115 }, { "epoch": 1.0423452768729642, "grad_norm": 1.36496901512146, "learning_rate": 8.832788215402527e-05, "loss": 1.6338, "step": 120 }, { "epoch": 1.0857763300760044, "grad_norm": 1.3041751384735107, "learning_rate": 8.714922290808766e-05, "loss": 1.6039, "step": 125 }, { "epoch": 1.1292073832790446, "grad_norm": 1.2985814809799194, "learning_rate": 8.592257044616702e-05, "loss": 1.6221, "step": 130 }, { "epoch": 1.1726384364820848, "grad_norm": 1.2213962078094482, "learning_rate": 8.464950948503349e-05, "loss": 1.5671, "step": 135 }, { "epoch": 1.216069489685125, "grad_norm": 1.110490322113037, "learning_rate": 8.333168469676595e-05, "loss": 1.6163, "step": 140 }, { "epoch": 1.2595005428881652, "grad_norm": 1.22842538356781, "learning_rate": 8.197079858399403e-05, "loss": 1.6156, "step": 145 }, { "epoch": 1.3029315960912053, "grad_norm": 1.2326569557189941, "learning_rate": 8.05686092804289e-05, "loss": 1.6263, "step": 150 }, { "epoch": 1.3463626492942453, "grad_norm": 1.2563903331756592, "learning_rate": 7.912692827952394e-05, "loss": 1.5923, "step": 155 }, { "epoch": 1.3897937024972855, "grad_norm": 1.1862818002700806, "learning_rate": 7.76476180941997e-05, "loss": 1.6199, "step": 160 }, { "epoch": 1.4332247557003257, "grad_norm": 1.186631202697754, "learning_rate": 7.613258985065672e-05, "loss": 1.6409, "step": 165 }, { "epoch": 1.476655808903366, "grad_norm": 1.1485611200332642, "learning_rate": 7.45838008193847e-05, "loss": 1.6194, "step": 170 }, { "epoch": 1.520086862106406, "grad_norm": 1.158892273902893, "learning_rate": 7.300325188655761e-05, "loss": 1.561, "step": 175 }, { "epoch": 1.5635179153094463, "grad_norm": 1.1595680713653564, "learning_rate": 7.139298496908154e-05, "loss": 1.5814, "step": 180 }, { "epoch": 1.6069489685124865, "grad_norm": 1.2705223560333252, "learning_rate": 6.97550803766349e-05, "loss": 1.5873, "step": 185 }, { "epoch": 1.6503800217155264, "grad_norm": 1.5138100385665894, "learning_rate": 6.809165412410876e-05, "loss": 1.6218, "step": 190 }, { "epoch": 1.6938110749185666, "grad_norm": 1.7399356365203857, "learning_rate": 6.640485519791953e-05, "loss": 1.5717, "step": 195 }, { "epoch": 1.7372421281216068, "grad_norm": 1.1342989206314087, "learning_rate": 6.469686277972556e-05, "loss": 1.5773, "step": 200 }, { "epoch": 1.780673181324647, "grad_norm": 1.1253174543380737, "learning_rate": 6.296988343113452e-05, "loss": 1.584, "step": 205 }, { "epoch": 1.8241042345276872, "grad_norm": 1.1998904943466187, "learning_rate": 6.122614824303845e-05, "loss": 1.6189, "step": 210 }, { "epoch": 1.8675352877307274, "grad_norm": 1.1804780960083008, "learning_rate": 5.946790995325924e-05, "loss": 1.5844, "step": 215 }, { "epoch": 1.9109663409337676, "grad_norm": 1.3260307312011719, "learning_rate": 5.769744003622851e-05, "loss": 1.5731, "step": 220 }, { "epoch": 1.9543973941368078, "grad_norm": 1.1990879774093628, "learning_rate": 5.59170257684616e-05, "loss": 1.6082, "step": 225 }, { "epoch": 1.997828447339848, "grad_norm": 1.164106011390686, "learning_rate": 5.4128967273616625e-05, "loss": 1.5346, "step": 230 }, { "epoch": 1.997828447339848, "eval_gen_len": 27.663333333333334, "eval_loss": 1.8827488422393799, "eval_rouge1": 41.4676, "eval_rouge2": 18.3467, "eval_rougeL": 34.1909, "eval_rougeLsum": 38.2131, "eval_runtime": 41.5739, "eval_samples_per_second": 7.216, "eval_steps_per_second": 0.457, "step": 230 }, { "epoch": 2.041259500542888, "grad_norm": 
1.166826844215393, "learning_rate": 5.2335574550956446e-05, "loss": 1.268, "step": 235 }, { "epoch": 2.0846905537459284, "grad_norm": 1.2919505834579468, "learning_rate": 5.053916449105219e-05, "loss": 1.2186, "step": 240 }, { "epoch": 2.1281216069489686, "grad_norm": 1.4431166648864746, "learning_rate": 4.874205788258397e-05, "loss": 1.1827, "step": 245 }, { "epoch": 2.1715526601520088, "grad_norm": 1.4269115924835205, "learning_rate": 4.694657641410549e-05, "loss": 1.1784, "step": 250 }, { "epoch": 2.214983713355049, "grad_norm": 1.4717003107070923, "learning_rate": 4.515503967464618e-05, "loss": 1.1932, "step": 255 }, { "epoch": 2.258414766558089, "grad_norm": 1.2849444150924683, "learning_rate": 4.336976215702574e-05, "loss": 1.1578, "step": 260 }, { "epoch": 2.3018458197611293, "grad_norm": 1.2847343683242798, "learning_rate": 4.1593050267752485e-05, "loss": 1.1344, "step": 265 }, { "epoch": 2.3452768729641695, "grad_norm": 1.281315565109253, "learning_rate": 3.982719934736832e-05, "loss": 1.1719, "step": 270 }, { "epoch": 2.3887079261672097, "grad_norm": 1.2924513816833496, "learning_rate": 3.807449070508998e-05, "loss": 1.187, "step": 275 }, { "epoch": 2.43213897937025, "grad_norm": 1.2814276218414307, "learning_rate": 3.633718867157746e-05, "loss": 1.2178, "step": 280 }, { "epoch": 2.47557003257329, "grad_norm": 1.450994849205017, "learning_rate": 3.4617537673636866e-05, "loss": 1.1659, "step": 285 }, { "epoch": 2.5190010857763303, "grad_norm": 1.2326401472091675, "learning_rate": 3.2917759334637374e-05, "loss": 1.1816, "step": 290 }, { "epoch": 2.5624321389793705, "grad_norm": 1.2742615938186646, "learning_rate": 3.124004960438796e-05, "loss": 1.1543, "step": 295 }, { "epoch": 2.6058631921824107, "grad_norm": 1.4592117071151733, "learning_rate": 2.9586575922181724e-05, "loss": 1.214, "step": 300 }, { "epoch": 2.6492942453854504, "grad_norm": 1.3233025074005127, "learning_rate": 2.7959474416673336e-05, "loss": 1.1767, "step": 305 }, { "epoch": 2.6927252985884906, "grad_norm": 1.187286376953125, "learning_rate": 2.6360847146206623e-05, "loss": 1.1769, "step": 310 }, { "epoch": 2.736156351791531, "grad_norm": 1.3445571660995483, "learning_rate": 2.4792759383157748e-05, "loss": 1.2048, "step": 315 }, { "epoch": 2.779587404994571, "grad_norm": 1.399775505065918, "learning_rate": 2.325723694580229e-05, "loss": 1.1756, "step": 320 }, { "epoch": 2.823018458197611, "grad_norm": 1.3379682302474976, "learning_rate": 2.1756263581153424e-05, "loss": 1.1694, "step": 325 }, { "epoch": 2.8664495114006514, "grad_norm": 1.4017045497894287, "learning_rate": 2.0291778402151685e-05, "loss": 1.1876, "step": 330 }, { "epoch": 2.9098805646036916, "grad_norm": 1.3478162288665771, "learning_rate": 1.8865673382518145e-05, "loss": 1.1993, "step": 335 }, { "epoch": 2.953311617806732, "grad_norm": 1.2747628688812256, "learning_rate": 1.7479790912506626e-05, "loss": 1.1913, "step": 340 }, { "epoch": 2.996742671009772, "grad_norm": 1.3648200035095215, "learning_rate": 1.6135921418712956e-05, "loss": 1.1696, "step": 345 }, { "epoch": 2.996742671009772, "eval_gen_len": 27.803333333333335, "eval_loss": 1.9820051193237305, "eval_rouge1": 42.3629, "eval_rouge2": 18.4285, "eval_rougeL": 34.6339, "eval_rougeLsum": 38.7792, "eval_runtime": 38.5794, "eval_samples_per_second": 7.776, "eval_steps_per_second": 0.492, "step": 345 }, { "epoch": 3.040173724212812, "grad_norm": 1.135827660560608, "learning_rate": 1.4835801051016463e-05, "loss": 0.9797, "step": 350 }, { "epoch": 3.0836047774158524, "grad_norm": 
1.2965835332870483, "learning_rate": 1.3581109439641588e-05, "loss": 0.9405, "step": 355 }, { "epoch": 3.1270358306188926, "grad_norm": 1.2207958698272705, "learning_rate": 1.237346752523752e-05, "loss": 0.9258, "step": 360 }, { "epoch": 3.1704668838219328, "grad_norm": 1.3027771711349487, "learning_rate": 1.1214435464779006e-05, "loss": 0.8973, "step": 365 }, { "epoch": 3.213897937024973, "grad_norm": 1.2429888248443604, "learning_rate": 1.0105510615994051e-05, "loss": 0.8882, "step": 370 }, { "epoch": 3.257328990228013, "grad_norm": 1.2552655935287476, "learning_rate": 9.048125602921842e-06, "loss": 0.9351, "step": 375 }, { "epoch": 3.3007600434310533, "grad_norm": 1.290592074394226, "learning_rate": 8.043646465100697e-06, "loss": 0.9189, "step": 380 }, { "epoch": 3.3441910966340935, "grad_norm": 1.3471736907958984, "learning_rate": 7.093370892776558e-06, "loss": 0.8981, "step": 385 }, { "epoch": 3.3876221498371337, "grad_norm": 1.4423019886016846, "learning_rate": 6.1985265504122314e-06, "loss": 0.914, "step": 390 }, { "epoch": 3.431053203040174, "grad_norm": 1.3276619911193848, "learning_rate": 5.360269490663278e-06, "loss": 0.8998, "step": 395 }, { "epoch": 3.4744842562432137, "grad_norm": 1.3065807819366455, "learning_rate": 4.5796826608693274e-06, "loss": 0.9208, "step": 400 }, { "epoch": 3.517915309446254, "grad_norm": 1.4401154518127441, "learning_rate": 3.857774503990514e-06, "loss": 0.9586, "step": 405 }, { "epoch": 3.561346362649294, "grad_norm": 1.4484052658081055, "learning_rate": 3.1954776557963085e-06, "loss": 0.9459, "step": 410 }, { "epoch": 3.6047774158523342, "grad_norm": 1.2900787591934204, "learning_rate": 2.593647739990068e-06, "loss": 0.9139, "step": 415 }, { "epoch": 3.6482084690553744, "grad_norm": 1.4173898696899414, "learning_rate": 2.0530622628255615e-06, "loss": 0.9515, "step": 420 }, { "epoch": 3.6916395222584146, "grad_norm": 1.3273446559906006, "learning_rate": 1.574419608643879e-06, "loss": 0.9536, "step": 425 }, { "epoch": 3.735070575461455, "grad_norm": 1.28830087184906, "learning_rate": 1.1583381376281731e-06, "loss": 0.9209, "step": 430 }, { "epoch": 3.778501628664495, "grad_norm": 1.2910932302474976, "learning_rate": 8.053553869418418e-07, "loss": 0.9536, "step": 435 }, { "epoch": 3.821932681867535, "grad_norm": 1.351585030555725, "learning_rate": 5.159273762823657e-07, "loss": 0.908, "step": 440 }, { "epoch": 3.8653637350705754, "grad_norm": 1.3552790880203247, "learning_rate": 2.9042801874777927e-07, "loss": 0.8984, "step": 445 }, { "epoch": 3.9087947882736156, "grad_norm": 1.3222675323486328, "learning_rate": 1.2914863777698792e-07, "loss": 0.9384, "step": 450 }, { "epoch": 3.952225841476656, "grad_norm": 1.4266947507858276, "learning_rate": 3.229759078795524e-08, "loss": 0.9457, "step": 455 }, { "epoch": 3.995656894679696, "grad_norm": 1.3441340923309326, "learning_rate": 0.0, "loss": 0.9359, "step": 460 }, { "epoch": 3.995656894679696, "eval_gen_len": 30.18, "eval_loss": 2.1588149070739746, "eval_rouge1": 41.2237, "eval_rouge2": 17.8161, "eval_rougeL": 33.7101, "eval_rougeLsum": 37.9569, "eval_runtime": 43.7657, "eval_samples_per_second": 6.855, "eval_steps_per_second": 0.434, "step": 460 }, { "epoch": 3.995656894679696, "step": 460, "total_flos": 3.414853029293568e+16, "train_loss": 1.5300355652104254, "train_runtime": 1725.057, "train_samples_per_second": 34.158, "train_steps_per_second": 0.267 } ], "logging_steps": 5, "max_steps": 460, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { 
"TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.414853029293568e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }