{ "best_metric": 0.10928191244602203, "best_model_checkpoint": "/media/user/Expansion/flan-t5-small-untokenizer2/checkpoint-22500", "epoch": 1.0, "eval_steps": 2500, "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.623769998550415, "learning_rate": 4.9e-05, "loss": 0.2187, "num_input_tokens_seen": 1550704, "step": 500 }, { "epoch": 0.04, "grad_norm": 0.6147541403770447, "learning_rate": 4.8e-05, "loss": 0.1616, "num_input_tokens_seen": 3113636, "step": 1000 }, { "epoch": 0.06, "grad_norm": 0.8041962385177612, "learning_rate": 4.7e-05, "loss": 0.1465, "num_input_tokens_seen": 4634732, "step": 1500 }, { "epoch": 0.08, "grad_norm": 0.6499223709106445, "learning_rate": 4.600000000000001e-05, "loss": 0.1523, "num_input_tokens_seen": 6206076, "step": 2000 }, { "epoch": 0.1, "grad_norm": 0.6867215633392334, "learning_rate": 4.5e-05, "loss": 0.1477, "num_input_tokens_seen": 7758904, "step": 2500 }, { "epoch": 0.1, "eval_loss": 0.11968539655208588, "eval_runtime": 45.394, "eval_samples_per_second": 55.073, "eval_steps_per_second": 6.895, "num_input_tokens_seen": 7758904, "step": 2500 }, { "epoch": 0.12, "grad_norm": 0.5227354764938354, "learning_rate": 4.4000000000000006e-05, "loss": 0.1422, "num_input_tokens_seen": 9286000, "step": 3000 }, { "epoch": 0.14, "grad_norm": 0.6782410144805908, "learning_rate": 4.3e-05, "loss": 0.1417, "num_input_tokens_seen": 10857236, "step": 3500 }, { "epoch": 0.16, "grad_norm": 2.188730001449585, "learning_rate": 4.2e-05, "loss": 0.1296, "num_input_tokens_seen": 12388284, "step": 4000 }, { "epoch": 0.18, "grad_norm": 0.5791600346565247, "learning_rate": 4.1e-05, "loss": 0.1345, "num_input_tokens_seen": 13970100, "step": 4500 }, { "epoch": 0.2, "grad_norm": 1.1668968200683594, "learning_rate": 4e-05, "loss": 0.1331, "num_input_tokens_seen": 15535000, "step": 5000 }, { "epoch": 0.2, "eval_loss": 0.11603260785341263, "eval_runtime": 45.3659, "eval_samples_per_second": 55.107, "eval_steps_per_second": 6.899, "num_input_tokens_seen": 15535000, "step": 5000 }, { "epoch": 0.22, "grad_norm": 0.5479309558868408, "learning_rate": 3.9000000000000006e-05, "loss": 0.1319, "num_input_tokens_seen": 17098792, "step": 5500 }, { "epoch": 0.24, "grad_norm": 0.6089518666267395, "learning_rate": 3.8e-05, "loss": 0.1325, "num_input_tokens_seen": 18665500, "step": 6000 }, { "epoch": 0.26, "grad_norm": 0.9072483777999878, "learning_rate": 3.7e-05, "loss": 0.1327, "num_input_tokens_seen": 20242492, "step": 6500 }, { "epoch": 0.28, "grad_norm": 0.6144304275512695, "learning_rate": 3.6e-05, "loss": 0.1317, "num_input_tokens_seen": 21840216, "step": 7000 }, { "epoch": 0.3, "grad_norm": 0.6446326375007629, "learning_rate": 3.5e-05, "loss": 0.1295, "num_input_tokens_seen": 23379996, "step": 7500 }, { "epoch": 0.3, "eval_loss": 0.11423994600772858, "eval_runtime": 45.4091, "eval_samples_per_second": 55.055, "eval_steps_per_second": 6.893, "num_input_tokens_seen": 23379996, "step": 7500 }, { "epoch": 0.32, "grad_norm": 0.35889774560928345, "learning_rate": 3.4000000000000007e-05, "loss": 0.1279, "num_input_tokens_seen": 24947356, "step": 8000 }, { "epoch": 0.34, "grad_norm": 0.6276640892028809, "learning_rate": 3.3e-05, "loss": 0.1296, "num_input_tokens_seen": 26524048, "step": 8500 }, { "epoch": 0.36, "grad_norm": 0.16105327010154724, "learning_rate": 3.2000000000000005e-05, "loss": 0.1257, "num_input_tokens_seen": 28056324, "step": 9000 }, { "epoch": 0.38, "grad_norm": 1.7733196020126343, "learning_rate": 3.1e-05, "loss": 0.1198, "num_input_tokens_seen": 29608912, "step": 9500 }, { "epoch": 0.4, "grad_norm": 0.6086151599884033, "learning_rate": 3e-05, "loss": 0.1257, "num_input_tokens_seen": 31164076, "step": 10000 }, { "epoch": 0.4, "eval_loss": 0.1128210574388504, "eval_runtime": 45.589, "eval_samples_per_second": 54.838, "eval_steps_per_second": 6.866, "num_input_tokens_seen": 31164076, "step": 10000 }, { "epoch": 0.42, "grad_norm": 0.40136536955833435, "learning_rate": 2.9e-05, "loss": 0.131, "num_input_tokens_seen": 32749448, "step": 10500 }, { "epoch": 0.44, "grad_norm": 0.9361308217048645, "learning_rate": 2.8000000000000003e-05, "loss": 0.114, "num_input_tokens_seen": 34266344, "step": 11000 }, { "epoch": 0.46, "grad_norm": 1.6654324531555176, "learning_rate": 2.7000000000000002e-05, "loss": 0.1401, "num_input_tokens_seen": 35845312, "step": 11500 }, { "epoch": 0.48, "grad_norm": 0.22417037189006805, "learning_rate": 2.6000000000000002e-05, "loss": 0.1183, "num_input_tokens_seen": 37392204, "step": 12000 }, { "epoch": 0.5, "grad_norm": 0.354526549577713, "learning_rate": 2.5e-05, "loss": 0.1148, "num_input_tokens_seen": 38943032, "step": 12500 }, { "epoch": 0.5, "eval_loss": 0.11148922145366669, "eval_runtime": 45.5906, "eval_samples_per_second": 54.836, "eval_steps_per_second": 6.865, "num_input_tokens_seen": 38943032, "step": 12500 }, { "epoch": 0.52, "grad_norm": 0.5592128038406372, "learning_rate": 2.4e-05, "loss": 0.1237, "num_input_tokens_seen": 40522284, "step": 13000 }, { "epoch": 0.54, "grad_norm": 0.5023311376571655, "learning_rate": 2.3000000000000003e-05, "loss": 0.1188, "num_input_tokens_seen": 42078660, "step": 13500 }, { "epoch": 0.56, "grad_norm": 0.528414249420166, "learning_rate": 2.2000000000000003e-05, "loss": 0.1306, "num_input_tokens_seen": 43633484, "step": 14000 }, { "epoch": 0.58, "grad_norm": 0.7868823409080505, "learning_rate": 2.1e-05, "loss": 0.1167, "num_input_tokens_seen": 45163904, "step": 14500 }, { "epoch": 0.6, "grad_norm": 0.30285605788230896, "learning_rate": 2e-05, "loss": 0.1219, "num_input_tokens_seen": 46747616, "step": 15000 }, { "epoch": 0.6, "eval_loss": 0.11066804826259613, "eval_runtime": 45.6215, "eval_samples_per_second": 54.799, "eval_steps_per_second": 6.861, "num_input_tokens_seen": 46747616, "step": 15000 }, { "epoch": 0.62, "grad_norm": 0.18638424575328827, "learning_rate": 1.9e-05, "loss": 0.1295, "num_input_tokens_seen": 48319848, "step": 15500 }, { "epoch": 0.64, "grad_norm": 0.22312051057815552, "learning_rate": 1.8e-05, "loss": 0.116, "num_input_tokens_seen": 49864448, "step": 16000 }, { "epoch": 0.66, "grad_norm": 0.3801974356174469, "learning_rate": 1.7000000000000003e-05, "loss": 0.1321, "num_input_tokens_seen": 51461076, "step": 16500 }, { "epoch": 0.68, "grad_norm": 2.719569206237793, "learning_rate": 1.6000000000000003e-05, "loss": 0.1172, "num_input_tokens_seen": 53024800, "step": 17000 }, { "epoch": 0.7, "grad_norm": 0.3563300669193268, "learning_rate": 1.5e-05, "loss": 0.1112, "num_input_tokens_seen": 54513880, "step": 17500 }, { "epoch": 0.7, "eval_loss": 0.11027801036834717, "eval_runtime": 45.5331, "eval_samples_per_second": 54.905, "eval_steps_per_second": 6.874, "num_input_tokens_seen": 54513880, "step": 17500 }, { "epoch": 0.72, "grad_norm": 0.36904799938201904, "learning_rate": 1.4000000000000001e-05, "loss": 0.1228, "num_input_tokens_seen": 56077248, "step": 18000 }, { "epoch": 0.74, "grad_norm": 0.20697401463985443, "learning_rate": 1.3000000000000001e-05, "loss": 0.1228, "num_input_tokens_seen": 57639976, "step": 18500 }, { "epoch": 0.76, "grad_norm": 0.25590789318084717, "learning_rate": 1.2e-05, "loss": 0.1193, "num_input_tokens_seen": 59184096, "step": 19000 }, { "epoch": 0.78, "grad_norm": 0.8957023620605469, "learning_rate": 1.1000000000000001e-05, "loss": 0.1233, "num_input_tokens_seen": 60758920, "step": 19500 }, { "epoch": 0.8, "grad_norm": 0.3526408076286316, "learning_rate": 1e-05, "loss": 0.1161, "num_input_tokens_seen": 62302092, "step": 20000 }, { "epoch": 0.8, "eval_loss": 0.10997864603996277, "eval_runtime": 45.5162, "eval_samples_per_second": 54.925, "eval_steps_per_second": 6.877, "num_input_tokens_seen": 62302092, "step": 20000 }, { "epoch": 0.82, "grad_norm": 1.0477633476257324, "learning_rate": 9e-06, "loss": 0.1143, "num_input_tokens_seen": 63829116, "step": 20500 }, { "epoch": 0.84, "grad_norm": 0.30011287331581116, "learning_rate": 8.000000000000001e-06, "loss": 0.1187, "num_input_tokens_seen": 65361664, "step": 21000 }, { "epoch": 0.86, "grad_norm": 0.3339782953262329, "learning_rate": 7.000000000000001e-06, "loss": 0.1149, "num_input_tokens_seen": 66920576, "step": 21500 }, { "epoch": 0.88, "grad_norm": 0.2934107780456543, "learning_rate": 6e-06, "loss": 0.1153, "num_input_tokens_seen": 68429876, "step": 22000 }, { "epoch": 0.9, "grad_norm": 0.3181125521659851, "learning_rate": 5e-06, "loss": 0.1215, "num_input_tokens_seen": 70000044, "step": 22500 }, { "epoch": 0.9, "eval_loss": 0.10928191244602203, "eval_runtime": 45.5863, "eval_samples_per_second": 54.841, "eval_steps_per_second": 6.866, "num_input_tokens_seen": 70000044, "step": 22500 }, { "epoch": 0.92, "grad_norm": 0.5395247936248779, "learning_rate": 4.000000000000001e-06, "loss": 0.119, "num_input_tokens_seen": 71571012, "step": 23000 }, { "epoch": 0.94, "grad_norm": 1.1183191537857056, "learning_rate": 3e-06, "loss": 0.1196, "num_input_tokens_seen": 73134988, "step": 23500 }, { "epoch": 0.96, "grad_norm": 0.40107282996177673, "learning_rate": 2.0000000000000003e-06, "loss": 0.1107, "num_input_tokens_seen": 74666536, "step": 24000 }, { "epoch": 0.98, "grad_norm": 0.1762804388999939, "learning_rate": 1.0000000000000002e-06, "loss": 0.1096, "num_input_tokens_seen": 76182928, "step": 24500 }, { "epoch": 1.0, "grad_norm": 0.2036857306957245, "learning_rate": 0.0, "loss": 0.1207, "num_input_tokens_seen": 77750728, "step": 25000 }, { "epoch": 1.0, "eval_loss": 0.10940483212471008, "eval_runtime": 45.4609, "eval_samples_per_second": 54.992, "eval_steps_per_second": 6.885, "num_input_tokens_seen": 77750728, "step": 25000 }, { "epoch": 1.0, "num_input_tokens_seen": 77750728, "step": 25000, "total_flos": 2.8228754041061376e+16, "train_loss": 0.12799154373168944, "train_runtime": 4779.3877, "train_samples_per_second": 20.923, "train_steps_per_second": 5.231, "train_tokens_per_second": 16267.926 } ], "logging_steps": 500, "max_steps": 25000, "num_input_tokens_seen": 77750728, "num_train_epochs": 1, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8228754041061376e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }