diff --git "a/running_log.txt" "b/running_log.txt" new file mode 100644--- /dev/null +++ "b/running_log.txt" @@ -0,0 +1,2533 @@ +[INFO|2025-01-23 00:44:53] configuration_utils.py:679 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/164af3f02f5b3aedf8cbdb94aee9e2705ae31be5/config.json + +[INFO|2025-01-23 00:44:53] configuration_utils.py:746 >> Model config Qwen2Config { + "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} + + +[INFO|2025-01-23 00:45:33] tokenization_utils_base.py:2211 >> loading file tokenizer.model from cache at None + +[INFO|2025-01-23 00:45:33] tokenization_utils_base.py:2211 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/164af3f02f5b3aedf8cbdb94aee9e2705ae31be5/tokenizer.json + +[INFO|2025-01-23 00:45:33] tokenization_utils_base.py:2211 >> loading file added_tokens.json from cache at None + +[INFO|2025-01-23 00:45:33] tokenization_utils_base.py:2211 >> loading file special_tokens_map.json from cache at None + +[INFO|2025-01-23 00:45:33] tokenization_utils_base.py:2211 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/164af3f02f5b3aedf8cbdb94aee9e2705ae31be5/tokenizer_config.json + +[INFO|2025-01-23 00:45:33] tokenization_utils_base.py:2475 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. + +[INFO|2025-01-23 00:46:13] configuration_utils.py:679 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/164af3f02f5b3aedf8cbdb94aee9e2705ae31be5/config.json + +[INFO|2025-01-23 00:46:13] configuration_utils.py:746 >> Model config Qwen2Config { + "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} + + +[INFO|2025-01-23 00:46:54] tokenization_utils_base.py:2211 >> loading file tokenizer.model from cache at None + +[INFO|2025-01-23 00:46:54] tokenization_utils_base.py:2211 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/164af3f02f5b3aedf8cbdb94aee9e2705ae31be5/tokenizer.json + +[INFO|2025-01-23 00:46:54] tokenization_utils_base.py:2211 >> loading file added_tokens.json from cache at None + +[INFO|2025-01-23 00:46:54] tokenization_utils_base.py:2211 >> loading file special_tokens_map.json from cache at None + +[INFO|2025-01-23 00:46:54] tokenization_utils_base.py:2211 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/164af3f02f5b3aedf8cbdb94aee9e2705ae31be5/tokenizer_config.json + +[INFO|2025-01-23 00:46:54] tokenization_utils_base.py:2475 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. + +[INFO|2025-01-23 00:46:54] logging.py:157 >> Add <|end▁of▁sentence|> to stop words. + +[INFO|2025-01-23 00:46:54] logging.py:157 >> Loading dataset /root/LLaMA-Factory/data/opencsg/smoltalk-chinese/filtered_data/all_data.jsonl... + +[INFO|2025-01-23 00:50:46] configuration_utils.py:679 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/164af3f02f5b3aedf8cbdb94aee9e2705ae31be5/config.json + +[INFO|2025-01-23 00:50:46] configuration_utils.py:746 >> Model config Qwen2Config { + "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} + + +[INFO|2025-01-23 00:50:46] logging.py:157 >> Using block diagonal attention for sequence packing without cross-attention. + +[INFO|2025-01-23 00:50:56] modeling_utils.py:3937 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/164af3f02f5b3aedf8cbdb94aee9e2705ae31be5/model.safetensors + +[INFO|2025-01-23 00:50:56] modeling_utils.py:4080 >> Detected DeepSpeed ZeRO-3: activating zero.init() for this model + +[WARNING|2025-01-23 00:50:56] logging.py:328 >> You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour + +[WARNING|2025-01-23 00:50:56] logging.py:328 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. + +[INFO|2025-01-23 00:50:56] configuration_utils.py:1096 >> Generate config GenerationConfig { + "bos_token_id": 151643, + "eos_token_id": 151643, + "use_cache": false +} + + +[WARNING|2025-01-23 00:51:06] logging.py:328 >> You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour + +[WARNING|2025-01-23 00:51:06] logging.py:328 >> You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour + +[WARNING|2025-01-23 00:51:06] logging.py:328 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. + +[WARNING|2025-01-23 00:51:06] logging.py:328 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. + +[WARNING|2025-01-23 00:51:06] logging.py:328 >> You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour + +[WARNING|2025-01-23 00:51:06] logging.py:328 >> You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. + +[INFO|2025-01-23 00:51:07] modeling_utils.py:4800 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM. + + +[INFO|2025-01-23 00:51:07] modeling_utils.py:4808 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. + +[INFO|2025-01-23 00:51:17] configuration_utils.py:1051 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/164af3f02f5b3aedf8cbdb94aee9e2705ae31be5/generation_config.json + +[INFO|2025-01-23 00:51:17] configuration_utils.py:1096 >> Generate config GenerationConfig { + "bos_token_id": 151646, + "do_sample": true, + "eos_token_id": 151643, + "temperature": 0.6, + "top_p": 0.95 +} + + +[INFO|2025-01-23 00:51:17] logging.py:157 >> Gradient checkpointing enabled. + +[INFO|2025-01-23 00:51:17] logging.py:157 >> Using FlashAttention-2 for faster training and inference. + +[INFO|2025-01-23 00:51:17] logging.py:157 >> Pure bf16 / BAdam detected, remaining trainable params in half precision. + +[INFO|2025-01-23 00:51:17] logging.py:157 >> Fine-tuning method: Full + +[INFO|2025-01-23 00:51:17] logging.py:157 >> trainable params: 1,777,088,000 || all params: 1,777,088,000 || trainable%: 100.0000 + +[INFO|2025-01-23 00:51:18] trainer.py:698 >> Using auto half precision backend + +[INFO|2025-01-23 00:51:18] logging.py:157 >> Using BAdam optimizer with layer-wise update, switch mode is ascending, switch block every 100 steps, default start block is None + +[INFO|2025-01-23 00:51:20] trainer.py:2313 >> ***** Running training ***** + +[INFO|2025-01-23 00:51:20] trainer.py:2314 >> Num examples = 181,301 + +[INFO|2025-01-23 00:51:20] trainer.py:2315 >> Num Epochs = 2 + +[INFO|2025-01-23 00:51:20] trainer.py:2316 >> Instantaneous batch size per device = 4 + +[INFO|2025-01-23 00:51:20] trainer.py:2319 >> Total train batch size (w. parallel, distributed & accumulation) = 64 + +[INFO|2025-01-23 00:51:20] trainer.py:2320 >> Gradient Accumulation steps = 4 + +[INFO|2025-01-23 00:51:20] trainer.py:2321 >> Total optimization steps = 5,666 + +[INFO|2025-01-23 00:51:20] trainer.py:2322 >> Number of trainable parameters = 46,797,824 + +[INFO|2025-01-23 00:51:49] logging.py:157 >> {'loss': 2.3057, 'learning_rate': 1.5000e-05, 'epoch': 0.00} + +[INFO|2025-01-23 00:52:17] logging.py:157 >> {'loss': 2.4562, 'learning_rate': 3.0000e-05, 'epoch': 0.00} + +[INFO|2025-01-23 00:52:44] logging.py:157 >> {'loss': 2.2577, 'learning_rate': 4.5000e-05, 'epoch': 0.01} + +[INFO|2025-01-23 00:53:12] logging.py:157 >> {'loss': 2.4244, 'learning_rate': 6.0000e-05, 'epoch': 0.01} + +[INFO|2025-01-23 00:53:39] logging.py:157 >> {'loss': 2.4041, 'learning_rate': 7.5000e-05, 'epoch': 0.01} + +[INFO|2025-01-23 00:54:07] logging.py:157 >> {'loss': 2.2040, 'learning_rate': 9.0000e-05, 'epoch': 0.01} + +[INFO|2025-01-23 00:54:34] logging.py:157 >> {'loss': 2.3191, 'learning_rate': 1.0500e-04, 'epoch': 0.01} + +[INFO|2025-01-23 00:55:02] logging.py:157 >> {'loss': 2.2965, 'learning_rate': 1.2000e-04, 'epoch': 0.01} + +[INFO|2025-01-23 00:55:29] logging.py:157 >> {'loss': 2.1950, 'learning_rate': 1.3500e-04, 'epoch': 0.02} + +[INFO|2025-01-23 00:55:56] logging.py:157 >> {'loss': 2.3032, 'learning_rate': 1.5000e-04, 'epoch': 0.02} + +[INFO|2025-01-23 00:56:24] logging.py:157 >> {'loss': 2.1918, 'learning_rate': 1.6500e-04, 'epoch': 0.02} + +[INFO|2025-01-23 00:56:51] logging.py:157 >> {'loss': 2.3392, 'learning_rate': 1.8000e-04, 'epoch': 0.02} + +[INFO|2025-01-23 00:57:19] logging.py:157 >> {'loss': 2.2509, 'learning_rate': 1.9500e-04, 'epoch': 0.02} + +[INFO|2025-01-23 00:57:46] logging.py:157 >> {'loss': 2.2620, 'learning_rate': 2.1000e-04, 'epoch': 0.02} + +[INFO|2025-01-23 00:58:14] logging.py:157 >> {'loss': 2.0138, 'learning_rate': 2.2500e-04, 'epoch': 0.03} + +[INFO|2025-01-23 00:58:42] logging.py:157 >> {'loss': 2.2382, 'learning_rate': 2.4000e-04, 'epoch': 0.03} + +[INFO|2025-01-23 00:59:09] logging.py:157 >> {'loss': 2.2271, 'learning_rate': 2.5500e-04, 'epoch': 0.03} + +[INFO|2025-01-23 00:59:36] logging.py:157 >> {'loss': 2.3561, 'learning_rate': 2.7000e-04, 'epoch': 0.03} + +[INFO|2025-01-23 01:00:04] logging.py:157 >> {'loss': 2.2833, 'learning_rate': 2.8500e-04, 'epoch': 0.03} + +[INFO|2025-01-23 01:00:33] logging.py:157 >> {'loss': 2.1165, 'learning_rate': 3.0000e-04, 'epoch': 0.04} + +[INFO|2025-01-23 01:01:00] logging.py:157 >> {'loss': 2.0447, 'learning_rate': 3.0000e-04, 'epoch': 0.04} + +[INFO|2025-01-23 01:01:27] logging.py:157 >> {'loss': 2.4483, 'learning_rate': 3.0000e-04, 'epoch': 0.04} + +[INFO|2025-01-23 01:01:54] logging.py:157 >> {'loss': 2.2005, 'learning_rate': 2.9999e-04, 'epoch': 0.04} + +[INFO|2025-01-23 01:02:20] logging.py:157 >> {'loss': 2.4502, 'learning_rate': 2.9999e-04, 'epoch': 0.04} + +[INFO|2025-01-23 01:02:47] logging.py:157 >> {'loss': 2.2069, 'learning_rate': 2.9999e-04, 'epoch': 0.04} + +[INFO|2025-01-23 01:03:14] logging.py:157 >> {'loss': 2.2102, 'learning_rate': 2.9998e-04, 'epoch': 0.05} + +[INFO|2025-01-23 01:03:41] logging.py:157 >> {'loss': 2.2909, 'learning_rate': 2.9997e-04, 'epoch': 0.05} + +[INFO|2025-01-23 01:04:08] logging.py:157 >> {'loss': 2.0928, 'learning_rate': 2.9996e-04, 'epoch': 0.05} + +[INFO|2025-01-23 01:04:35] logging.py:157 >> {'loss': 2.1318, 'learning_rate': 2.9995e-04, 'epoch': 0.05} + +[INFO|2025-01-23 01:05:02] logging.py:157 >> {'loss': 2.0537, 'learning_rate': 2.9994e-04, 'epoch': 0.05} + +[INFO|2025-01-23 01:05:29] logging.py:157 >> {'loss': 2.1201, 'learning_rate': 2.9993e-04, 'epoch': 0.05} + +[INFO|2025-01-23 01:05:56] logging.py:157 >> {'loss': 2.2252, 'learning_rate': 2.9991e-04, 'epoch': 0.06} + +[INFO|2025-01-23 01:06:23] logging.py:157 >> {'loss': 2.1946, 'learning_rate': 2.9990e-04, 'epoch': 0.06} + +[INFO|2025-01-23 01:06:50] logging.py:157 >> {'loss': 2.1067, 'learning_rate': 2.9988e-04, 'epoch': 0.06} + +[INFO|2025-01-23 01:07:16] logging.py:157 >> {'loss': 2.3251, 'learning_rate': 2.9987e-04, 'epoch': 0.06} + +[INFO|2025-01-23 01:07:43] logging.py:157 >> {'loss': 2.0071, 'learning_rate': 2.9985e-04, 'epoch': 0.06} + +[INFO|2025-01-23 01:08:10] logging.py:157 >> {'loss': 2.2113, 'learning_rate': 2.9983e-04, 'epoch': 0.07} + +[INFO|2025-01-23 01:08:37] logging.py:157 >> {'loss': 2.2285, 'learning_rate': 2.9981e-04, 'epoch': 0.07} + +[INFO|2025-01-23 01:09:04] logging.py:157 >> {'loss': 2.1710, 'learning_rate': 2.9978e-04, 'epoch': 0.07} + +[INFO|2025-01-23 01:09:32] logging.py:157 >> {'loss': 2.0817, 'learning_rate': 2.9976e-04, 'epoch': 0.07} + +[INFO|2025-01-23 01:09:59] logging.py:157 >> {'loss': 2.2159, 'learning_rate': 2.9974e-04, 'epoch': 0.07} + +[INFO|2025-01-23 01:10:25] logging.py:157 >> {'loss': 2.1484, 'learning_rate': 2.9971e-04, 'epoch': 0.07} + +[INFO|2025-01-23 01:10:51] logging.py:157 >> {'loss': 2.1547, 'learning_rate': 2.9968e-04, 'epoch': 0.08} + +[INFO|2025-01-23 01:11:17] logging.py:157 >> {'loss': 2.1373, 'learning_rate': 2.9966e-04, 'epoch': 0.08} + +[INFO|2025-01-23 01:11:44] logging.py:157 >> {'loss': 2.2049, 'learning_rate': 2.9963e-04, 'epoch': 0.08} + +[INFO|2025-01-23 01:12:10] logging.py:157 >> {'loss': 2.1362, 'learning_rate': 2.9960e-04, 'epoch': 0.08} + +[INFO|2025-01-23 01:12:36] logging.py:157 >> {'loss': 2.0550, 'learning_rate': 2.9956e-04, 'epoch': 0.08} + +[INFO|2025-01-23 01:13:03] logging.py:157 >> {'loss': 2.0891, 'learning_rate': 2.9953e-04, 'epoch': 0.08} + +[INFO|2025-01-23 01:13:29] logging.py:157 >> {'loss': 2.2344, 'learning_rate': 2.9950e-04, 'epoch': 0.09} + +[INFO|2025-01-23 01:13:56] logging.py:157 >> {'loss': 2.2645, 'learning_rate': 2.9946e-04, 'epoch': 0.09} + +[INFO|2025-01-23 01:14:22] logging.py:157 >> {'loss': 2.1878, 'learning_rate': 2.9943e-04, 'epoch': 0.09} + +[INFO|2025-01-23 01:14:48] logging.py:157 >> {'loss': 2.3129, 'learning_rate': 2.9939e-04, 'epoch': 0.09} + +[INFO|2025-01-23 01:15:15] logging.py:157 >> {'loss': 1.9386, 'learning_rate': 2.9935e-04, 'epoch': 0.09} + +[INFO|2025-01-23 01:15:41] logging.py:157 >> {'loss': 2.2377, 'learning_rate': 2.9931e-04, 'epoch': 0.10} + +[INFO|2025-01-23 01:16:07] logging.py:157 >> {'loss': 2.3703, 'learning_rate': 2.9927e-04, 'epoch': 0.10} + +[INFO|2025-01-23 01:16:33] logging.py:157 >> {'loss': 2.0394, 'learning_rate': 2.9923e-04, 'epoch': 0.10} + +[INFO|2025-01-23 01:17:00] logging.py:157 >> {'loss': 2.2186, 'learning_rate': 2.9918e-04, 'epoch': 0.10} + +[INFO|2025-01-23 01:17:26] logging.py:157 >> {'loss': 2.1372, 'learning_rate': 2.9914e-04, 'epoch': 0.10} + +[INFO|2025-01-23 01:17:52] logging.py:157 >> {'loss': 2.1068, 'learning_rate': 2.9909e-04, 'epoch': 0.10} + +[INFO|2025-01-23 01:18:20] logging.py:157 >> {'loss': 2.2192, 'learning_rate': 2.9905e-04, 'epoch': 0.11} + +[INFO|2025-01-23 01:18:46] logging.py:157 >> {'loss': 2.1785, 'learning_rate': 2.9900e-04, 'epoch': 0.11} + +[INFO|2025-01-23 01:19:12] logging.py:157 >> {'loss': 2.2577, 'learning_rate': 2.9895e-04, 'epoch': 0.11} + +[INFO|2025-01-23 01:19:37] logging.py:157 >> {'loss': 2.2192, 'learning_rate': 2.9890e-04, 'epoch': 0.11} + +[INFO|2025-01-23 01:20:03] logging.py:157 >> {'loss': 2.2923, 'learning_rate': 2.9885e-04, 'epoch': 0.11} + +[INFO|2025-01-23 01:20:29] logging.py:157 >> {'loss': 2.1331, 'learning_rate': 2.9879e-04, 'epoch': 0.11} + +[INFO|2025-01-23 01:20:55] logging.py:157 >> {'loss': 2.0629, 'learning_rate': 2.9874e-04, 'epoch': 0.12} + +[INFO|2025-01-23 01:21:21] logging.py:157 >> {'loss': 2.0498, 'learning_rate': 2.9868e-04, 'epoch': 0.12} + +[INFO|2025-01-23 01:21:47] logging.py:157 >> {'loss': 2.2615, 'learning_rate': 2.9863e-04, 'epoch': 0.12} + +[INFO|2025-01-23 01:22:13] logging.py:157 >> {'loss': 2.1939, 'learning_rate': 2.9857e-04, 'epoch': 0.12} + +[INFO|2025-01-23 01:22:39] logging.py:157 >> {'loss': 2.2365, 'learning_rate': 2.9851e-04, 'epoch': 0.12} + +[INFO|2025-01-23 01:23:04] logging.py:157 >> {'loss': 2.1711, 'learning_rate': 2.9845e-04, 'epoch': 0.13} + +[INFO|2025-01-23 01:23:30] logging.py:157 >> {'loss': 2.2582, 'learning_rate': 2.9839e-04, 'epoch': 0.13} + +[INFO|2025-01-23 01:23:56] logging.py:157 >> {'loss': 2.1152, 'learning_rate': 2.9833e-04, 'epoch': 0.13} + +[INFO|2025-01-23 01:24:22] logging.py:157 >> {'loss': 2.0446, 'learning_rate': 2.9826e-04, 'epoch': 0.13} + +[INFO|2025-01-23 01:24:48] logging.py:157 >> {'loss': 2.1343, 'learning_rate': 2.9820e-04, 'epoch': 0.13} + +[INFO|2025-01-23 01:25:13] logging.py:157 >> {'loss': 2.2858, 'learning_rate': 2.9813e-04, 'epoch': 0.13} + +[INFO|2025-01-23 01:25:39] logging.py:157 >> {'loss': 2.0600, 'learning_rate': 2.9806e-04, 'epoch': 0.14} + +[INFO|2025-01-23 01:26:05] logging.py:157 >> {'loss': 2.2976, 'learning_rate': 2.9800e-04, 'epoch': 0.14} + +[INFO|2025-01-23 01:26:31] logging.py:157 >> {'loss': 2.2986, 'learning_rate': 2.9793e-04, 'epoch': 0.14} + +[INFO|2025-01-23 01:26:58] logging.py:157 >> {'loss': 2.2626, 'learning_rate': 2.9785e-04, 'epoch': 0.14} + +[INFO|2025-01-23 01:27:23] logging.py:157 >> {'loss': 2.0912, 'learning_rate': 2.9778e-04, 'epoch': 0.14} + +[INFO|2025-01-23 01:27:48] logging.py:157 >> {'loss': 2.2395, 'learning_rate': 2.9771e-04, 'epoch': 0.14} + +[INFO|2025-01-23 01:28:14] logging.py:157 >> {'loss': 2.1716, 'learning_rate': 2.9764e-04, 'epoch': 0.15} + +[INFO|2025-01-23 01:28:39] logging.py:157 >> {'loss': 2.1567, 'learning_rate': 2.9756e-04, 'epoch': 0.15} + +[INFO|2025-01-23 01:29:04] logging.py:157 >> {'loss': 2.0570, 'learning_rate': 2.9748e-04, 'epoch': 0.15} + +[INFO|2025-01-23 01:29:30] logging.py:157 >> {'loss': 2.3069, 'learning_rate': 2.9741e-04, 'epoch': 0.15} + +[INFO|2025-01-23 01:29:55] logging.py:157 >> {'loss': 2.1936, 'learning_rate': 2.9733e-04, 'epoch': 0.15} + +[INFO|2025-01-23 01:30:20] logging.py:157 >> {'loss': 2.3136, 'learning_rate': 2.9725e-04, 'epoch': 0.16} + +[INFO|2025-01-23 01:30:45] logging.py:157 >> {'loss': 2.1568, 'learning_rate': 2.9717e-04, 'epoch': 0.16} + +[INFO|2025-01-23 01:31:11] logging.py:157 >> {'loss': 2.1949, 'learning_rate': 2.9708e-04, 'epoch': 0.16} + +[INFO|2025-01-23 01:31:36] logging.py:157 >> {'loss': 2.1642, 'learning_rate': 2.9700e-04, 'epoch': 0.16} + +[INFO|2025-01-23 01:32:01] logging.py:157 >> {'loss': 2.1097, 'learning_rate': 2.9691e-04, 'epoch': 0.16} + +[INFO|2025-01-23 01:32:27] logging.py:157 >> {'loss': 2.2478, 'learning_rate': 2.9683e-04, 'epoch': 0.16} + +[INFO|2025-01-23 01:32:52] logging.py:157 >> {'loss': 2.0988, 'learning_rate': 2.9674e-04, 'epoch': 0.17} + +[INFO|2025-01-23 01:33:17] logging.py:157 >> {'loss': 2.1584, 'learning_rate': 2.9665e-04, 'epoch': 0.17} + +[INFO|2025-01-23 01:33:42] logging.py:157 >> {'loss': 2.2052, 'learning_rate': 2.9656e-04, 'epoch': 0.17} + +[INFO|2025-01-23 01:34:08] logging.py:157 >> {'loss': 2.3180, 'learning_rate': 2.9647e-04, 'epoch': 0.17} + +[INFO|2025-01-23 01:34:33] logging.py:157 >> {'loss': 2.2473, 'learning_rate': 2.9638e-04, 'epoch': 0.17} + +[INFO|2025-01-23 01:34:58] logging.py:157 >> {'loss': 2.1800, 'learning_rate': 2.9629e-04, 'epoch': 0.17} + +[INFO|2025-01-23 01:35:25] logging.py:157 >> {'loss': 2.1010, 'learning_rate': 2.9619e-04, 'epoch': 0.18} + +[INFO|2025-01-23 01:35:50] logging.py:157 >> {'loss': 2.1359, 'learning_rate': 2.9610e-04, 'epoch': 0.18} + +[INFO|2025-01-23 01:36:15] logging.py:157 >> {'loss': 2.1597, 'learning_rate': 2.9600e-04, 'epoch': 0.18} + +[INFO|2025-01-23 01:36:39] logging.py:157 >> {'loss': 2.2543, 'learning_rate': 2.9590e-04, 'epoch': 0.18} + +[INFO|2025-01-23 01:37:04] logging.py:157 >> {'loss': 2.0531, 'learning_rate': 2.9580e-04, 'epoch': 0.18} + +[INFO|2025-01-23 01:37:29] logging.py:157 >> {'loss': 2.0309, 'learning_rate': 2.9570e-04, 'epoch': 0.19} + +[INFO|2025-01-23 01:37:54] logging.py:157 >> {'loss': 2.1264, 'learning_rate': 2.9560e-04, 'epoch': 0.19} + +[INFO|2025-01-23 01:38:18] logging.py:157 >> {'loss': 2.0760, 'learning_rate': 2.9550e-04, 'epoch': 0.19} + +[INFO|2025-01-23 01:38:43] logging.py:157 >> {'loss': 2.1084, 'learning_rate': 2.9540e-04, 'epoch': 0.19} + +[INFO|2025-01-23 01:39:08] logging.py:157 >> {'loss': 2.2279, 'learning_rate': 2.9529e-04, 'epoch': 0.19} + +[INFO|2025-01-23 01:39:33] logging.py:157 >> {'loss': 2.1829, 'learning_rate': 2.9519e-04, 'epoch': 0.19} + +[INFO|2025-01-23 01:39:57] logging.py:157 >> {'loss': 1.9802, 'learning_rate': 2.9508e-04, 'epoch': 0.20} + +[INFO|2025-01-23 01:40:22] logging.py:157 >> {'loss': 2.0829, 'learning_rate': 2.9497e-04, 'epoch': 0.20} + +[INFO|2025-01-23 01:40:47] logging.py:157 >> {'loss': 2.1448, 'learning_rate': 2.9486e-04, 'epoch': 0.20} + +[INFO|2025-01-23 01:41:12] logging.py:157 >> {'loss': 2.0970, 'learning_rate': 2.9475e-04, 'epoch': 0.20} + +[INFO|2025-01-23 01:41:36] logging.py:157 >> {'loss': 2.0590, 'learning_rate': 2.9464e-04, 'epoch': 0.20} + +[INFO|2025-01-23 01:42:01] logging.py:157 >> {'loss': 2.1296, 'learning_rate': 2.9453e-04, 'epoch': 0.20} + +[INFO|2025-01-23 01:42:26] logging.py:157 >> {'loss': 2.1708, 'learning_rate': 2.9441e-04, 'epoch': 0.21} + +[INFO|2025-01-23 01:42:50] logging.py:157 >> {'loss': 2.0103, 'learning_rate': 2.9430e-04, 'epoch': 0.21} + +[INFO|2025-01-23 01:43:15] logging.py:157 >> {'loss': 2.1780, 'learning_rate': 2.9418e-04, 'epoch': 0.21} + +[INFO|2025-01-23 01:43:41] logging.py:157 >> {'loss': 2.1516, 'learning_rate': 2.9407e-04, 'epoch': 0.21} + +[INFO|2025-01-23 01:44:05] logging.py:157 >> {'loss': 2.0929, 'learning_rate': 2.9395e-04, 'epoch': 0.21} + +[INFO|2025-01-23 01:44:30] logging.py:157 >> {'loss': 2.2197, 'learning_rate': 2.9383e-04, 'epoch': 0.22} + +[INFO|2025-01-23 01:44:54] logging.py:157 >> {'loss': 2.3015, 'learning_rate': 2.9371e-04, 'epoch': 0.22} + +[INFO|2025-01-23 01:45:18] logging.py:157 >> {'loss': 2.0767, 'learning_rate': 2.9359e-04, 'epoch': 0.22} + +[INFO|2025-01-23 01:45:42] logging.py:157 >> {'loss': 1.9616, 'learning_rate': 2.9346e-04, 'epoch': 0.22} + +[INFO|2025-01-23 01:46:07] logging.py:157 >> {'loss': 1.9926, 'learning_rate': 2.9334e-04, 'epoch': 0.22} + +[INFO|2025-01-23 01:46:31] logging.py:157 >> {'loss': 2.1212, 'learning_rate': 2.9321e-04, 'epoch': 0.22} + +[INFO|2025-01-23 01:46:55] logging.py:157 >> {'loss': 2.0959, 'learning_rate': 2.9309e-04, 'epoch': 0.23} + +[INFO|2025-01-23 01:47:19] logging.py:157 >> {'loss': 2.1843, 'learning_rate': 2.9296e-04, 'epoch': 0.23} + +[INFO|2025-01-23 01:47:44] logging.py:157 >> {'loss': 2.0457, 'learning_rate': 2.9283e-04, 'epoch': 0.23} + +[INFO|2025-01-23 01:48:08] logging.py:157 >> {'loss': 2.0672, 'learning_rate': 2.9270e-04, 'epoch': 0.23} + +[INFO|2025-01-23 01:48:32] logging.py:157 >> {'loss': 2.2223, 'learning_rate': 2.9257e-04, 'epoch': 0.23} + +[INFO|2025-01-23 01:48:57] logging.py:157 >> {'loss': 2.2904, 'learning_rate': 2.9244e-04, 'epoch': 0.23} + +[INFO|2025-01-23 01:49:21] logging.py:157 >> {'loss': 2.0731, 'learning_rate': 2.9230e-04, 'epoch': 0.24} + +[INFO|2025-01-23 01:49:45] logging.py:157 >> {'loss': 2.2583, 'learning_rate': 2.9217e-04, 'epoch': 0.24} + +[INFO|2025-01-23 01:50:09] logging.py:157 >> {'loss': 2.2264, 'learning_rate': 2.9203e-04, 'epoch': 0.24} + +[INFO|2025-01-23 01:50:33] logging.py:157 >> {'loss': 1.9708, 'learning_rate': 2.9190e-04, 'epoch': 0.24} + +[INFO|2025-01-23 01:50:57] logging.py:157 >> {'loss': 2.1891, 'learning_rate': 2.9176e-04, 'epoch': 0.24} + +[INFO|2025-01-23 01:51:22] logging.py:157 >> {'loss': 2.0362, 'learning_rate': 2.9162e-04, 'epoch': 0.25} + +[INFO|2025-01-23 01:51:47] logging.py:157 >> {'loss': 2.0718, 'learning_rate': 2.9148e-04, 'epoch': 0.25} + +[INFO|2025-01-23 01:52:11] logging.py:157 >> {'loss': 2.1238, 'learning_rate': 2.9134e-04, 'epoch': 0.25} + +[INFO|2025-01-23 01:52:35] logging.py:157 >> {'loss': 2.1241, 'learning_rate': 2.9120e-04, 'epoch': 0.25} + +[INFO|2025-01-23 01:52:59] logging.py:157 >> {'loss': 2.1697, 'learning_rate': 2.9105e-04, 'epoch': 0.25} + +[INFO|2025-01-23 01:53:23] logging.py:157 >> {'loss': 2.2821, 'learning_rate': 2.9091e-04, 'epoch': 0.25} + +[INFO|2025-01-23 01:53:47] logging.py:157 >> {'loss': 2.2057, 'learning_rate': 2.9076e-04, 'epoch': 0.26} + +[INFO|2025-01-23 01:54:11] logging.py:157 >> {'loss': 2.1832, 'learning_rate': 2.9062e-04, 'epoch': 0.26} + +[INFO|2025-01-23 01:54:35] logging.py:157 >> {'loss': 1.9967, 'learning_rate': 2.9047e-04, 'epoch': 0.26} + +[INFO|2025-01-23 01:54:59] logging.py:157 >> {'loss': 2.2037, 'learning_rate': 2.9032e-04, 'epoch': 0.26} + +[INFO|2025-01-23 01:55:22] logging.py:157 >> {'loss': 2.1817, 'learning_rate': 2.9017e-04, 'epoch': 0.26} + +[INFO|2025-01-23 01:55:46] logging.py:157 >> {'loss': 2.2453, 'learning_rate': 2.9002e-04, 'epoch': 0.26} + +[INFO|2025-01-23 01:56:10] logging.py:157 >> {'loss': 2.1835, 'learning_rate': 2.8987e-04, 'epoch': 0.27} + +[INFO|2025-01-23 01:56:34] logging.py:157 >> {'loss': 2.2215, 'learning_rate': 2.8971e-04, 'epoch': 0.27} + +[INFO|2025-01-23 01:56:58] logging.py:157 >> {'loss': 2.0813, 'learning_rate': 2.8956e-04, 'epoch': 0.27} + +[INFO|2025-01-23 01:57:22] logging.py:157 >> {'loss': 2.1511, 'learning_rate': 2.8940e-04, 'epoch': 0.27} + +[INFO|2025-01-23 01:57:46] logging.py:157 >> {'loss': 2.0847, 'learning_rate': 2.8924e-04, 'epoch': 0.27} + +[INFO|2025-01-23 01:58:09] logging.py:157 >> {'loss': 2.1602, 'learning_rate': 2.8909e-04, 'epoch': 0.28} + +[INFO|2025-01-23 01:58:33] logging.py:157 >> {'loss': 2.1164, 'learning_rate': 2.8893e-04, 'epoch': 0.28} + +[INFO|2025-01-23 01:58:57] logging.py:157 >> {'loss': 2.2996, 'learning_rate': 2.8877e-04, 'epoch': 0.28} + +[INFO|2025-01-23 01:59:21] logging.py:157 >> {'loss': 2.1099, 'learning_rate': 2.8861e-04, 'epoch': 0.28} + +[INFO|2025-01-23 01:59:46] logging.py:157 >> {'loss': 2.2077, 'learning_rate': 2.8844e-04, 'epoch': 0.28} + +[INFO|2025-01-23 02:00:10] logging.py:157 >> {'loss': 2.1466, 'learning_rate': 2.8828e-04, 'epoch': 0.28} + +[INFO|2025-01-23 02:00:33] logging.py:157 >> {'loss': 2.1529, 'learning_rate': 2.8812e-04, 'epoch': 0.29} + +[INFO|2025-01-23 02:00:57] logging.py:157 >> {'loss': 2.1245, 'learning_rate': 2.8795e-04, 'epoch': 0.29} + +[INFO|2025-01-23 02:01:20] logging.py:157 >> {'loss': 2.2219, 'learning_rate': 2.8778e-04, 'epoch': 0.29} + +[INFO|2025-01-23 02:01:43] logging.py:157 >> {'loss': 2.0120, 'learning_rate': 2.8762e-04, 'epoch': 0.29} + +[INFO|2025-01-23 02:02:07] logging.py:157 >> {'loss': 2.2461, 'learning_rate': 2.8745e-04, 'epoch': 0.29} + +[INFO|2025-01-23 02:02:30] logging.py:157 >> {'loss': 2.0860, 'learning_rate': 2.8728e-04, 'epoch': 0.29} + +[INFO|2025-01-23 02:02:54] logging.py:157 >> {'loss': 2.1209, 'learning_rate': 2.8711e-04, 'epoch': 0.30} + +[INFO|2025-01-23 02:03:17] logging.py:157 >> {'loss': 2.0898, 'learning_rate': 2.8693e-04, 'epoch': 0.30} + +[INFO|2025-01-23 02:03:40] logging.py:157 >> {'loss': 2.0256, 'learning_rate': 2.8676e-04, 'epoch': 0.30} + +[INFO|2025-01-23 02:04:04] logging.py:157 >> {'loss': 2.1845, 'learning_rate': 2.8659e-04, 'epoch': 0.30} + +[INFO|2025-01-23 02:04:27] logging.py:157 >> {'loss': 2.0018, 'learning_rate': 2.8641e-04, 'epoch': 0.30} + +[INFO|2025-01-23 02:04:50] logging.py:157 >> {'loss': 2.3826, 'learning_rate': 2.8623e-04, 'epoch': 0.31} + +[INFO|2025-01-23 02:05:14] logging.py:157 >> {'loss': 2.1097, 'learning_rate': 2.8606e-04, 'epoch': 0.31} + +[INFO|2025-01-23 02:05:37] logging.py:157 >> {'loss': 2.1060, 'learning_rate': 2.8588e-04, 'epoch': 0.31} + +[INFO|2025-01-23 02:06:01] logging.py:157 >> {'loss': 2.1243, 'learning_rate': 2.8570e-04, 'epoch': 0.31} + +[INFO|2025-01-23 02:06:24] logging.py:157 >> {'loss': 2.0857, 'learning_rate': 2.8552e-04, 'epoch': 0.31} + +[INFO|2025-01-23 02:06:47] logging.py:157 >> {'loss': 2.1684, 'learning_rate': 2.8533e-04, 'epoch': 0.31} + +[INFO|2025-01-23 02:07:11] logging.py:157 >> {'loss': 2.0861, 'learning_rate': 2.8515e-04, 'epoch': 0.32} + +[INFO|2025-01-23 02:07:35] logging.py:157 >> {'loss': 2.1541, 'learning_rate': 2.8497e-04, 'epoch': 0.32} + +[INFO|2025-01-23 02:07:58] logging.py:157 >> {'loss': 2.0715, 'learning_rate': 2.8478e-04, 'epoch': 0.32} + +[INFO|2025-01-23 02:08:21] logging.py:157 >> {'loss': 2.2075, 'learning_rate': 2.8459e-04, 'epoch': 0.32} + +[INFO|2025-01-23 02:08:44] logging.py:157 >> {'loss': 2.0532, 'learning_rate': 2.8441e-04, 'epoch': 0.32} + +[INFO|2025-01-23 02:09:06] logging.py:157 >> {'loss': 2.2120, 'learning_rate': 2.8422e-04, 'epoch': 0.32} + +[INFO|2025-01-23 02:09:29] logging.py:157 >> {'loss': 2.2024, 'learning_rate': 2.8403e-04, 'epoch': 0.33} + +[INFO|2025-01-23 02:09:51] logging.py:157 >> {'loss': 2.2150, 'learning_rate': 2.8384e-04, 'epoch': 0.33} + +[INFO|2025-01-23 02:10:14] logging.py:157 >> {'loss': 2.1015, 'learning_rate': 2.8365e-04, 'epoch': 0.33} + +[INFO|2025-01-23 02:10:37] logging.py:157 >> {'loss': 2.0577, 'learning_rate': 2.8345e-04, 'epoch': 0.33} + +[INFO|2025-01-23 02:10:59] logging.py:157 >> {'loss': 2.1047, 'learning_rate': 2.8326e-04, 'epoch': 0.33} + +[INFO|2025-01-23 02:11:22] logging.py:157 >> {'loss': 2.0823, 'learning_rate': 2.8307e-04, 'epoch': 0.34} + +[INFO|2025-01-23 02:11:44] logging.py:157 >> {'loss': 2.2383, 'learning_rate': 2.8287e-04, 'epoch': 0.34} + +[INFO|2025-01-23 02:12:07] logging.py:157 >> {'loss': 2.1302, 'learning_rate': 2.8267e-04, 'epoch': 0.34} + +[INFO|2025-01-23 02:12:29] logging.py:157 >> {'loss': 2.2524, 'learning_rate': 2.8247e-04, 'epoch': 0.34} + +[INFO|2025-01-23 02:12:52] logging.py:157 >> {'loss': 1.9753, 'learning_rate': 2.8228e-04, 'epoch': 0.34} + +[INFO|2025-01-23 02:13:15] logging.py:157 >> {'loss': 1.9766, 'learning_rate': 2.8208e-04, 'epoch': 0.34} + +[INFO|2025-01-23 02:13:37] logging.py:157 >> {'loss': 2.0491, 'learning_rate': 2.8187e-04, 'epoch': 0.35} + +[INFO|2025-01-23 02:14:00] logging.py:157 >> {'loss': 2.0357, 'learning_rate': 2.8167e-04, 'epoch': 0.35} + +[INFO|2025-01-23 02:14:23] logging.py:157 >> {'loss': 2.0826, 'learning_rate': 2.8147e-04, 'epoch': 0.35} + +[INFO|2025-01-23 02:14:45] logging.py:157 >> {'loss': 2.1531, 'learning_rate': 2.8126e-04, 'epoch': 0.35} + +[INFO|2025-01-23 02:15:09] logging.py:157 >> {'loss': 2.1923, 'learning_rate': 2.8106e-04, 'epoch': 0.35} + +[INFO|2025-01-23 02:15:31] logging.py:157 >> {'loss': 2.0221, 'learning_rate': 2.8085e-04, 'epoch': 0.35} + +[INFO|2025-01-23 02:15:53] logging.py:157 >> {'loss': 1.9867, 'learning_rate': 2.8065e-04, 'epoch': 0.36} + +[INFO|2025-01-23 02:16:16] logging.py:157 >> {'loss': 2.1052, 'learning_rate': 2.8044e-04, 'epoch': 0.36} + +[INFO|2025-01-23 02:16:38] logging.py:157 >> {'loss': 2.0658, 'learning_rate': 2.8023e-04, 'epoch': 0.36} + +[INFO|2025-01-23 02:17:00] logging.py:157 >> {'loss': 2.0377, 'learning_rate': 2.8002e-04, 'epoch': 0.36} + +[INFO|2025-01-23 02:17:22] logging.py:157 >> {'loss': 2.0155, 'learning_rate': 2.7980e-04, 'epoch': 0.36} + +[INFO|2025-01-23 02:17:44] logging.py:157 >> {'loss': 2.1023, 'learning_rate': 2.7959e-04, 'epoch': 0.37} + +[INFO|2025-01-23 02:18:06] logging.py:157 >> {'loss': 2.0894, 'learning_rate': 2.7938e-04, 'epoch': 0.37} + +[INFO|2025-01-23 02:18:28] logging.py:157 >> {'loss': 2.1916, 'learning_rate': 2.7916e-04, 'epoch': 0.37} + +[INFO|2025-01-23 02:18:50] logging.py:157 >> {'loss': 2.0841, 'learning_rate': 2.7895e-04, 'epoch': 0.37} + +[INFO|2025-01-23 02:19:12] logging.py:157 >> {'loss': 2.0588, 'learning_rate': 2.7873e-04, 'epoch': 0.37} + +[INFO|2025-01-23 02:19:34] logging.py:157 >> {'loss': 2.0739, 'learning_rate': 2.7851e-04, 'epoch': 0.37} + +[INFO|2025-01-23 02:19:56] logging.py:157 >> {'loss': 2.1457, 'learning_rate': 2.7829e-04, 'epoch': 0.38} + +[INFO|2025-01-23 02:20:18] logging.py:157 >> {'loss': 2.0158, 'learning_rate': 2.7807e-04, 'epoch': 0.38} + +[INFO|2025-01-23 02:20:40] logging.py:157 >> {'loss': 2.0311, 'learning_rate': 2.7785e-04, 'epoch': 0.38} + +[INFO|2025-01-23 02:21:02] logging.py:157 >> {'loss': 2.1531, 'learning_rate': 2.7763e-04, 'epoch': 0.38} + +[INFO|2025-01-23 02:21:24] logging.py:157 >> {'loss': 2.1248, 'learning_rate': 2.7741e-04, 'epoch': 0.38} + +[INFO|2025-01-23 02:21:46] logging.py:157 >> {'loss': 2.2637, 'learning_rate': 2.7719e-04, 'epoch': 0.38} + +[INFO|2025-01-23 02:22:08] logging.py:157 >> {'loss': 2.0331, 'learning_rate': 2.7696e-04, 'epoch': 0.39} + +[INFO|2025-01-23 02:22:32] logging.py:157 >> {'loss': 2.0959, 'learning_rate': 2.7673e-04, 'epoch': 0.39} + +[INFO|2025-01-23 02:22:53] logging.py:157 >> {'loss': 2.1869, 'learning_rate': 2.7651e-04, 'epoch': 0.39} + +[INFO|2025-01-23 02:23:15] logging.py:157 >> {'loss': 2.1399, 'learning_rate': 2.7628e-04, 'epoch': 0.39} + +[INFO|2025-01-23 02:23:36] logging.py:157 >> {'loss': 2.0850, 'learning_rate': 2.7605e-04, 'epoch': 0.39} + +[INFO|2025-01-23 02:23:58] logging.py:157 >> {'loss': 2.0886, 'learning_rate': 2.7582e-04, 'epoch': 0.40} + +[INFO|2025-01-23 02:24:19] logging.py:157 >> {'loss': 2.0927, 'learning_rate': 2.7559e-04, 'epoch': 0.40} + +[INFO|2025-01-23 02:24:41] logging.py:157 >> {'loss': 2.1753, 'learning_rate': 2.7536e-04, 'epoch': 0.40} + +[INFO|2025-01-23 02:25:02] logging.py:157 >> {'loss': 2.0145, 'learning_rate': 2.7512e-04, 'epoch': 0.40} + +[INFO|2025-01-23 02:25:24] logging.py:157 >> {'loss': 2.2001, 'learning_rate': 2.7489e-04, 'epoch': 0.40} + +[INFO|2025-01-23 02:25:45] logging.py:157 >> {'loss': 2.1179, 'learning_rate': 2.7466e-04, 'epoch': 0.40} + +[INFO|2025-01-23 02:26:06] logging.py:157 >> {'loss': 2.0346, 'learning_rate': 2.7442e-04, 'epoch': 0.41} + +[INFO|2025-01-23 02:26:28] logging.py:157 >> {'loss': 2.0631, 'learning_rate': 2.7418e-04, 'epoch': 0.41} + +[INFO|2025-01-23 02:26:49] logging.py:157 >> {'loss': 2.0288, 'learning_rate': 2.7394e-04, 'epoch': 0.41} + +[INFO|2025-01-23 02:27:10] logging.py:157 >> {'loss': 2.1027, 'learning_rate': 2.7371e-04, 'epoch': 0.41} + +[INFO|2025-01-23 02:27:32] logging.py:157 >> {'loss': 2.1220, 'learning_rate': 2.7347e-04, 'epoch': 0.41} + +[INFO|2025-01-23 02:27:53] logging.py:157 >> {'loss': 2.0809, 'learning_rate': 2.7323e-04, 'epoch': 0.41} + +[INFO|2025-01-23 02:28:15] logging.py:157 >> {'loss': 2.0885, 'learning_rate': 2.7298e-04, 'epoch': 0.42} + +[INFO|2025-01-23 02:28:36] logging.py:157 >> {'loss': 2.1349, 'learning_rate': 2.7274e-04, 'epoch': 0.42} + +[INFO|2025-01-23 02:28:58] logging.py:157 >> {'loss': 1.9886, 'learning_rate': 2.7250e-04, 'epoch': 0.42} + +[INFO|2025-01-23 02:29:19] logging.py:157 >> {'loss': 2.0894, 'learning_rate': 2.7225e-04, 'epoch': 0.42} + +[INFO|2025-01-23 02:29:42] logging.py:157 >> {'loss': 2.0764, 'learning_rate': 2.7201e-04, 'epoch': 0.42} + +[INFO|2025-01-23 02:30:03] logging.py:157 >> {'loss': 2.2330, 'learning_rate': 2.7176e-04, 'epoch': 0.43} + +[INFO|2025-01-23 02:30:24] logging.py:157 >> {'loss': 2.2500, 'learning_rate': 2.7151e-04, 'epoch': 0.43} + +[INFO|2025-01-23 02:30:45] logging.py:157 >> {'loss': 2.0767, 'learning_rate': 2.7126e-04, 'epoch': 0.43} + +[INFO|2025-01-23 02:31:06] logging.py:157 >> {'loss': 2.0126, 'learning_rate': 2.7101e-04, 'epoch': 0.43} + +[INFO|2025-01-23 02:31:27] logging.py:157 >> {'loss': 2.0627, 'learning_rate': 2.7076e-04, 'epoch': 0.43} + +[INFO|2025-01-23 02:31:48] logging.py:157 >> {'loss': 2.0426, 'learning_rate': 2.7051e-04, 'epoch': 0.43} + +[INFO|2025-01-23 02:32:08] logging.py:157 >> {'loss': 2.1374, 'learning_rate': 2.7026e-04, 'epoch': 0.44} + +[INFO|2025-01-23 02:32:29] logging.py:157 >> {'loss': 2.0922, 'learning_rate': 2.7001e-04, 'epoch': 0.44} + +[INFO|2025-01-23 02:32:50] logging.py:157 >> {'loss': 2.0486, 'learning_rate': 2.6975e-04, 'epoch': 0.44} + +[INFO|2025-01-23 02:33:11] logging.py:157 >> {'loss': 1.9698, 'learning_rate': 2.6950e-04, 'epoch': 0.44} + +[INFO|2025-01-23 02:33:32] logging.py:157 >> {'loss': 1.9343, 'learning_rate': 2.6924e-04, 'epoch': 0.44} + +[INFO|2025-01-23 02:33:52] logging.py:157 >> {'loss': 2.0842, 'learning_rate': 2.6898e-04, 'epoch': 0.44} + +[INFO|2025-01-23 02:34:14] logging.py:157 >> {'loss': 2.1045, 'learning_rate': 2.6872e-04, 'epoch': 0.45} + +[INFO|2025-01-23 02:34:34] logging.py:157 >> {'loss': 2.0529, 'learning_rate': 2.6846e-04, 'epoch': 0.45} + +[INFO|2025-01-23 02:34:55] logging.py:157 >> {'loss': 2.0879, 'learning_rate': 2.6820e-04, 'epoch': 0.45} + +[INFO|2025-01-23 02:35:16] logging.py:157 >> {'loss': 2.1399, 'learning_rate': 2.6794e-04, 'epoch': 0.45} + +[INFO|2025-01-23 02:35:37] logging.py:157 >> {'loss': 2.0434, 'learning_rate': 2.6768e-04, 'epoch': 0.45} + +[INFO|2025-01-23 02:35:58] logging.py:157 >> {'loss': 2.1592, 'learning_rate': 2.6742e-04, 'epoch': 0.46} + +[INFO|2025-01-23 02:36:19] logging.py:157 >> {'loss': 1.9876, 'learning_rate': 2.6715e-04, 'epoch': 0.46} + +[INFO|2025-01-23 02:36:41] logging.py:157 >> {'loss': 2.1152, 'learning_rate': 2.6689e-04, 'epoch': 0.46} + +[INFO|2025-01-23 02:37:01] logging.py:157 >> {'loss': 2.0441, 'learning_rate': 2.6662e-04, 'epoch': 0.46} + +[INFO|2025-01-23 02:37:22] logging.py:157 >> {'loss': 2.0483, 'learning_rate': 2.6636e-04, 'epoch': 0.46} + +[INFO|2025-01-23 02:37:42] logging.py:157 >> {'loss': 2.1069, 'learning_rate': 2.6609e-04, 'epoch': 0.46} + +[INFO|2025-01-23 02:38:02] logging.py:157 >> {'loss': 1.9740, 'learning_rate': 2.6582e-04, 'epoch': 0.47} + +[INFO|2025-01-23 02:38:22] logging.py:157 >> {'loss': 2.1250, 'learning_rate': 2.6555e-04, 'epoch': 0.47} + +[INFO|2025-01-23 02:38:43] logging.py:157 >> {'loss': 2.0957, 'learning_rate': 2.6528e-04, 'epoch': 0.47} + +[INFO|2025-01-23 02:39:03] logging.py:157 >> {'loss': 1.8768, 'learning_rate': 2.6501e-04, 'epoch': 0.47} + +[INFO|2025-01-23 02:39:24] logging.py:157 >> {'loss': 2.2316, 'learning_rate': 2.6474e-04, 'epoch': 0.47} + +[INFO|2025-01-23 02:39:44] logging.py:157 >> {'loss': 2.0952, 'learning_rate': 2.6446e-04, 'epoch': 0.47} + +[INFO|2025-01-23 02:40:04] logging.py:157 >> {'loss': 1.9688, 'learning_rate': 2.6419e-04, 'epoch': 0.48} + +[INFO|2025-01-23 02:40:25] logging.py:157 >> {'loss': 2.1006, 'learning_rate': 2.6392e-04, 'epoch': 0.48} + +[INFO|2025-01-23 02:40:45] logging.py:157 >> {'loss': 2.1486, 'learning_rate': 2.6364e-04, 'epoch': 0.48} + +[INFO|2025-01-23 02:41:06] logging.py:157 >> {'loss': 1.9104, 'learning_rate': 2.6336e-04, 'epoch': 0.48} + +[INFO|2025-01-23 02:41:26] logging.py:157 >> {'loss': 2.0982, 'learning_rate': 2.6308e-04, 'epoch': 0.48} + +[INFO|2025-01-23 02:41:46] logging.py:157 >> {'loss': 2.0679, 'learning_rate': 2.6281e-04, 'epoch': 0.49} + +[INFO|2025-01-23 02:42:07] logging.py:157 >> {'loss': 2.0957, 'learning_rate': 2.6253e-04, 'epoch': 0.49} + +[INFO|2025-01-23 02:42:27] logging.py:157 >> {'loss': 2.0209, 'learning_rate': 2.6225e-04, 'epoch': 0.49} + +[INFO|2025-01-23 02:42:47] logging.py:157 >> {'loss': 2.1004, 'learning_rate': 2.6197e-04, 'epoch': 0.49} + +[INFO|2025-01-23 02:43:08] logging.py:157 >> {'loss': 1.8670, 'learning_rate': 2.6168e-04, 'epoch': 0.49} + +[INFO|2025-01-23 02:43:30] logging.py:157 >> {'loss': 2.1669, 'learning_rate': 2.6140e-04, 'epoch': 0.49} + +[INFO|2025-01-23 02:43:50] logging.py:157 >> {'loss': 2.0645, 'learning_rate': 2.6112e-04, 'epoch': 0.50} + +[INFO|2025-01-23 02:44:09] logging.py:157 >> {'loss': 2.0385, 'learning_rate': 2.6083e-04, 'epoch': 0.50} + +[INFO|2025-01-23 02:44:29] logging.py:157 >> {'loss': 2.1873, 'learning_rate': 2.6055e-04, 'epoch': 0.50} + +[INFO|2025-01-23 02:44:49] logging.py:157 >> {'loss': 2.0318, 'learning_rate': 2.6026e-04, 'epoch': 0.50} + +[INFO|2025-01-23 02:45:09] logging.py:157 >> {'loss': 2.0639, 'learning_rate': 2.5997e-04, 'epoch': 0.50} + +[INFO|2025-01-23 02:45:29] logging.py:157 >> {'loss': 1.9877, 'learning_rate': 2.5968e-04, 'epoch': 0.50} + +[INFO|2025-01-23 02:45:48] logging.py:157 >> {'loss': 2.1717, 'learning_rate': 2.5939e-04, 'epoch': 0.51} + +[INFO|2025-01-23 02:46:08] logging.py:157 >> {'loss': 2.1521, 'learning_rate': 2.5910e-04, 'epoch': 0.51} + +[INFO|2025-01-23 02:46:28] logging.py:157 >> {'loss': 2.0867, 'learning_rate': 2.5881e-04, 'epoch': 0.51} + +[INFO|2025-01-23 02:46:48] logging.py:157 >> {'loss': 2.2151, 'learning_rate': 2.5852e-04, 'epoch': 0.51} + +[INFO|2025-01-23 02:47:07] logging.py:157 >> {'loss': 2.1646, 'learning_rate': 2.5823e-04, 'epoch': 0.51} + +[INFO|2025-01-23 02:47:27] logging.py:157 >> {'loss': 2.0988, 'learning_rate': 2.5793e-04, 'epoch': 0.52} + +[INFO|2025-01-23 02:47:47] logging.py:157 >> {'loss': 2.0323, 'learning_rate': 2.5764e-04, 'epoch': 0.52} + +[INFO|2025-01-23 02:48:07] logging.py:157 >> {'loss': 2.2742, 'learning_rate': 2.5735e-04, 'epoch': 0.52} + +[INFO|2025-01-23 02:48:26] logging.py:157 >> {'loss': 2.0879, 'learning_rate': 2.5705e-04, 'epoch': 0.52} + +[INFO|2025-01-23 02:48:46] logging.py:157 >> {'loss': 2.1998, 'learning_rate': 2.5675e-04, 'epoch': 0.52} + +[INFO|2025-01-23 02:49:06] logging.py:157 >> {'loss': 2.2052, 'learning_rate': 2.5645e-04, 'epoch': 0.52} + +[INFO|2025-01-23 02:49:26] logging.py:157 >> {'loss': 2.2198, 'learning_rate': 2.5616e-04, 'epoch': 0.53} + +[INFO|2025-01-23 02:49:46] logging.py:157 >> {'loss': 2.0614, 'learning_rate': 2.5586e-04, 'epoch': 0.53} + +[INFO|2025-01-23 02:50:07] logging.py:157 >> {'loss': 2.2121, 'learning_rate': 2.5556e-04, 'epoch': 0.53} + +[INFO|2025-01-23 02:50:26] logging.py:157 >> {'loss': 2.0787, 'learning_rate': 2.5525e-04, 'epoch': 0.53} + +[INFO|2025-01-23 02:50:45] logging.py:157 >> {'loss': 1.9809, 'learning_rate': 2.5495e-04, 'epoch': 0.53} + +[INFO|2025-01-23 02:51:05] logging.py:157 >> {'loss': 2.0939, 'learning_rate': 2.5465e-04, 'epoch': 0.53} + +[INFO|2025-01-23 02:51:24] logging.py:157 >> {'loss': 1.9998, 'learning_rate': 2.5435e-04, 'epoch': 0.54} + +[INFO|2025-01-23 02:51:44] logging.py:157 >> {'loss': 1.9074, 'learning_rate': 2.5404e-04, 'epoch': 0.54} + +[INFO|2025-01-23 02:52:03] logging.py:157 >> {'loss': 2.0548, 'learning_rate': 2.5374e-04, 'epoch': 0.54} + +[INFO|2025-01-23 02:52:22] logging.py:157 >> {'loss': 1.9747, 'learning_rate': 2.5343e-04, 'epoch': 0.54} + +[INFO|2025-01-23 02:52:41] logging.py:157 >> {'loss': 2.0348, 'learning_rate': 2.5312e-04, 'epoch': 0.54} + +[INFO|2025-01-23 02:53:01] logging.py:157 >> {'loss': 2.1421, 'learning_rate': 2.5282e-04, 'epoch': 0.55} + +[INFO|2025-01-23 02:53:20] logging.py:157 >> {'loss': 2.0219, 'learning_rate': 2.5251e-04, 'epoch': 0.55} + +[INFO|2025-01-23 02:53:39] logging.py:157 >> {'loss': 2.1173, 'learning_rate': 2.5220e-04, 'epoch': 0.55} + +[INFO|2025-01-23 02:53:58] logging.py:157 >> {'loss': 2.1262, 'learning_rate': 2.5189e-04, 'epoch': 0.55} + +[INFO|2025-01-23 02:54:18] logging.py:157 >> {'loss': 2.1402, 'learning_rate': 2.5158e-04, 'epoch': 0.55} + +[INFO|2025-01-23 02:54:37] logging.py:157 >> {'loss': 2.0886, 'learning_rate': 2.5126e-04, 'epoch': 0.55} + +[INFO|2025-01-23 02:54:56] logging.py:157 >> {'loss': 2.1521, 'learning_rate': 2.5095e-04, 'epoch': 0.56} + +[INFO|2025-01-23 02:55:15] logging.py:157 >> {'loss': 2.0807, 'learning_rate': 2.5064e-04, 'epoch': 0.56} + +[INFO|2025-01-23 02:55:35] logging.py:157 >> {'loss': 2.0106, 'learning_rate': 2.5032e-04, 'epoch': 0.56} + +[INFO|2025-01-23 02:55:54] logging.py:157 >> {'loss': 2.0087, 'learning_rate': 2.5001e-04, 'epoch': 0.56} + +[INFO|2025-01-23 02:56:13] logging.py:157 >> {'loss': 1.9312, 'learning_rate': 2.4969e-04, 'epoch': 0.56} + +[INFO|2025-01-23 02:56:34] logging.py:157 >> {'loss': 2.2043, 'learning_rate': 2.4938e-04, 'epoch': 0.56} + +[INFO|2025-01-23 02:56:52] logging.py:157 >> {'loss': 2.2137, 'learning_rate': 2.4906e-04, 'epoch': 0.57} + +[INFO|2025-01-23 02:57:11] logging.py:157 >> {'loss': 1.9218, 'learning_rate': 2.4874e-04, 'epoch': 0.57} + +[INFO|2025-01-23 02:57:30] logging.py:157 >> {'loss': 2.2093, 'learning_rate': 2.4842e-04, 'epoch': 0.57} + +[INFO|2025-01-23 02:57:48] logging.py:157 >> {'loss': 2.0617, 'learning_rate': 2.4810e-04, 'epoch': 0.57} + +[INFO|2025-01-23 02:58:07] logging.py:157 >> {'loss': 2.1588, 'learning_rate': 2.4778e-04, 'epoch': 0.57} + +[INFO|2025-01-23 02:58:26] logging.py:157 >> {'loss': 2.0023, 'learning_rate': 2.4746e-04, 'epoch': 0.58} + +[INFO|2025-01-23 02:58:45] logging.py:157 >> {'loss': 2.0192, 'learning_rate': 2.4714e-04, 'epoch': 0.58} + +[INFO|2025-01-23 02:59:03] logging.py:157 >> {'loss': 2.1808, 'learning_rate': 2.4681e-04, 'epoch': 0.58} + +[INFO|2025-01-23 02:59:22] logging.py:157 >> {'loss': 1.9863, 'learning_rate': 2.4649e-04, 'epoch': 0.58} + +[INFO|2025-01-23 02:59:41] logging.py:157 >> {'loss': 2.1675, 'learning_rate': 2.4617e-04, 'epoch': 0.58} + +[INFO|2025-01-23 03:00:00] logging.py:157 >> {'loss': 2.1104, 'learning_rate': 2.4584e-04, 'epoch': 0.58} + +[INFO|2025-01-23 03:00:18] logging.py:157 >> {'loss': 1.9928, 'learning_rate': 2.4551e-04, 'epoch': 0.59} + +[INFO|2025-01-23 03:00:37] logging.py:157 >> {'loss': 2.0685, 'learning_rate': 2.4519e-04, 'epoch': 0.59} + +[INFO|2025-01-23 03:00:56] logging.py:157 >> {'loss': 2.3390, 'learning_rate': 2.4486e-04, 'epoch': 0.59} + +[INFO|2025-01-23 03:01:14] logging.py:157 >> {'loss': 2.2510, 'learning_rate': 2.4453e-04, 'epoch': 0.59} + +[INFO|2025-01-23 03:01:33] logging.py:157 >> {'loss': 2.0822, 'learning_rate': 2.4420e-04, 'epoch': 0.59} + +[INFO|2025-01-23 03:01:52] logging.py:157 >> {'loss': 2.2187, 'learning_rate': 2.4387e-04, 'epoch': 0.59} + +[INFO|2025-01-23 03:02:11] logging.py:157 >> {'loss': 2.0735, 'learning_rate': 2.4354e-04, 'epoch': 0.60} + +[INFO|2025-01-23 03:02:30] logging.py:157 >> {'loss': 1.9990, 'learning_rate': 2.4321e-04, 'epoch': 0.60} + +[INFO|2025-01-23 03:02:50] logging.py:157 >> {'loss': 1.9622, 'learning_rate': 2.4288e-04, 'epoch': 0.60} + +[INFO|2025-01-23 03:03:08] logging.py:157 >> {'loss': 1.9453, 'learning_rate': 2.4255e-04, 'epoch': 0.60} + +[INFO|2025-01-23 03:03:26] logging.py:157 >> {'loss': 2.0796, 'learning_rate': 2.4221e-04, 'epoch': 0.60} + +[INFO|2025-01-23 03:03:45] logging.py:157 >> {'loss': 2.1459, 'learning_rate': 2.4188e-04, 'epoch': 0.61} + +[INFO|2025-01-23 03:04:03] logging.py:157 >> {'loss': 2.0044, 'learning_rate': 2.4154e-04, 'epoch': 0.61} + +[INFO|2025-01-23 03:04:21] logging.py:157 >> {'loss': 2.0982, 'learning_rate': 2.4121e-04, 'epoch': 0.61} + +[INFO|2025-01-23 03:04:39] logging.py:157 >> {'loss': 2.1232, 'learning_rate': 2.4087e-04, 'epoch': 0.61} + +[INFO|2025-01-23 03:04:57] logging.py:157 >> {'loss': 2.0146, 'learning_rate': 2.4053e-04, 'epoch': 0.61} + +[INFO|2025-01-23 03:05:16] logging.py:157 >> {'loss': 2.1244, 'learning_rate': 2.4020e-04, 'epoch': 0.61} + +[INFO|2025-01-23 03:05:34] logging.py:157 >> {'loss': 2.0166, 'learning_rate': 2.3986e-04, 'epoch': 0.62} + +[INFO|2025-01-23 03:05:52] logging.py:157 >> {'loss': 2.0037, 'learning_rate': 2.3952e-04, 'epoch': 0.62} + +[INFO|2025-01-23 03:06:10] logging.py:157 >> {'loss': 2.0314, 'learning_rate': 2.3918e-04, 'epoch': 0.62} + +[INFO|2025-01-23 03:06:28] logging.py:157 >> {'loss': 2.2235, 'learning_rate': 2.3884e-04, 'epoch': 0.62} + +[INFO|2025-01-23 03:06:46] logging.py:157 >> {'loss': 2.1317, 'learning_rate': 2.3850e-04, 'epoch': 0.62} + +[INFO|2025-01-23 03:07:04] logging.py:157 >> {'loss': 2.1241, 'learning_rate': 2.3815e-04, 'epoch': 0.62} + +[INFO|2025-01-23 03:07:23] logging.py:157 >> {'loss': 1.9438, 'learning_rate': 2.3781e-04, 'epoch': 0.63} + +[INFO|2025-01-23 03:07:41] logging.py:157 >> {'loss': 1.9621, 'learning_rate': 2.3747e-04, 'epoch': 0.63} + +[INFO|2025-01-23 03:07:59] logging.py:157 >> {'loss': 2.1609, 'learning_rate': 2.3712e-04, 'epoch': 0.63} + +[INFO|2025-01-23 03:08:17] logging.py:157 >> {'loss': 2.2099, 'learning_rate': 2.3678e-04, 'epoch': 0.63} + +[INFO|2025-01-23 03:08:35] logging.py:157 >> {'loss': 2.0158, 'learning_rate': 2.3643e-04, 'epoch': 0.63} + +[INFO|2025-01-23 03:08:55] logging.py:157 >> {'loss': 2.0521, 'learning_rate': 2.3609e-04, 'epoch': 0.64} + +[INFO|2025-01-23 03:09:13] logging.py:157 >> {'loss': 2.1523, 'learning_rate': 2.3574e-04, 'epoch': 0.64} + +[INFO|2025-01-23 03:09:30] logging.py:157 >> {'loss': 2.3370, 'learning_rate': 2.3539e-04, 'epoch': 0.64} + +[INFO|2025-01-23 03:09:48] logging.py:157 >> {'loss': 1.8912, 'learning_rate': 2.3504e-04, 'epoch': 0.64} + +[INFO|2025-01-23 03:10:05] logging.py:157 >> {'loss': 1.9723, 'learning_rate': 2.3469e-04, 'epoch': 0.64} + +[INFO|2025-01-23 03:10:23] logging.py:157 >> {'loss': 2.0922, 'learning_rate': 2.3434e-04, 'epoch': 0.64} + +[INFO|2025-01-23 03:10:41] logging.py:157 >> {'loss': 1.9364, 'learning_rate': 2.3399e-04, 'epoch': 0.65} + +[INFO|2025-01-23 03:10:58] logging.py:157 >> {'loss': 1.9636, 'learning_rate': 2.3364e-04, 'epoch': 0.65} + +[INFO|2025-01-23 03:11:16] logging.py:157 >> {'loss': 2.0558, 'learning_rate': 2.3329e-04, 'epoch': 0.65} + +[INFO|2025-01-23 03:11:34] logging.py:157 >> {'loss': 2.0943, 'learning_rate': 2.3294e-04, 'epoch': 0.65} + +[INFO|2025-01-23 03:11:51] logging.py:157 >> {'loss': 1.9965, 'learning_rate': 2.3259e-04, 'epoch': 0.65} + +[INFO|2025-01-23 03:12:10] logging.py:157 >> {'loss': 2.0822, 'learning_rate': 2.3223e-04, 'epoch': 0.65} + +[INFO|2025-01-23 03:12:27] logging.py:157 >> {'loss': 2.0828, 'learning_rate': 2.3188e-04, 'epoch': 0.66} + +[INFO|2025-01-23 03:12:45] logging.py:157 >> {'loss': 1.8954, 'learning_rate': 2.3152e-04, 'epoch': 0.66} + +[INFO|2025-01-23 03:13:02] logging.py:157 >> {'loss': 2.1379, 'learning_rate': 2.3117e-04, 'epoch': 0.66} + +[INFO|2025-01-23 03:13:20] logging.py:157 >> {'loss': 1.9456, 'learning_rate': 2.3081e-04, 'epoch': 0.66} + +[INFO|2025-01-23 03:13:38] logging.py:157 >> {'loss': 1.9262, 'learning_rate': 2.3045e-04, 'epoch': 0.66} + +[INFO|2025-01-23 03:13:56] logging.py:157 >> {'loss': 2.0516, 'learning_rate': 2.3010e-04, 'epoch': 0.67} + +[INFO|2025-01-23 03:14:13] logging.py:157 >> {'loss': 2.0916, 'learning_rate': 2.2974e-04, 'epoch': 0.67} + +[INFO|2025-01-23 03:14:31] logging.py:157 >> {'loss': 1.9777, 'learning_rate': 2.2938e-04, 'epoch': 0.67} + +[INFO|2025-01-23 03:14:50] logging.py:157 >> {'loss': 2.0911, 'learning_rate': 2.2902e-04, 'epoch': 0.67} + +[INFO|2025-01-23 03:15:07] logging.py:157 >> {'loss': 2.1849, 'learning_rate': 2.2866e-04, 'epoch': 0.67} + +[INFO|2025-01-23 03:15:24] logging.py:157 >> {'loss': 1.9319, 'learning_rate': 2.2830e-04, 'epoch': 0.67} + +[INFO|2025-01-23 03:15:41] logging.py:157 >> {'loss': 2.1074, 'learning_rate': 2.2794e-04, 'epoch': 0.68} + +[INFO|2025-01-23 03:15:59] logging.py:157 >> {'loss': 1.9762, 'learning_rate': 2.2758e-04, 'epoch': 0.68} + +[INFO|2025-01-23 03:16:16] logging.py:157 >> {'loss': 1.9072, 'learning_rate': 2.2721e-04, 'epoch': 0.68} + +[INFO|2025-01-23 03:16:33] logging.py:157 >> {'loss': 1.8813, 'learning_rate': 2.2685e-04, 'epoch': 0.68} + +[INFO|2025-01-23 03:16:50] logging.py:157 >> {'loss': 2.0787, 'learning_rate': 2.2649e-04, 'epoch': 0.68} + +[INFO|2025-01-23 03:17:07] logging.py:157 >> {'loss': 1.9761, 'learning_rate': 2.2612e-04, 'epoch': 0.68} + +[INFO|2025-01-23 03:17:24] logging.py:157 >> {'loss': 2.0439, 'learning_rate': 2.2576e-04, 'epoch': 0.69} + +[INFO|2025-01-23 03:17:41] logging.py:157 >> {'loss': 1.9896, 'learning_rate': 2.2539e-04, 'epoch': 0.69} + +[INFO|2025-01-23 03:17:58] logging.py:157 >> {'loss': 2.0269, 'learning_rate': 2.2502e-04, 'epoch': 0.69} + +[INFO|2025-01-23 03:18:16] logging.py:157 >> {'loss': 1.9610, 'learning_rate': 2.2466e-04, 'epoch': 0.69} + +[INFO|2025-01-23 03:18:33] logging.py:157 >> {'loss': 2.1577, 'learning_rate': 2.2429e-04, 'epoch': 0.69} + +[INFO|2025-01-23 03:18:50] logging.py:157 >> {'loss': 2.0868, 'learning_rate': 2.2392e-04, 'epoch': 0.70} + +[INFO|2025-01-23 03:19:07] logging.py:157 >> {'loss': 2.1132, 'learning_rate': 2.2355e-04, 'epoch': 0.70} + +[INFO|2025-01-23 03:19:24] logging.py:157 >> {'loss': 2.0171, 'learning_rate': 2.2318e-04, 'epoch': 0.70} + +[INFO|2025-01-23 03:19:41] logging.py:157 >> {'loss': 2.1087, 'learning_rate': 2.2281e-04, 'epoch': 0.70} + +[INFO|2025-01-23 03:19:58] logging.py:157 >> {'loss': 1.9833, 'learning_rate': 2.2244e-04, 'epoch': 0.70} + +[INFO|2025-01-23 03:20:15] logging.py:157 >> {'loss': 2.0165, 'learning_rate': 2.2207e-04, 'epoch': 0.70} + +[INFO|2025-01-23 03:20:34] logging.py:157 >> {'loss': 2.1203, 'learning_rate': 2.2170e-04, 'epoch': 0.71} + +[INFO|2025-01-23 03:20:50] logging.py:157 >> {'loss': 2.0111, 'learning_rate': 2.2133e-04, 'epoch': 0.71} + +[INFO|2025-01-23 03:21:07] logging.py:157 >> {'loss': 2.0769, 'learning_rate': 2.2096e-04, 'epoch': 0.71} + +[INFO|2025-01-23 03:21:23] logging.py:157 >> {'loss': 1.9393, 'learning_rate': 2.2058e-04, 'epoch': 0.71} + +[INFO|2025-01-23 03:21:40] logging.py:157 >> {'loss': 2.1660, 'learning_rate': 2.2021e-04, 'epoch': 0.71} + +[INFO|2025-01-23 03:21:57] logging.py:157 >> {'loss': 2.0830, 'learning_rate': 2.1984e-04, 'epoch': 0.71} + +[INFO|2025-01-23 03:22:13] logging.py:157 >> {'loss': 2.2259, 'learning_rate': 2.1946e-04, 'epoch': 0.72} + +[INFO|2025-01-23 03:22:30] logging.py:157 >> {'loss': 2.0790, 'learning_rate': 2.1908e-04, 'epoch': 0.72} + +[INFO|2025-01-23 03:22:46] logging.py:157 >> {'loss': 2.1396, 'learning_rate': 2.1871e-04, 'epoch': 0.72} + +[INFO|2025-01-23 03:23:03] logging.py:157 >> {'loss': 1.9795, 'learning_rate': 2.1833e-04, 'epoch': 0.72} + +[INFO|2025-01-23 03:23:20] logging.py:157 >> {'loss': 2.0263, 'learning_rate': 2.1796e-04, 'epoch': 0.72} + +[INFO|2025-01-23 03:23:36] logging.py:157 >> {'loss': 2.0680, 'learning_rate': 2.1758e-04, 'epoch': 0.73} + +[INFO|2025-01-23 03:23:53] logging.py:157 >> {'loss': 2.0887, 'learning_rate': 2.1720e-04, 'epoch': 0.73} + +[INFO|2025-01-23 03:24:10] logging.py:157 >> {'loss': 2.0686, 'learning_rate': 2.1682e-04, 'epoch': 0.73} + +[INFO|2025-01-23 03:24:26] logging.py:157 >> {'loss': 2.0438, 'learning_rate': 2.1644e-04, 'epoch': 0.73} + +[INFO|2025-01-23 03:24:43] logging.py:157 >> {'loss': 2.0746, 'learning_rate': 2.1606e-04, 'epoch': 0.73} + +[INFO|2025-01-23 03:25:00] logging.py:157 >> {'loss': 2.0029, 'learning_rate': 2.1568e-04, 'epoch': 0.73} + +[INFO|2025-01-23 03:25:16] logging.py:157 >> {'loss': 2.0028, 'learning_rate': 2.1530e-04, 'epoch': 0.74} + +[INFO|2025-01-23 03:25:33] logging.py:157 >> {'loss': 1.9794, 'learning_rate': 2.1492e-04, 'epoch': 0.74} + +[INFO|2025-01-23 03:25:50] logging.py:157 >> {'loss': 2.0336, 'learning_rate': 2.1454e-04, 'epoch': 0.74} + +[INFO|2025-01-23 03:26:08] logging.py:157 >> {'loss': 1.9170, 'learning_rate': 2.1415e-04, 'epoch': 0.74} + +[INFO|2025-01-23 03:26:24] logging.py:157 >> {'loss': 1.9402, 'learning_rate': 2.1377e-04, 'epoch': 0.74} + +[INFO|2025-01-23 03:26:40] logging.py:157 >> {'loss': 2.0277, 'learning_rate': 2.1339e-04, 'epoch': 0.74} + +[INFO|2025-01-23 03:26:56] logging.py:157 >> {'loss': 2.0032, 'learning_rate': 2.1300e-04, 'epoch': 0.75} + +[INFO|2025-01-23 03:27:12] logging.py:157 >> {'loss': 1.9176, 'learning_rate': 2.1262e-04, 'epoch': 0.75} + +[INFO|2025-01-23 03:27:28] logging.py:157 >> {'loss': 1.9901, 'learning_rate': 2.1224e-04, 'epoch': 0.75} + +[INFO|2025-01-23 03:27:44] logging.py:157 >> {'loss': 2.0598, 'learning_rate': 2.1185e-04, 'epoch': 0.75} + +[INFO|2025-01-23 03:28:00] logging.py:157 >> {'loss': 1.9927, 'learning_rate': 2.1146e-04, 'epoch': 0.75} + +[INFO|2025-01-23 03:28:16] logging.py:157 >> {'loss': 2.1043, 'learning_rate': 2.1108e-04, 'epoch': 0.76} + +[INFO|2025-01-23 03:28:33] logging.py:157 >> {'loss': 1.9659, 'learning_rate': 2.1069e-04, 'epoch': 0.76} + +[INFO|2025-01-23 03:28:49] logging.py:157 >> {'loss': 1.9538, 'learning_rate': 2.1030e-04, 'epoch': 0.76} + +[INFO|2025-01-23 03:29:05] logging.py:157 >> {'loss': 2.0388, 'learning_rate': 2.0992e-04, 'epoch': 0.76} + +[INFO|2025-01-23 03:29:21] logging.py:157 >> {'loss': 2.0764, 'learning_rate': 2.0953e-04, 'epoch': 0.76} + +[INFO|2025-01-23 03:29:37] logging.py:157 >> {'loss': 1.9851, 'learning_rate': 2.0914e-04, 'epoch': 0.76} + +[INFO|2025-01-23 03:29:53] logging.py:157 >> {'loss': 2.1236, 'learning_rate': 2.0875e-04, 'epoch': 0.77} + +[INFO|2025-01-23 03:30:09] logging.py:157 >> {'loss': 2.0835, 'learning_rate': 2.0836e-04, 'epoch': 0.77} + +[INFO|2025-01-23 03:30:25] logging.py:157 >> {'loss': 1.9415, 'learning_rate': 2.0797e-04, 'epoch': 0.77} + +[INFO|2025-01-23 03:30:42] logging.py:157 >> {'loss': 2.0053, 'learning_rate': 2.0758e-04, 'epoch': 0.77} + +[INFO|2025-01-23 03:30:58] logging.py:157 >> {'loss': 1.9590, 'learning_rate': 2.0719e-04, 'epoch': 0.77} + +[INFO|2025-01-23 03:31:14] logging.py:157 >> {'loss': 2.0356, 'learning_rate': 2.0680e-04, 'epoch': 0.77} + +[INFO|2025-01-23 03:31:31] logging.py:157 >> {'loss': 1.9538, 'learning_rate': 2.0640e-04, 'epoch': 0.78} + +[INFO|2025-01-23 03:31:46] logging.py:157 >> {'loss': 2.1364, 'learning_rate': 2.0601e-04, 'epoch': 0.78} + +[INFO|2025-01-23 03:32:02] logging.py:157 >> {'loss': 2.0756, 'learning_rate': 2.0562e-04, 'epoch': 0.78} + +[INFO|2025-01-23 03:32:17] logging.py:157 >> {'loss': 2.0517, 'learning_rate': 2.0522e-04, 'epoch': 0.78} + +[INFO|2025-01-23 03:32:33] logging.py:157 >> {'loss': 2.0356, 'learning_rate': 2.0483e-04, 'epoch': 0.78} + +[INFO|2025-01-23 03:32:48] logging.py:157 >> {'loss': 2.0107, 'learning_rate': 2.0444e-04, 'epoch': 0.79} + +[INFO|2025-01-23 03:33:04] logging.py:157 >> {'loss': 1.9988, 'learning_rate': 2.0404e-04, 'epoch': 0.79} + +[INFO|2025-01-23 03:33:19] logging.py:157 >> {'loss': 1.9492, 'learning_rate': 2.0365e-04, 'epoch': 0.79} + +[INFO|2025-01-23 03:33:35] logging.py:157 >> {'loss': 1.9293, 'learning_rate': 2.0325e-04, 'epoch': 0.79} + +[INFO|2025-01-23 03:33:50] logging.py:157 >> {'loss': 2.1581, 'learning_rate': 2.0286e-04, 'epoch': 0.79} + +[INFO|2025-01-23 03:34:06] logging.py:157 >> {'loss': 1.9208, 'learning_rate': 2.0246e-04, 'epoch': 0.79} + +[INFO|2025-01-23 03:34:21] logging.py:157 >> {'loss': 1.8854, 'learning_rate': 2.0206e-04, 'epoch': 0.80} + +[INFO|2025-01-23 03:34:37] logging.py:157 >> {'loss': 2.0812, 'learning_rate': 2.0167e-04, 'epoch': 0.80} + +[INFO|2025-01-23 03:34:53] logging.py:157 >> {'loss': 1.9840, 'learning_rate': 2.0127e-04, 'epoch': 0.80} + +[INFO|2025-01-23 03:35:08] logging.py:157 >> {'loss': 1.9724, 'learning_rate': 2.0087e-04, 'epoch': 0.80} + +[INFO|2025-01-23 03:35:24] logging.py:157 >> {'loss': 2.2208, 'learning_rate': 2.0047e-04, 'epoch': 0.80} + +[INFO|2025-01-23 03:35:39] logging.py:157 >> {'loss': 2.0375, 'learning_rate': 2.0007e-04, 'epoch': 0.80} + +[INFO|2025-01-23 03:35:55] logging.py:157 >> {'loss': 2.0931, 'learning_rate': 1.9967e-04, 'epoch': 0.81} + +[INFO|2025-01-23 03:36:10] logging.py:157 >> {'loss': 1.9972, 'learning_rate': 1.9927e-04, 'epoch': 0.81} + +[INFO|2025-01-23 03:36:26] logging.py:157 >> {'loss': 2.0413, 'learning_rate': 1.9887e-04, 'epoch': 0.81} + +[INFO|2025-01-23 03:36:43] logging.py:157 >> {'loss': 2.0348, 'learning_rate': 1.9847e-04, 'epoch': 0.81} + +[INFO|2025-01-23 03:36:58] logging.py:157 >> {'loss': 2.1065, 'learning_rate': 1.9807e-04, 'epoch': 0.81} + +[INFO|2025-01-23 03:37:13] logging.py:157 >> {'loss': 2.1208, 'learning_rate': 1.9767e-04, 'epoch': 0.82} + +[INFO|2025-01-23 03:37:28] logging.py:157 >> {'loss': 2.0006, 'learning_rate': 1.9727e-04, 'epoch': 0.82} + +[INFO|2025-01-23 03:37:43] logging.py:157 >> {'loss': 1.8923, 'learning_rate': 1.9687e-04, 'epoch': 0.82} + +[INFO|2025-01-23 03:37:57] logging.py:157 >> {'loss': 2.1583, 'learning_rate': 1.9647e-04, 'epoch': 0.82} + +[INFO|2025-01-23 03:38:13] logging.py:157 >> {'loss': 2.0751, 'learning_rate': 1.9606e-04, 'epoch': 0.82} + +[INFO|2025-01-23 03:38:27] logging.py:157 >> {'loss': 2.0197, 'learning_rate': 1.9566e-04, 'epoch': 0.82} + +[INFO|2025-01-23 03:38:42] logging.py:157 >> {'loss': 2.2069, 'learning_rate': 1.9526e-04, 'epoch': 0.83} + +[INFO|2025-01-23 03:38:57] logging.py:157 >> {'loss': 2.1048, 'learning_rate': 1.9485e-04, 'epoch': 0.83} + +[INFO|2025-01-23 03:39:12] logging.py:157 >> {'loss': 1.8618, 'learning_rate': 1.9445e-04, 'epoch': 0.83} + +[INFO|2025-01-23 03:39:27] logging.py:157 >> {'loss': 2.0147, 'learning_rate': 1.9404e-04, 'epoch': 0.83} + +[INFO|2025-01-23 03:39:42] logging.py:157 >> {'loss': 1.8918, 'learning_rate': 1.9364e-04, 'epoch': 0.83} + +[INFO|2025-01-23 03:39:57] logging.py:157 >> {'loss': 2.0407, 'learning_rate': 1.9323e-04, 'epoch': 0.83} + +[INFO|2025-01-23 03:40:12] logging.py:157 >> {'loss': 2.0526, 'learning_rate': 1.9283e-04, 'epoch': 0.84} + +[INFO|2025-01-23 03:40:27] logging.py:157 >> {'loss': 1.9411, 'learning_rate': 1.9242e-04, 'epoch': 0.84} + +[INFO|2025-01-23 03:40:42] logging.py:157 >> {'loss': 2.1269, 'learning_rate': 1.9202e-04, 'epoch': 0.84} + +[INFO|2025-01-23 03:40:57] logging.py:157 >> {'loss': 2.1054, 'learning_rate': 1.9161e-04, 'epoch': 0.84} + +[INFO|2025-01-23 03:41:12] logging.py:157 >> {'loss': 1.9528, 'learning_rate': 1.9120e-04, 'epoch': 0.84} + +[INFO|2025-01-23 03:41:27] logging.py:157 >> {'loss': 2.0606, 'learning_rate': 1.9080e-04, 'epoch': 0.85} + +[INFO|2025-01-23 03:41:43] logging.py:157 >> {'loss': 2.0169, 'learning_rate': 1.9039e-04, 'epoch': 0.85} + +[INFO|2025-01-23 03:41:58] logging.py:157 >> {'loss': 2.0958, 'learning_rate': 1.8998e-04, 'epoch': 0.85} + +[INFO|2025-01-23 03:42:12] logging.py:157 >> {'loss': 1.9126, 'learning_rate': 1.8957e-04, 'epoch': 0.85} + +[INFO|2025-01-23 03:42:26] logging.py:157 >> {'loss': 1.9843, 'learning_rate': 1.8916e-04, 'epoch': 0.85} + +[INFO|2025-01-23 03:42:41] logging.py:157 >> {'loss': 1.9633, 'learning_rate': 1.8875e-04, 'epoch': 0.85} + +[INFO|2025-01-23 03:42:55] logging.py:157 >> {'loss': 2.1546, 'learning_rate': 1.8835e-04, 'epoch': 0.86} + +[INFO|2025-01-23 03:43:10] logging.py:157 >> {'loss': 1.8770, 'learning_rate': 1.8794e-04, 'epoch': 0.86} + +[INFO|2025-01-23 03:43:24] logging.py:157 >> {'loss': 2.0058, 'learning_rate': 1.8753e-04, 'epoch': 0.86} + +[INFO|2025-01-23 03:43:38] logging.py:157 >> {'loss': 1.9685, 'learning_rate': 1.8712e-04, 'epoch': 0.86} + +[INFO|2025-01-23 03:43:53] logging.py:157 >> {'loss': 2.0074, 'learning_rate': 1.8671e-04, 'epoch': 0.86} + +[INFO|2025-01-23 03:44:07] logging.py:157 >> {'loss': 1.9922, 'learning_rate': 1.8630e-04, 'epoch': 0.86} + +[INFO|2025-01-23 03:44:22] logging.py:157 >> {'loss': 1.9966, 'learning_rate': 1.8588e-04, 'epoch': 0.87} + +[INFO|2025-01-23 03:44:36] logging.py:157 >> {'loss': 1.9703, 'learning_rate': 1.8547e-04, 'epoch': 0.87} + +[INFO|2025-01-23 03:44:50] logging.py:157 >> {'loss': 2.2116, 'learning_rate': 1.8506e-04, 'epoch': 0.87} + +[INFO|2025-01-23 03:45:05] logging.py:157 >> {'loss': 2.1126, 'learning_rate': 1.8465e-04, 'epoch': 0.87} + +[INFO|2025-01-23 03:45:19] logging.py:157 >> {'loss': 2.1365, 'learning_rate': 1.8424e-04, 'epoch': 0.87} + +[INFO|2025-01-23 03:45:34] logging.py:157 >> {'loss': 2.0955, 'learning_rate': 1.8383e-04, 'epoch': 0.88} + +[INFO|2025-01-23 03:45:49] logging.py:157 >> {'loss': 2.0701, 'learning_rate': 1.8341e-04, 'epoch': 0.88} + +[INFO|2025-01-23 03:46:03] logging.py:157 >> {'loss': 2.1663, 'learning_rate': 1.8300e-04, 'epoch': 0.88} + +[INFO|2025-01-23 03:46:17] logging.py:157 >> {'loss': 1.9167, 'learning_rate': 1.8259e-04, 'epoch': 0.88} + +[INFO|2025-01-23 03:46:33] logging.py:157 >> {'loss': 2.0097, 'learning_rate': 1.8217e-04, 'epoch': 0.88} + +[INFO|2025-01-23 03:46:47] logging.py:157 >> {'loss': 2.0314, 'learning_rate': 1.8176e-04, 'epoch': 0.88} + +[INFO|2025-01-23 03:47:01] logging.py:157 >> {'loss': 2.0366, 'learning_rate': 1.8135e-04, 'epoch': 0.89} + +[INFO|2025-01-23 03:47:15] logging.py:157 >> {'loss': 1.9290, 'learning_rate': 1.8093e-04, 'epoch': 0.89} + +[INFO|2025-01-23 03:47:28] logging.py:157 >> {'loss': 1.9234, 'learning_rate': 1.8052e-04, 'epoch': 0.89} + +[INFO|2025-01-23 03:47:42] logging.py:157 >> {'loss': 1.8586, 'learning_rate': 1.8010e-04, 'epoch': 0.89} + +[INFO|2025-01-23 03:47:56] logging.py:157 >> {'loss': 2.0306, 'learning_rate': 1.7969e-04, 'epoch': 0.89} + +[INFO|2025-01-23 03:48:10] logging.py:157 >> {'loss': 2.0074, 'learning_rate': 1.7927e-04, 'epoch': 0.89} + +[INFO|2025-01-23 03:48:24] logging.py:157 >> {'loss': 2.0077, 'learning_rate': 1.7886e-04, 'epoch': 0.90} + +[INFO|2025-01-23 03:48:38] logging.py:157 >> {'loss': 2.1483, 'learning_rate': 1.7844e-04, 'epoch': 0.90} + +[INFO|2025-01-23 03:48:52] logging.py:157 >> {'loss': 1.8316, 'learning_rate': 1.7803e-04, 'epoch': 0.90} + +[INFO|2025-01-23 03:49:06] logging.py:157 >> {'loss': 1.8651, 'learning_rate': 1.7761e-04, 'epoch': 0.90} + +[INFO|2025-01-23 03:49:19] logging.py:157 >> {'loss': 2.0009, 'learning_rate': 1.7720e-04, 'epoch': 0.90} + +[INFO|2025-01-23 03:49:33] logging.py:157 >> {'loss': 2.0482, 'learning_rate': 1.7678e-04, 'epoch': 0.91} + +[INFO|2025-01-23 03:49:47] logging.py:157 >> {'loss': 2.0247, 'learning_rate': 1.7636e-04, 'epoch': 0.91} + +[INFO|2025-01-23 03:50:01] logging.py:157 >> {'loss': 2.0250, 'learning_rate': 1.7595e-04, 'epoch': 0.91} + +[INFO|2025-01-23 03:50:15] logging.py:157 >> {'loss': 1.9410, 'learning_rate': 1.7553e-04, 'epoch': 0.91} + +[INFO|2025-01-23 03:50:29] logging.py:157 >> {'loss': 1.9883, 'learning_rate': 1.7511e-04, 'epoch': 0.91} + +[INFO|2025-01-23 03:50:42] logging.py:157 >> {'loss': 1.9776, 'learning_rate': 1.7469e-04, 'epoch': 0.91} + +[INFO|2025-01-23 03:50:56] logging.py:157 >> {'loss': 1.9624, 'learning_rate': 1.7428e-04, 'epoch': 0.92} + +[INFO|2025-01-23 03:51:12] logging.py:157 >> {'loss': 1.8998, 'learning_rate': 1.7386e-04, 'epoch': 0.92} + +[INFO|2025-01-23 03:51:25] logging.py:157 >> {'loss': 2.0375, 'learning_rate': 1.7344e-04, 'epoch': 0.92} + +[INFO|2025-01-23 03:51:38] logging.py:157 >> {'loss': 1.9619, 'learning_rate': 1.7302e-04, 'epoch': 0.92} + +[INFO|2025-01-23 03:51:52] logging.py:157 >> {'loss': 2.0921, 'learning_rate': 1.7260e-04, 'epoch': 0.92} + +[INFO|2025-01-23 03:52:05] logging.py:157 >> {'loss': 1.9965, 'learning_rate': 1.7218e-04, 'epoch': 0.92} + +[INFO|2025-01-23 03:52:19] logging.py:157 >> {'loss': 1.8418, 'learning_rate': 1.7177e-04, 'epoch': 0.93} + +[INFO|2025-01-23 03:52:32] logging.py:157 >> {'loss': 2.1117, 'learning_rate': 1.7135e-04, 'epoch': 0.93} + +[INFO|2025-01-23 03:52:46] logging.py:157 >> {'loss': 2.1308, 'learning_rate': 1.7093e-04, 'epoch': 0.93} + +[INFO|2025-01-23 03:52:59] logging.py:157 >> {'loss': 2.1006, 'learning_rate': 1.7051e-04, 'epoch': 0.93} + +[INFO|2025-01-23 03:53:12] logging.py:157 >> {'loss': 2.0499, 'learning_rate': 1.7009e-04, 'epoch': 0.93} + +[INFO|2025-01-23 03:53:26] logging.py:157 >> {'loss': 1.8886, 'learning_rate': 1.6967e-04, 'epoch': 0.94} + +[INFO|2025-01-23 03:53:39] logging.py:157 >> {'loss': 2.0256, 'learning_rate': 1.6925e-04, 'epoch': 0.94} + +[INFO|2025-01-23 03:53:52] logging.py:157 >> {'loss': 1.9635, 'learning_rate': 1.6883e-04, 'epoch': 0.94} + +[INFO|2025-01-23 03:54:06] logging.py:157 >> {'loss': 2.0084, 'learning_rate': 1.6841e-04, 'epoch': 0.94} + +[INFO|2025-01-23 03:54:19] logging.py:157 >> {'loss': 1.9940, 'learning_rate': 1.6799e-04, 'epoch': 0.94} + +[INFO|2025-01-23 03:54:32] logging.py:157 >> {'loss': 2.1470, 'learning_rate': 1.6757e-04, 'epoch': 0.94} + +[INFO|2025-01-23 03:54:46] logging.py:157 >> {'loss': 2.1125, 'learning_rate': 1.6715e-04, 'epoch': 0.95} + +[INFO|2025-01-23 03:54:59] logging.py:157 >> {'loss': 1.9652, 'learning_rate': 1.6673e-04, 'epoch': 0.95} + +[INFO|2025-01-23 03:55:12] logging.py:157 >> {'loss': 1.9353, 'learning_rate': 1.6631e-04, 'epoch': 0.95} + +[INFO|2025-01-23 03:55:26] logging.py:157 >> {'loss': 2.0403, 'learning_rate': 1.6589e-04, 'epoch': 0.95} + +[INFO|2025-01-23 03:55:40] logging.py:157 >> {'loss': 1.9162, 'learning_rate': 1.6547e-04, 'epoch': 0.95} + +[INFO|2025-01-23 03:55:53] logging.py:157 >> {'loss': 2.0594, 'learning_rate': 1.6504e-04, 'epoch': 0.95} + +[INFO|2025-01-23 03:56:06] logging.py:157 >> {'loss': 1.8185, 'learning_rate': 1.6462e-04, 'epoch': 0.96} + +[INFO|2025-01-23 03:56:19] logging.py:157 >> {'loss': 2.1007, 'learning_rate': 1.6420e-04, 'epoch': 0.96} + +[INFO|2025-01-23 03:56:32] logging.py:157 >> {'loss': 1.9731, 'learning_rate': 1.6378e-04, 'epoch': 0.96} + +[INFO|2025-01-23 03:56:44] logging.py:157 >> {'loss': 2.0191, 'learning_rate': 1.6336e-04, 'epoch': 0.96} + +[INFO|2025-01-23 03:56:57] logging.py:157 >> {'loss': 2.1089, 'learning_rate': 1.6294e-04, 'epoch': 0.96} + +[INFO|2025-01-23 03:57:10] logging.py:157 >> {'loss': 2.1129, 'learning_rate': 1.6252e-04, 'epoch': 0.97} + +[INFO|2025-01-23 03:57:23] logging.py:157 >> {'loss': 2.0357, 'learning_rate': 1.6209e-04, 'epoch': 0.97} + +[INFO|2025-01-23 03:57:36] logging.py:157 >> {'loss': 1.9399, 'learning_rate': 1.6167e-04, 'epoch': 0.97} + +[INFO|2025-01-23 03:57:48] logging.py:157 >> {'loss': 2.0116, 'learning_rate': 1.6125e-04, 'epoch': 0.97} + +[INFO|2025-01-23 03:58:01] logging.py:157 >> {'loss': 2.0625, 'learning_rate': 1.6083e-04, 'epoch': 0.97} + +[INFO|2025-01-23 03:58:14] logging.py:157 >> {'loss': 2.0045, 'learning_rate': 1.6041e-04, 'epoch': 0.97} + +[INFO|2025-01-23 03:58:27] logging.py:157 >> {'loss': 1.8777, 'learning_rate': 1.5998e-04, 'epoch': 0.98} + +[INFO|2025-01-23 03:58:40] logging.py:157 >> {'loss': 2.0501, 'learning_rate': 1.5956e-04, 'epoch': 0.98} + +[INFO|2025-01-23 03:58:53] logging.py:157 >> {'loss': 2.0181, 'learning_rate': 1.5914e-04, 'epoch': 0.98} + +[INFO|2025-01-23 03:59:06] logging.py:157 >> {'loss': 2.0077, 'learning_rate': 1.5872e-04, 'epoch': 0.98} + +[INFO|2025-01-23 03:59:19] logging.py:157 >> {'loss': 1.9184, 'learning_rate': 1.5829e-04, 'epoch': 0.98} + +[INFO|2025-01-23 03:59:31] logging.py:157 >> {'loss': 1.9212, 'learning_rate': 1.5787e-04, 'epoch': 0.98} + +[INFO|2025-01-23 03:59:44] logging.py:157 >> {'loss': 2.0666, 'learning_rate': 1.5745e-04, 'epoch': 0.99} + +[INFO|2025-01-23 04:00:02] logging.py:157 >> {'loss': 1.9224, 'learning_rate': 1.5702e-04, 'epoch': 0.99} + +[INFO|2025-01-23 04:00:29] logging.py:157 >> {'loss': 1.9829, 'learning_rate': 1.5660e-04, 'epoch': 0.99} + +[INFO|2025-01-23 04:00:57] logging.py:157 >> {'loss': 2.0970, 'learning_rate': 1.5618e-04, 'epoch': 0.99} + +[INFO|2025-01-23 04:01:24] logging.py:157 >> {'loss': 2.0570, 'learning_rate': 1.5576e-04, 'epoch': 0.99} + +[INFO|2025-01-23 04:01:52] logging.py:157 >> {'loss': 1.8690, 'learning_rate': 1.5533e-04, 'epoch': 1.00} + +[INFO|2025-01-23 04:02:19] logging.py:157 >> {'loss': 1.8386, 'learning_rate': 1.5491e-04, 'epoch': 1.00} + +[INFO|2025-01-23 04:02:46] logging.py:157 >> {'loss': 1.7059, 'learning_rate': 1.5449e-04, 'epoch': 1.00} + +[INFO|2025-01-23 04:03:14] logging.py:157 >> {'loss': 1.9036, 'learning_rate': 1.5406e-04, 'epoch': 1.00} + +[INFO|2025-01-23 04:03:41] logging.py:157 >> {'loss': 2.0621, 'learning_rate': 1.5364e-04, 'epoch': 1.00} + +[INFO|2025-01-23 04:04:09] logging.py:157 >> {'loss': 2.0563, 'learning_rate': 1.5322e-04, 'epoch': 1.00} + +[INFO|2025-01-23 04:04:36] logging.py:157 >> {'loss': 2.0112, 'learning_rate': 1.5279e-04, 'epoch': 1.01} + +[INFO|2025-01-23 04:05:03] logging.py:157 >> {'loss': 2.0247, 'learning_rate': 1.5237e-04, 'epoch': 1.01} + +[INFO|2025-01-23 04:05:31] logging.py:157 >> {'loss': 2.0245, 'learning_rate': 1.5195e-04, 'epoch': 1.01} + +[INFO|2025-01-23 04:05:58] logging.py:157 >> {'loss': 1.9894, 'learning_rate': 1.5152e-04, 'epoch': 1.01} + +[INFO|2025-01-23 04:06:26] logging.py:157 >> {'loss': 1.9291, 'learning_rate': 1.5110e-04, 'epoch': 1.01} + +[INFO|2025-01-23 04:06:53] logging.py:157 >> {'loss': 1.7575, 'learning_rate': 1.5068e-04, 'epoch': 1.01} + +[INFO|2025-01-23 04:07:20] logging.py:157 >> {'loss': 1.9924, 'learning_rate': 1.5025e-04, 'epoch': 1.02} + +[INFO|2025-01-23 04:07:48] logging.py:157 >> {'loss': 1.9584, 'learning_rate': 1.4983e-04, 'epoch': 1.02} + +[INFO|2025-01-23 04:08:16] logging.py:157 >> {'loss': 1.9540, 'learning_rate': 1.4941e-04, 'epoch': 1.02} + +[INFO|2025-01-23 04:08:43] logging.py:157 >> {'loss': 1.8815, 'learning_rate': 1.4898e-04, 'epoch': 1.02} + +[INFO|2025-01-23 04:09:12] logging.py:157 >> {'loss': 1.8912, 'learning_rate': 1.4856e-04, 'epoch': 1.02} + +[INFO|2025-01-23 04:09:39] logging.py:157 >> {'loss': 1.8818, 'learning_rate': 1.4814e-04, 'epoch': 1.03} + +[INFO|2025-01-23 04:10:06] logging.py:157 >> {'loss': 2.0263, 'learning_rate': 1.4771e-04, 'epoch': 1.03} + +[INFO|2025-01-23 04:10:33] logging.py:157 >> {'loss': 2.1125, 'learning_rate': 1.4729e-04, 'epoch': 1.03} + +[INFO|2025-01-23 04:11:00] logging.py:157 >> {'loss': 2.0275, 'learning_rate': 1.4687e-04, 'epoch': 1.03} + +[INFO|2025-01-23 04:11:27] logging.py:157 >> {'loss': 1.9635, 'learning_rate': 1.4644e-04, 'epoch': 1.03} + +[INFO|2025-01-23 04:11:53] logging.py:157 >> {'loss': 1.9654, 'learning_rate': 1.4602e-04, 'epoch': 1.03} + +[INFO|2025-01-23 04:12:20] logging.py:157 >> {'loss': 2.0137, 'learning_rate': 1.4560e-04, 'epoch': 1.04} + +[INFO|2025-01-23 04:12:47] logging.py:157 >> {'loss': 2.0591, 'learning_rate': 1.4517e-04, 'epoch': 1.04} + +[INFO|2025-01-23 04:13:14] logging.py:157 >> {'loss': 2.1446, 'learning_rate': 1.4475e-04, 'epoch': 1.04} + +[INFO|2025-01-23 04:13:41] logging.py:157 >> {'loss': 1.8930, 'learning_rate': 1.4433e-04, 'epoch': 1.04} + +[INFO|2025-01-23 04:14:08] logging.py:157 >> {'loss': 2.0861, 'learning_rate': 1.4391e-04, 'epoch': 1.04} + +[INFO|2025-01-23 04:14:35] logging.py:157 >> {'loss': 1.8338, 'learning_rate': 1.4348e-04, 'epoch': 1.04} + +[INFO|2025-01-23 04:15:02] logging.py:157 >> {'loss': 1.8763, 'learning_rate': 1.4306e-04, 'epoch': 1.05} + +[INFO|2025-01-23 04:15:29] logging.py:157 >> {'loss': 1.9167, 'learning_rate': 1.4264e-04, 'epoch': 1.05} + +[INFO|2025-01-23 04:15:56] logging.py:157 >> {'loss': 1.9689, 'learning_rate': 1.4221e-04, 'epoch': 1.05} + +[INFO|2025-01-23 04:16:23] logging.py:157 >> {'loss': 2.0867, 'learning_rate': 1.4179e-04, 'epoch': 1.05} + +[INFO|2025-01-23 04:16:49] logging.py:157 >> {'loss': 1.8988, 'learning_rate': 1.4137e-04, 'epoch': 1.05} + +[INFO|2025-01-23 04:17:16] logging.py:157 >> {'loss': 1.8980, 'learning_rate': 1.4095e-04, 'epoch': 1.06} + +[INFO|2025-01-23 04:17:43] logging.py:157 >> {'loss': 2.0711, 'learning_rate': 1.4052e-04, 'epoch': 1.06} + +[INFO|2025-01-23 04:18:12] logging.py:157 >> {'loss': 1.9845, 'learning_rate': 1.4010e-04, 'epoch': 1.06} + +[INFO|2025-01-23 04:18:38] logging.py:157 >> {'loss': 2.1484, 'learning_rate': 1.3968e-04, 'epoch': 1.06} + +[INFO|2025-01-23 04:19:05] logging.py:157 >> {'loss': 1.9079, 'learning_rate': 1.3926e-04, 'epoch': 1.06} + +[INFO|2025-01-23 04:19:31] logging.py:157 >> {'loss': 2.0101, 'learning_rate': 1.3883e-04, 'epoch': 1.06} + +[INFO|2025-01-23 04:19:57] logging.py:157 >> {'loss': 1.8759, 'learning_rate': 1.3841e-04, 'epoch': 1.07} + +[INFO|2025-01-23 04:20:24] logging.py:157 >> {'loss': 1.8954, 'learning_rate': 1.3799e-04, 'epoch': 1.07} + +[INFO|2025-01-23 04:20:50] logging.py:157 >> {'loss': 2.0030, 'learning_rate': 1.3757e-04, 'epoch': 1.07} + +[INFO|2025-01-23 04:21:17] logging.py:157 >> {'loss': 2.1761, 'learning_rate': 1.3715e-04, 'epoch': 1.07} + +[INFO|2025-01-23 04:21:43] logging.py:157 >> {'loss': 1.9865, 'learning_rate': 1.3673e-04, 'epoch': 1.07} + +[INFO|2025-01-23 04:22:09] logging.py:157 >> {'loss': 2.0329, 'learning_rate': 1.3630e-04, 'epoch': 1.07} + +[INFO|2025-01-23 04:22:36] logging.py:157 >> {'loss': 1.9419, 'learning_rate': 1.3588e-04, 'epoch': 1.08} + +[INFO|2025-01-23 04:23:02] logging.py:157 >> {'loss': 1.8413, 'learning_rate': 1.3546e-04, 'epoch': 1.08} + +[INFO|2025-01-23 04:23:28] logging.py:157 >> {'loss': 2.0163, 'learning_rate': 1.3504e-04, 'epoch': 1.08} + +[INFO|2025-01-23 04:23:55] logging.py:157 >> {'loss': 1.9333, 'learning_rate': 1.3462e-04, 'epoch': 1.08} + +[INFO|2025-01-23 04:24:21] logging.py:157 >> {'loss': 1.7206, 'learning_rate': 1.3420e-04, 'epoch': 1.08} + +[INFO|2025-01-23 04:24:47] logging.py:157 >> {'loss': 1.9189, 'learning_rate': 1.3378e-04, 'epoch': 1.09} + +[INFO|2025-01-23 04:25:14] logging.py:157 >> {'loss': 2.0069, 'learning_rate': 1.3336e-04, 'epoch': 1.09} + +[INFO|2025-01-23 04:25:40] logging.py:157 >> {'loss': 1.9691, 'learning_rate': 1.3293e-04, 'epoch': 1.09} + +[INFO|2025-01-23 04:26:06] logging.py:157 >> {'loss': 1.8643, 'learning_rate': 1.3251e-04, 'epoch': 1.09} + +[INFO|2025-01-23 04:26:32] logging.py:157 >> {'loss': 2.0899, 'learning_rate': 1.3209e-04, 'epoch': 1.09} + +[INFO|2025-01-23 04:27:00] logging.py:157 >> {'loss': 1.7938, 'learning_rate': 1.3167e-04, 'epoch': 1.09} + +[INFO|2025-01-23 04:27:26] logging.py:157 >> {'loss': 1.9930, 'learning_rate': 1.3125e-04, 'epoch': 1.10} + +[INFO|2025-01-23 04:27:51] logging.py:157 >> {'loss': 1.8665, 'learning_rate': 1.3083e-04, 'epoch': 1.10} + +[INFO|2025-01-23 04:28:17] logging.py:157 >> {'loss': 1.9102, 'learning_rate': 1.3041e-04, 'epoch': 1.10} + +[INFO|2025-01-23 04:28:43] logging.py:157 >> {'loss': 2.0180, 'learning_rate': 1.2999e-04, 'epoch': 1.10} + +[INFO|2025-01-23 04:29:09] logging.py:157 >> {'loss': 2.1041, 'learning_rate': 1.2958e-04, 'epoch': 1.10} + +[INFO|2025-01-23 04:29:34] logging.py:157 >> {'loss': 1.8694, 'learning_rate': 1.2916e-04, 'epoch': 1.10} + +[INFO|2025-01-23 04:30:00] logging.py:157 >> {'loss': 2.0600, 'learning_rate': 1.2874e-04, 'epoch': 1.11} + +[INFO|2025-01-23 04:30:26] logging.py:157 >> {'loss': 2.2009, 'learning_rate': 1.2832e-04, 'epoch': 1.11} + +[INFO|2025-01-23 04:30:52] logging.py:157 >> {'loss': 2.0763, 'learning_rate': 1.2790e-04, 'epoch': 1.11} + +[INFO|2025-01-23 04:31:18] logging.py:157 >> {'loss': 2.0429, 'learning_rate': 1.2748e-04, 'epoch': 1.11} + +[INFO|2025-01-23 04:31:44] logging.py:157 >> {'loss': 1.9824, 'learning_rate': 1.2706e-04, 'epoch': 1.11} + +[INFO|2025-01-23 04:32:09] logging.py:157 >> {'loss': 1.9851, 'learning_rate': 1.2664e-04, 'epoch': 1.12} + +[INFO|2025-01-23 04:32:35] logging.py:157 >> {'loss': 2.0281, 'learning_rate': 1.2623e-04, 'epoch': 1.12} + +[INFO|2025-01-23 04:33:01] logging.py:157 >> {'loss': 2.0809, 'learning_rate': 1.2581e-04, 'epoch': 1.12} + +[INFO|2025-01-23 04:33:27] logging.py:157 >> {'loss': 1.9259, 'learning_rate': 1.2539e-04, 'epoch': 1.12} + +[INFO|2025-01-23 04:33:53] logging.py:157 >> {'loss': 2.0725, 'learning_rate': 1.2497e-04, 'epoch': 1.12} + +[INFO|2025-01-23 04:34:19] logging.py:157 >> {'loss': 1.9467, 'learning_rate': 1.2456e-04, 'epoch': 1.12} + +[INFO|2025-01-23 04:34:45] logging.py:157 >> {'loss': 1.9903, 'learning_rate': 1.2414e-04, 'epoch': 1.13} + +[INFO|2025-01-23 04:35:11] logging.py:157 >> {'loss': 2.1110, 'learning_rate': 1.2372e-04, 'epoch': 1.13} + +[INFO|2025-01-23 04:35:38] logging.py:157 >> {'loss': 1.9945, 'learning_rate': 1.2330e-04, 'epoch': 1.13} + +[INFO|2025-01-23 04:36:03] logging.py:157 >> {'loss': 1.9593, 'learning_rate': 1.2289e-04, 'epoch': 1.13} + +[INFO|2025-01-23 04:36:29] logging.py:157 >> {'loss': 1.9419, 'learning_rate': 1.2247e-04, 'epoch': 1.13} + +[INFO|2025-01-23 04:36:54] logging.py:157 >> {'loss': 2.1435, 'learning_rate': 1.2206e-04, 'epoch': 1.13} + +[INFO|2025-01-23 04:37:19] logging.py:157 >> {'loss': 2.0217, 'learning_rate': 1.2164e-04, 'epoch': 1.14} + +[INFO|2025-01-23 04:37:45] logging.py:157 >> {'loss': 1.8215, 'learning_rate': 1.2122e-04, 'epoch': 1.14} + +[INFO|2025-01-23 04:38:10] logging.py:157 >> {'loss': 1.9606, 'learning_rate': 1.2081e-04, 'epoch': 1.14} + +[INFO|2025-01-23 04:38:35] logging.py:157 >> {'loss': 2.1079, 'learning_rate': 1.2039e-04, 'epoch': 1.14} + +[INFO|2025-01-23 04:39:01] logging.py:157 >> {'loss': 1.8784, 'learning_rate': 1.1998e-04, 'epoch': 1.14} + +[INFO|2025-01-23 04:39:26] logging.py:157 >> {'loss': 2.1676, 'learning_rate': 1.1956e-04, 'epoch': 1.15} + +[INFO|2025-01-23 04:39:51] logging.py:157 >> {'loss': 1.9355, 'learning_rate': 1.1915e-04, 'epoch': 1.15} + +[INFO|2025-01-23 04:40:16] logging.py:157 >> {'loss': 2.0074, 'learning_rate': 1.1874e-04, 'epoch': 1.15} + +[INFO|2025-01-23 04:40:42] logging.py:157 >> {'loss': 1.8919, 'learning_rate': 1.1832e-04, 'epoch': 1.15} + +[INFO|2025-01-23 04:41:07] logging.py:157 >> {'loss': 1.8595, 'learning_rate': 1.1791e-04, 'epoch': 1.15} + +[INFO|2025-01-23 04:41:32] logging.py:157 >> {'loss': 2.0580, 'learning_rate': 1.1750e-04, 'epoch': 1.15} + +[INFO|2025-01-23 04:41:57] logging.py:157 >> {'loss': 2.0532, 'learning_rate': 1.1708e-04, 'epoch': 1.16} + +[INFO|2025-01-23 04:42:23] logging.py:157 >> {'loss': 2.0180, 'learning_rate': 1.1667e-04, 'epoch': 1.16} + +[INFO|2025-01-23 04:42:48] logging.py:157 >> {'loss': 2.0849, 'learning_rate': 1.1626e-04, 'epoch': 1.16} + +[INFO|2025-01-23 04:43:13] logging.py:157 >> {'loss': 1.8444, 'learning_rate': 1.1584e-04, 'epoch': 1.16} + +[INFO|2025-01-23 04:43:38] logging.py:157 >> {'loss': 1.9923, 'learning_rate': 1.1543e-04, 'epoch': 1.16} + +[INFO|2025-01-23 04:44:05] logging.py:157 >> {'loss': 1.9986, 'learning_rate': 1.1502e-04, 'epoch': 1.16} + +[INFO|2025-01-23 04:44:30] logging.py:157 >> {'loss': 1.9281, 'learning_rate': 1.1461e-04, 'epoch': 1.17} + +[INFO|2025-01-23 04:44:55] logging.py:157 >> {'loss': 1.9358, 'learning_rate': 1.1420e-04, 'epoch': 1.17} + +[INFO|2025-01-23 04:45:19] logging.py:157 >> {'loss': 1.8651, 'learning_rate': 1.1379e-04, 'epoch': 1.17} + +[INFO|2025-01-23 04:45:44] logging.py:157 >> {'loss': 1.8506, 'learning_rate': 1.1338e-04, 'epoch': 1.17} + +[INFO|2025-01-23 04:46:09] logging.py:157 >> {'loss': 1.9455, 'learning_rate': 1.1297e-04, 'epoch': 1.17} + +[INFO|2025-01-23 04:46:34] logging.py:157 >> {'loss': 2.0772, 'learning_rate': 1.1256e-04, 'epoch': 1.18} + +[INFO|2025-01-23 04:46:59] logging.py:157 >> {'loss': 1.8982, 'learning_rate': 1.1215e-04, 'epoch': 1.18} + +[INFO|2025-01-23 04:47:24] logging.py:157 >> {'loss': 1.9232, 'learning_rate': 1.1174e-04, 'epoch': 1.18} + +[INFO|2025-01-23 04:47:48] logging.py:157 >> {'loss': 1.9702, 'learning_rate': 1.1133e-04, 'epoch': 1.18} + +[INFO|2025-01-23 04:48:13] logging.py:157 >> {'loss': 1.8255, 'learning_rate': 1.1092e-04, 'epoch': 1.18} + +[INFO|2025-01-23 04:48:38] logging.py:157 >> {'loss': 1.8687, 'learning_rate': 1.1051e-04, 'epoch': 1.18} + +[INFO|2025-01-23 04:49:03] logging.py:157 >> {'loss': 2.0063, 'learning_rate': 1.1010e-04, 'epoch': 1.19} + +[INFO|2025-01-23 04:49:27] logging.py:157 >> {'loss': 2.1353, 'learning_rate': 1.0969e-04, 'epoch': 1.19} + +[INFO|2025-01-23 04:49:52] logging.py:157 >> {'loss': 1.8850, 'learning_rate': 1.0929e-04, 'epoch': 1.19} + +[INFO|2025-01-23 04:50:17] logging.py:157 >> {'loss': 1.9941, 'learning_rate': 1.0888e-04, 'epoch': 1.19} + +[INFO|2025-01-23 04:50:41] logging.py:157 >> {'loss': 1.9189, 'learning_rate': 1.0847e-04, 'epoch': 1.19} + +[INFO|2025-01-23 04:51:06] logging.py:157 >> {'loss': 1.9432, 'learning_rate': 1.0807e-04, 'epoch': 1.19} + +[INFO|2025-01-23 04:51:30] logging.py:157 >> {'loss': 1.9876, 'learning_rate': 1.0766e-04, 'epoch': 1.20} + +[INFO|2025-01-23 04:51:55] logging.py:157 >> {'loss': 2.0636, 'learning_rate': 1.0725e-04, 'epoch': 1.20} + +[INFO|2025-01-23 04:52:21] logging.py:157 >> {'loss': 1.9318, 'learning_rate': 1.0685e-04, 'epoch': 1.20} + +[INFO|2025-01-23 04:52:45] logging.py:157 >> {'loss': 1.9487, 'learning_rate': 1.0644e-04, 'epoch': 1.20} + +[INFO|2025-01-23 04:53:10] logging.py:157 >> {'loss': 1.8043, 'learning_rate': 1.0604e-04, 'epoch': 1.20} + +[INFO|2025-01-23 04:53:34] logging.py:157 >> {'loss': 2.0779, 'learning_rate': 1.0563e-04, 'epoch': 1.21} + +[INFO|2025-01-23 04:53:58] logging.py:157 >> {'loss': 1.8511, 'learning_rate': 1.0523e-04, 'epoch': 1.21} + +[INFO|2025-01-23 04:54:22] logging.py:157 >> {'loss': 2.0744, 'learning_rate': 1.0482e-04, 'epoch': 1.21} + +[INFO|2025-01-23 04:54:46] logging.py:157 >> {'loss': 1.8954, 'learning_rate': 1.0442e-04, 'epoch': 1.21} + +[INFO|2025-01-23 04:55:11] logging.py:157 >> {'loss': 1.9577, 'learning_rate': 1.0402e-04, 'epoch': 1.21} + +[INFO|2025-01-23 04:55:35] logging.py:157 >> {'loss': 1.9895, 'learning_rate': 1.0362e-04, 'epoch': 1.21} + +[INFO|2025-01-23 04:55:59] logging.py:157 >> {'loss': 2.0698, 'learning_rate': 1.0321e-04, 'epoch': 1.22} + +[INFO|2025-01-23 04:56:23] logging.py:157 >> {'loss': 2.0168, 'learning_rate': 1.0281e-04, 'epoch': 1.22} + +[INFO|2025-01-23 04:56:47] logging.py:157 >> {'loss': 2.0395, 'learning_rate': 1.0241e-04, 'epoch': 1.22} + +[INFO|2025-01-23 04:57:12] logging.py:157 >> {'loss': 1.9029, 'learning_rate': 1.0201e-04, 'epoch': 1.22} + +[INFO|2025-01-23 04:57:36] logging.py:157 >> {'loss': 1.8046, 'learning_rate': 1.0161e-04, 'epoch': 1.22} + +[INFO|2025-01-23 04:58:00] logging.py:157 >> {'loss': 1.8879, 'learning_rate': 1.0121e-04, 'epoch': 1.22} + +[INFO|2025-01-23 04:58:24] logging.py:157 >> {'loss': 2.0476, 'learning_rate': 1.0081e-04, 'epoch': 1.23} + +[INFO|2025-01-23 04:58:48] logging.py:157 >> {'loss': 2.0518, 'learning_rate': 1.0041e-04, 'epoch': 1.23} + +[INFO|2025-01-23 04:59:12] logging.py:157 >> {'loss': 1.9547, 'learning_rate': 1.0001e-04, 'epoch': 1.23} + +[INFO|2025-01-23 04:59:37] logging.py:157 >> {'loss': 1.9264, 'learning_rate': 9.9609e-05, 'epoch': 1.23} + +[INFO|2025-01-23 05:00:01] logging.py:157 >> {'loss': 1.9587, 'learning_rate': 9.9210e-05, 'epoch': 1.23} + +[INFO|2025-01-23 05:00:26] logging.py:157 >> {'loss': 1.8541, 'learning_rate': 9.8812e-05, 'epoch': 1.24} + +[INFO|2025-01-23 05:00:50] logging.py:157 >> {'loss': 1.9058, 'learning_rate': 9.8414e-05, 'epoch': 1.24} + +[INFO|2025-01-23 05:01:14] logging.py:157 >> {'loss': 1.9184, 'learning_rate': 9.8017e-05, 'epoch': 1.24} + +[INFO|2025-01-23 05:01:37] logging.py:157 >> {'loss': 1.9931, 'learning_rate': 9.7620e-05, 'epoch': 1.24} + +[INFO|2025-01-23 05:02:01] logging.py:157 >> {'loss': 2.0504, 'learning_rate': 9.7224e-05, 'epoch': 1.24} + +[INFO|2025-01-23 05:02:24] logging.py:157 >> {'loss': 1.9253, 'learning_rate': 9.6828e-05, 'epoch': 1.24} + +[INFO|2025-01-23 05:02:48] logging.py:157 >> {'loss': 2.0314, 'learning_rate': 9.6432e-05, 'epoch': 1.25} + +[INFO|2025-01-23 05:03:11] logging.py:157 >> {'loss': 1.9675, 'learning_rate': 9.6037e-05, 'epoch': 1.25} + +[INFO|2025-01-23 05:03:35] logging.py:157 >> {'loss': 1.9578, 'learning_rate': 9.5642e-05, 'epoch': 1.25} + +[INFO|2025-01-23 05:03:59] logging.py:157 >> {'loss': 1.8483, 'learning_rate': 9.5248e-05, 'epoch': 1.25} + +[INFO|2025-01-23 05:04:22] logging.py:157 >> {'loss': 2.1405, 'learning_rate': 9.4854e-05, 'epoch': 1.25} + +[INFO|2025-01-23 05:04:46] logging.py:157 >> {'loss': 1.9838, 'learning_rate': 9.4460e-05, 'epoch': 1.25} + +[INFO|2025-01-23 05:05:09] logging.py:157 >> {'loss': 1.9995, 'learning_rate': 9.4067e-05, 'epoch': 1.26} + +[INFO|2025-01-23 05:05:33] logging.py:157 >> {'loss': 1.8116, 'learning_rate': 9.3675e-05, 'epoch': 1.26} + +[INFO|2025-01-23 05:05:57] logging.py:157 >> {'loss': 2.0901, 'learning_rate': 9.3283e-05, 'epoch': 1.26} + +[INFO|2025-01-23 05:06:20] logging.py:157 >> {'loss': 1.8706, 'learning_rate': 9.2891e-05, 'epoch': 1.26} + +[INFO|2025-01-23 05:06:44] logging.py:157 >> {'loss': 2.0057, 'learning_rate': 9.2500e-05, 'epoch': 1.26} + +[INFO|2025-01-23 05:07:08] logging.py:157 >> {'loss': 1.9914, 'learning_rate': 9.2109e-05, 'epoch': 1.27} + +[INFO|2025-01-23 05:07:31] logging.py:157 >> {'loss': 2.1123, 'learning_rate': 9.1719e-05, 'epoch': 1.27} + +[INFO|2025-01-23 05:07:55] logging.py:157 >> {'loss': 2.0011, 'learning_rate': 9.1329e-05, 'epoch': 1.27} + +[INFO|2025-01-23 05:08:20] logging.py:157 >> {'loss': 2.0393, 'learning_rate': 9.0939e-05, 'epoch': 1.27} + +[INFO|2025-01-23 05:08:43] logging.py:157 >> {'loss': 2.0263, 'learning_rate': 9.0551e-05, 'epoch': 1.27} + +[INFO|2025-01-23 05:09:06] logging.py:157 >> {'loss': 1.9792, 'learning_rate': 9.0162e-05, 'epoch': 1.27} + +[INFO|2025-01-23 05:09:29] logging.py:157 >> {'loss': 2.1153, 'learning_rate': 8.9774e-05, 'epoch': 1.28} + +[INFO|2025-01-23 05:09:52] logging.py:157 >> {'loss': 2.0119, 'learning_rate': 8.9387e-05, 'epoch': 1.28} + +[INFO|2025-01-23 05:10:16] logging.py:157 >> {'loss': 1.9112, 'learning_rate': 8.9000e-05, 'epoch': 1.28} + +[INFO|2025-01-23 05:10:39] logging.py:157 >> {'loss': 2.1946, 'learning_rate': 8.8613e-05, 'epoch': 1.28} + +[INFO|2025-01-23 05:11:02] logging.py:157 >> {'loss': 2.2375, 'learning_rate': 8.8227e-05, 'epoch': 1.28} + +[INFO|2025-01-23 05:11:25] logging.py:157 >> {'loss': 2.0459, 'learning_rate': 8.7842e-05, 'epoch': 1.28} + +[INFO|2025-01-23 05:11:48] logging.py:157 >> {'loss': 2.0742, 'learning_rate': 8.7457e-05, 'epoch': 1.29} + +[INFO|2025-01-23 05:12:11] logging.py:157 >> {'loss': 1.9052, 'learning_rate': 8.7072e-05, 'epoch': 1.29} + +[INFO|2025-01-23 05:12:34] logging.py:157 >> {'loss': 1.8587, 'learning_rate': 8.6688e-05, 'epoch': 1.29} + +[INFO|2025-01-23 05:12:57] logging.py:157 >> {'loss': 2.0541, 'learning_rate': 8.6305e-05, 'epoch': 1.29} + +[INFO|2025-01-23 05:13:20] logging.py:157 >> {'loss': 1.8486, 'learning_rate': 8.5922e-05, 'epoch': 1.29} + +[INFO|2025-01-23 05:13:43] logging.py:157 >> {'loss': 2.0947, 'learning_rate': 8.5539e-05, 'epoch': 1.30} + +[INFO|2025-01-23 05:14:06] logging.py:157 >> {'loss': 1.9047, 'learning_rate': 8.5157e-05, 'epoch': 1.30} + +[INFO|2025-01-23 05:14:29] logging.py:157 >> {'loss': 2.1091, 'learning_rate': 8.4776e-05, 'epoch': 1.30} + +[INFO|2025-01-23 05:14:52] logging.py:157 >> {'loss': 2.1205, 'learning_rate': 8.4395e-05, 'epoch': 1.30} + +[INFO|2025-01-23 05:15:15] logging.py:157 >> {'loss': 1.9346, 'learning_rate': 8.4014e-05, 'epoch': 1.30} + +[INFO|2025-01-23 05:15:38] logging.py:157 >> {'loss': 1.9801, 'learning_rate': 8.3635e-05, 'epoch': 1.30} + +[INFO|2025-01-23 05:16:03] logging.py:157 >> {'loss': 2.1433, 'learning_rate': 8.3255e-05, 'epoch': 1.31} + +[INFO|2025-01-23 05:16:25] logging.py:157 >> {'loss': 1.8015, 'learning_rate': 8.2876e-05, 'epoch': 1.31} + +[INFO|2025-01-23 05:16:48] logging.py:157 >> {'loss': 2.0684, 'learning_rate': 8.2498e-05, 'epoch': 1.31} + +[INFO|2025-01-23 05:17:10] logging.py:157 >> {'loss': 1.9730, 'learning_rate': 8.2120e-05, 'epoch': 1.31} + +[INFO|2025-01-23 05:17:33] logging.py:157 >> {'loss': 1.8917, 'learning_rate': 8.1743e-05, 'epoch': 1.31} + +[INFO|2025-01-23 05:17:55] logging.py:157 >> {'loss': 1.9114, 'learning_rate': 8.1366e-05, 'epoch': 1.31} + +[INFO|2025-01-23 05:18:18] logging.py:157 >> {'loss': 1.9653, 'learning_rate': 8.0990e-05, 'epoch': 1.32} + +[INFO|2025-01-23 05:18:41] logging.py:157 >> {'loss': 1.8212, 'learning_rate': 8.0615e-05, 'epoch': 1.32} + +[INFO|2025-01-23 05:19:03] logging.py:157 >> {'loss': 2.0440, 'learning_rate': 8.0240e-05, 'epoch': 1.32} + +[INFO|2025-01-23 05:19:26] logging.py:157 >> {'loss': 1.9966, 'learning_rate': 7.9865e-05, 'epoch': 1.32} + +[INFO|2025-01-23 05:19:48] logging.py:157 >> {'loss': 2.0253, 'learning_rate': 7.9491e-05, 'epoch': 1.32} + +[INFO|2025-01-23 05:20:11] logging.py:157 >> {'loss': 1.8925, 'learning_rate': 7.9118e-05, 'epoch': 1.33} + +[INFO|2025-01-23 05:20:33] logging.py:157 >> {'loss': 1.9373, 'learning_rate': 7.8745e-05, 'epoch': 1.33} + +[INFO|2025-01-23 05:20:56] logging.py:157 >> {'loss': 1.9891, 'learning_rate': 7.8373e-05, 'epoch': 1.33} + +[INFO|2025-01-23 05:21:18] logging.py:157 >> {'loss': 1.7900, 'learning_rate': 7.8001e-05, 'epoch': 1.33} + +[INFO|2025-01-23 05:21:41] logging.py:157 >> {'loss': 2.0676, 'learning_rate': 7.7630e-05, 'epoch': 1.33} + +[INFO|2025-01-23 05:22:03] logging.py:157 >> {'loss': 1.9150, 'learning_rate': 7.7260e-05, 'epoch': 1.33} + +[INFO|2025-01-23 05:22:26] logging.py:157 >> {'loss': 1.9927, 'learning_rate': 7.6890e-05, 'epoch': 1.34} + +[INFO|2025-01-23 05:22:48] logging.py:157 >> {'loss': 1.8792, 'learning_rate': 7.6520e-05, 'epoch': 1.34} + +[INFO|2025-01-23 05:23:11] logging.py:157 >> {'loss': 1.8890, 'learning_rate': 7.6152e-05, 'epoch': 1.34} + +[INFO|2025-01-23 05:23:35] logging.py:157 >> {'loss': 1.9541, 'learning_rate': 7.5783e-05, 'epoch': 1.34} + +[INFO|2025-01-23 05:23:56] logging.py:157 >> {'loss': 2.0380, 'learning_rate': 7.5416e-05, 'epoch': 1.34} + +[INFO|2025-01-23 05:24:19] logging.py:157 >> {'loss': 1.8775, 'learning_rate': 7.5049e-05, 'epoch': 1.34} + +[INFO|2025-01-23 05:24:41] logging.py:157 >> {'loss': 1.9517, 'learning_rate': 7.4683e-05, 'epoch': 1.35} + +[INFO|2025-01-23 05:25:02] logging.py:157 >> {'loss': 1.8343, 'learning_rate': 7.4317e-05, 'epoch': 1.35} + +[INFO|2025-01-23 05:25:24] logging.py:157 >> {'loss': 2.0133, 'learning_rate': 7.3952e-05, 'epoch': 1.35} + +[INFO|2025-01-23 05:25:46] logging.py:157 >> {'loss': 1.8356, 'learning_rate': 7.3587e-05, 'epoch': 1.35} + +[INFO|2025-01-23 05:26:08] logging.py:157 >> {'loss': 2.0353, 'learning_rate': 7.3223e-05, 'epoch': 1.35} + +[INFO|2025-01-23 05:26:30] logging.py:157 >> {'loss': 2.0202, 'learning_rate': 7.2860e-05, 'epoch': 1.36} + +[INFO|2025-01-23 05:26:52] logging.py:157 >> {'loss': 2.0012, 'learning_rate': 7.2497e-05, 'epoch': 1.36} + +[INFO|2025-01-23 05:27:15] logging.py:157 >> {'loss': 2.0860, 'learning_rate': 7.2135e-05, 'epoch': 1.36} + +[INFO|2025-01-23 05:27:36] logging.py:157 >> {'loss': 2.0772, 'learning_rate': 7.1773e-05, 'epoch': 1.36} + +[INFO|2025-01-23 05:27:58] logging.py:157 >> {'loss': 1.8464, 'learning_rate': 7.1412e-05, 'epoch': 1.36} + +[INFO|2025-01-23 05:28:20] logging.py:157 >> {'loss': 2.0004, 'learning_rate': 7.1052e-05, 'epoch': 1.36} + +[INFO|2025-01-23 05:28:42] logging.py:157 >> {'loss': 2.0437, 'learning_rate': 7.0692e-05, 'epoch': 1.37} + +[INFO|2025-01-23 05:29:04] logging.py:157 >> {'loss': 1.9603, 'learning_rate': 7.0333e-05, 'epoch': 1.37} + +[INFO|2025-01-23 05:29:26] logging.py:157 >> {'loss': 1.9865, 'learning_rate': 6.9975e-05, 'epoch': 1.37} + +[INFO|2025-01-23 05:29:48] logging.py:157 >> {'loss': 1.8500, 'learning_rate': 6.9617e-05, 'epoch': 1.37} + +[INFO|2025-01-23 05:30:10] logging.py:157 >> {'loss': 2.0996, 'learning_rate': 6.9260e-05, 'epoch': 1.37} + +[INFO|2025-01-23 05:30:32] logging.py:157 >> {'loss': 1.9583, 'learning_rate': 6.8904e-05, 'epoch': 1.37} + +[INFO|2025-01-23 05:30:56] logging.py:157 >> {'loss': 1.9905, 'learning_rate': 6.8548e-05, 'epoch': 1.38} + +[INFO|2025-01-23 05:31:17] logging.py:157 >> {'loss': 1.9506, 'learning_rate': 6.8193e-05, 'epoch': 1.38} + +[INFO|2025-01-23 05:31:39] logging.py:157 >> {'loss': 1.9496, 'learning_rate': 6.7838e-05, 'epoch': 1.38} + +[INFO|2025-01-23 05:32:00] logging.py:157 >> {'loss': 2.0638, 'learning_rate': 6.7485e-05, 'epoch': 1.38} + +[INFO|2025-01-23 05:32:21] logging.py:157 >> {'loss': 1.8883, 'learning_rate': 6.7131e-05, 'epoch': 1.38} + +[INFO|2025-01-23 05:32:43] logging.py:157 >> {'loss': 2.0215, 'learning_rate': 6.6779e-05, 'epoch': 1.39} + +[INFO|2025-01-23 05:33:04] logging.py:157 >> {'loss': 2.0483, 'learning_rate': 6.6427e-05, 'epoch': 1.39} + +[INFO|2025-01-23 05:33:26] logging.py:157 >> {'loss': 1.8987, 'learning_rate': 6.6076e-05, 'epoch': 1.39} + +[INFO|2025-01-23 05:33:47] logging.py:157 >> {'loss': 1.8423, 'learning_rate': 6.5725e-05, 'epoch': 1.39} + +[INFO|2025-01-23 05:34:09] logging.py:157 >> {'loss': 1.9584, 'learning_rate': 6.5375e-05, 'epoch': 1.39} + +[INFO|2025-01-23 05:34:30] logging.py:157 >> {'loss': 2.0189, 'learning_rate': 6.5026e-05, 'epoch': 1.39} + +[INFO|2025-01-23 05:34:51] logging.py:157 >> {'loss': 1.8980, 'learning_rate': 6.4678e-05, 'epoch': 1.40} + +[INFO|2025-01-23 05:35:13] logging.py:157 >> {'loss': 1.9088, 'learning_rate': 6.4330e-05, 'epoch': 1.40} + +[INFO|2025-01-23 05:35:34] logging.py:157 >> {'loss': 1.9583, 'learning_rate': 6.3983e-05, 'epoch': 1.40} + +[INFO|2025-01-23 05:35:56] logging.py:157 >> {'loss': 1.8288, 'learning_rate': 6.3636e-05, 'epoch': 1.40} + +[INFO|2025-01-23 05:36:17] logging.py:157 >> {'loss': 1.9538, 'learning_rate': 6.3291e-05, 'epoch': 1.40} + +[INFO|2025-01-23 05:36:38] logging.py:157 >> {'loss': 1.9491, 'learning_rate': 6.2945e-05, 'epoch': 1.40} + +[INFO|2025-01-23 05:37:00] logging.py:157 >> {'loss': 1.9820, 'learning_rate': 6.2601e-05, 'epoch': 1.41} + +[INFO|2025-01-23 05:37:21] logging.py:157 >> {'loss': 2.0367, 'learning_rate': 6.2257e-05, 'epoch': 1.41} + +[INFO|2025-01-23 05:37:42] logging.py:157 >> {'loss': 1.9349, 'learning_rate': 6.1914e-05, 'epoch': 1.41} + +[INFO|2025-01-23 05:38:05] logging.py:157 >> {'loss': 1.9296, 'learning_rate': 6.1572e-05, 'epoch': 1.41} + +[INFO|2025-01-23 05:38:26] logging.py:157 >> {'loss': 1.9549, 'learning_rate': 6.1231e-05, 'epoch': 1.41} + +[INFO|2025-01-23 05:38:47] logging.py:157 >> {'loss': 2.0469, 'learning_rate': 6.0890e-05, 'epoch': 1.42} + +[INFO|2025-01-23 05:39:08] logging.py:157 >> {'loss': 1.9037, 'learning_rate': 6.0550e-05, 'epoch': 1.42} + +[INFO|2025-01-23 05:39:29] logging.py:157 >> {'loss': 1.7788, 'learning_rate': 6.0210e-05, 'epoch': 1.42} + +[INFO|2025-01-23 05:39:50] logging.py:157 >> {'loss': 1.9419, 'learning_rate': 5.9871e-05, 'epoch': 1.42} + +[INFO|2025-01-23 05:40:11] logging.py:157 >> {'loss': 1.9051, 'learning_rate': 5.9533e-05, 'epoch': 1.42} + +[INFO|2025-01-23 05:40:31] logging.py:157 >> {'loss': 1.9196, 'learning_rate': 5.9196e-05, 'epoch': 1.42} + +[INFO|2025-01-23 05:40:52] logging.py:157 >> {'loss': 2.0086, 'learning_rate': 5.8859e-05, 'epoch': 1.43} + +[INFO|2025-01-23 05:41:13] logging.py:157 >> {'loss': 1.9886, 'learning_rate': 5.8524e-05, 'epoch': 1.43} + +[INFO|2025-01-23 05:41:34] logging.py:157 >> {'loss': 2.0076, 'learning_rate': 5.8188e-05, 'epoch': 1.43} + +[INFO|2025-01-23 05:41:55] logging.py:157 >> {'loss': 1.8839, 'learning_rate': 5.7854e-05, 'epoch': 1.43} + +[INFO|2025-01-23 05:42:16] logging.py:157 >> {'loss': 2.1015, 'learning_rate': 5.7520e-05, 'epoch': 1.43} + +[INFO|2025-01-23 05:42:37] logging.py:157 >> {'loss': 2.1090, 'learning_rate': 5.7187e-05, 'epoch': 1.43} + +[INFO|2025-01-23 05:42:58] logging.py:157 >> {'loss': 1.8690, 'learning_rate': 5.6855e-05, 'epoch': 1.44} + +[INFO|2025-01-23 05:43:19] logging.py:157 >> {'loss': 2.0871, 'learning_rate': 5.6524e-05, 'epoch': 1.44} + +[INFO|2025-01-23 05:43:40] logging.py:157 >> {'loss': 2.0683, 'learning_rate': 5.6193e-05, 'epoch': 1.44} + +[INFO|2025-01-23 05:44:01] logging.py:157 >> {'loss': 2.0001, 'learning_rate': 5.5863e-05, 'epoch': 1.44} + +[INFO|2025-01-23 05:44:22] logging.py:157 >> {'loss': 1.8997, 'learning_rate': 5.5534e-05, 'epoch': 1.44} + +[INFO|2025-01-23 05:44:42] logging.py:157 >> {'loss': 1.9751, 'learning_rate': 5.5206e-05, 'epoch': 1.45} + +[INFO|2025-01-23 05:45:04] logging.py:157 >> {'loss': 1.9433, 'learning_rate': 5.4878e-05, 'epoch': 1.45} + +[INFO|2025-01-23 05:45:24] logging.py:157 >> {'loss': 1.7073, 'learning_rate': 5.4551e-05, 'epoch': 1.45} + +[INFO|2025-01-23 05:45:45] logging.py:157 >> {'loss': 1.9333, 'learning_rate': 5.4225e-05, 'epoch': 1.45} + +[INFO|2025-01-23 05:46:05] logging.py:157 >> {'loss': 1.9659, 'learning_rate': 5.3899e-05, 'epoch': 1.45} + +[INFO|2025-01-23 05:46:25] logging.py:157 >> {'loss': 2.1037, 'learning_rate': 5.3575e-05, 'epoch': 1.45} + +[INFO|2025-01-23 05:46:46] logging.py:157 >> {'loss': 1.9793, 'learning_rate': 5.3251e-05, 'epoch': 1.46} + +[INFO|2025-01-23 05:47:06] logging.py:157 >> {'loss': 2.0097, 'learning_rate': 5.2928e-05, 'epoch': 1.46} + +[INFO|2025-01-23 05:47:26] logging.py:157 >> {'loss': 2.0911, 'learning_rate': 5.2605e-05, 'epoch': 1.46} + +[INFO|2025-01-23 05:47:47] logging.py:157 >> {'loss': 2.1522, 'learning_rate': 5.2284e-05, 'epoch': 1.46} + +[INFO|2025-01-23 05:48:07] logging.py:157 >> {'loss': 1.9244, 'learning_rate': 5.1963e-05, 'epoch': 1.46} + +[INFO|2025-01-23 05:48:27] logging.py:157 >> {'loss': 2.0307, 'learning_rate': 5.1643e-05, 'epoch': 1.46} + +[INFO|2025-01-23 05:48:47] logging.py:157 >> {'loss': 2.0750, 'learning_rate': 5.1324e-05, 'epoch': 1.47} + +[INFO|2025-01-23 05:49:08] logging.py:157 >> {'loss': 2.0797, 'learning_rate': 5.1005e-05, 'epoch': 1.47} + +[INFO|2025-01-23 05:49:28] logging.py:157 >> {'loss': 2.0316, 'learning_rate': 5.0688e-05, 'epoch': 1.47} + +[INFO|2025-01-23 05:49:48] logging.py:157 >> {'loss': 2.0134, 'learning_rate': 5.0371e-05, 'epoch': 1.47} + +[INFO|2025-01-23 05:50:09] logging.py:157 >> {'loss': 2.0050, 'learning_rate': 5.0055e-05, 'epoch': 1.47} + +[INFO|2025-01-23 05:50:29] logging.py:157 >> {'loss': 2.0198, 'learning_rate': 4.9740e-05, 'epoch': 1.48} + +[INFO|2025-01-23 05:50:49] logging.py:157 >> {'loss': 1.9722, 'learning_rate': 4.9425e-05, 'epoch': 1.48} + +[INFO|2025-01-23 05:51:10] logging.py:157 >> {'loss': 1.8712, 'learning_rate': 4.9111e-05, 'epoch': 1.48} + +[INFO|2025-01-23 05:51:30] logging.py:157 >> {'loss': 1.7412, 'learning_rate': 4.8799e-05, 'epoch': 1.48} + +[INFO|2025-01-23 05:51:51] logging.py:157 >> {'loss': 1.8953, 'learning_rate': 4.8486e-05, 'epoch': 1.48} + +[INFO|2025-01-23 05:52:11] logging.py:157 >> {'loss': 1.9579, 'learning_rate': 4.8175e-05, 'epoch': 1.48} + +[INFO|2025-01-23 05:52:31] logging.py:157 >> {'loss': 1.9982, 'learning_rate': 4.7865e-05, 'epoch': 1.49} + +[INFO|2025-01-23 05:52:51] logging.py:157 >> {'loss': 1.9502, 'learning_rate': 4.7555e-05, 'epoch': 1.49} + +[INFO|2025-01-23 05:53:11] logging.py:157 >> {'loss': 2.0328, 'learning_rate': 4.7246e-05, 'epoch': 1.49} + +[INFO|2025-01-23 05:53:30] logging.py:157 >> {'loss': 1.7977, 'learning_rate': 4.6938e-05, 'epoch': 1.49} + +[INFO|2025-01-23 05:53:50] logging.py:157 >> {'loss': 2.0124, 'learning_rate': 4.6631e-05, 'epoch': 1.49} + +[INFO|2025-01-23 05:54:10] logging.py:157 >> {'loss': 2.0137, 'learning_rate': 4.6325e-05, 'epoch': 1.49} + +[INFO|2025-01-23 05:54:30] logging.py:157 >> {'loss': 1.9050, 'learning_rate': 4.6019e-05, 'epoch': 1.50} + +[INFO|2025-01-23 05:54:50] logging.py:157 >> {'loss': 1.7853, 'learning_rate': 4.5715e-05, 'epoch': 1.50} + +[INFO|2025-01-23 05:55:09] logging.py:157 >> {'loss': 1.9960, 'learning_rate': 4.5411e-05, 'epoch': 1.50} + +[INFO|2025-01-23 05:55:29] logging.py:157 >> {'loss': 1.8462, 'learning_rate': 4.5108e-05, 'epoch': 1.50} + +[INFO|2025-01-23 05:55:49] logging.py:157 >> {'loss': 1.9531, 'learning_rate': 4.4806e-05, 'epoch': 1.50} + +[INFO|2025-01-23 05:56:09] logging.py:157 >> {'loss': 1.9370, 'learning_rate': 4.4504e-05, 'epoch': 1.51} + +[INFO|2025-01-23 05:56:28] logging.py:157 >> {'loss': 2.0452, 'learning_rate': 4.4204e-05, 'epoch': 1.51} + +[INFO|2025-01-23 05:56:48] logging.py:157 >> {'loss': 1.9186, 'learning_rate': 4.3904e-05, 'epoch': 1.51} + +[INFO|2025-01-23 05:57:08] logging.py:157 >> {'loss': 1.9893, 'learning_rate': 4.3605e-05, 'epoch': 1.51} + +[INFO|2025-01-23 05:57:28] logging.py:157 >> {'loss': 1.9865, 'learning_rate': 4.3307e-05, 'epoch': 1.51} + +[INFO|2025-01-23 05:57:47] logging.py:157 >> {'loss': 1.7901, 'learning_rate': 4.3010e-05, 'epoch': 1.51} + +[INFO|2025-01-23 05:58:07] logging.py:157 >> {'loss': 1.8788, 'learning_rate': 4.2714e-05, 'epoch': 1.52} + +[INFO|2025-01-23 05:58:28] logging.py:157 >> {'loss': 1.8669, 'learning_rate': 4.2418e-05, 'epoch': 1.52} + +[INFO|2025-01-23 05:58:47] logging.py:157 >> {'loss': 1.9998, 'learning_rate': 4.2124e-05, 'epoch': 1.52} + +[INFO|2025-01-23 05:59:06] logging.py:157 >> {'loss': 1.9388, 'learning_rate': 4.1830e-05, 'epoch': 1.52} + +[INFO|2025-01-23 05:59:26] logging.py:157 >> {'loss': 1.9969, 'learning_rate': 4.1537e-05, 'epoch': 1.52} + +[INFO|2025-01-23 05:59:45] logging.py:157 >> {'loss': 1.8527, 'learning_rate': 4.1245e-05, 'epoch': 1.52} + +[INFO|2025-01-23 06:00:04] logging.py:157 >> {'loss': 1.8984, 'learning_rate': 4.0954e-05, 'epoch': 1.53} + +[INFO|2025-01-23 06:00:24] logging.py:157 >> {'loss': 1.9105, 'learning_rate': 4.0664e-05, 'epoch': 1.53} + +[INFO|2025-01-23 06:00:43] logging.py:157 >> {'loss': 1.8085, 'learning_rate': 4.0375e-05, 'epoch': 1.53} + +[INFO|2025-01-23 06:01:02] logging.py:157 >> {'loss': 1.9777, 'learning_rate': 4.0086e-05, 'epoch': 1.53} + +[INFO|2025-01-23 06:01:22] logging.py:157 >> {'loss': 1.8944, 'learning_rate': 3.9798e-05, 'epoch': 1.53} + +[INFO|2025-01-23 06:01:41] logging.py:157 >> {'loss': 2.0236, 'learning_rate': 3.9512e-05, 'epoch': 1.54} + +[INFO|2025-01-23 06:02:00] logging.py:157 >> {'loss': 2.0764, 'learning_rate': 3.9226e-05, 'epoch': 1.54} + +[INFO|2025-01-23 06:02:19] logging.py:157 >> {'loss': 1.8375, 'learning_rate': 3.8941e-05, 'epoch': 1.54} + +[INFO|2025-01-23 06:02:39] logging.py:157 >> {'loss': 1.8728, 'learning_rate': 3.8657e-05, 'epoch': 1.54} + +[INFO|2025-01-23 06:02:58] logging.py:157 >> {'loss': 1.9760, 'learning_rate': 3.8374e-05, 'epoch': 1.54} + +[INFO|2025-01-23 06:03:17] logging.py:157 >> {'loss': 1.8648, 'learning_rate': 3.8091e-05, 'epoch': 1.54} + +[INFO|2025-01-23 06:03:37] logging.py:157 >> {'loss': 1.9092, 'learning_rate': 3.7810e-05, 'epoch': 1.55} + +[INFO|2025-01-23 06:03:56] logging.py:157 >> {'loss': 2.0864, 'learning_rate': 3.7529e-05, 'epoch': 1.55} + +[INFO|2025-01-23 06:04:15] logging.py:157 >> {'loss': 1.9425, 'learning_rate': 3.7250e-05, 'epoch': 1.55} + +[INFO|2025-01-23 06:04:34] logging.py:157 >> {'loss': 2.1010, 'learning_rate': 3.6971e-05, 'epoch': 1.55} + +[INFO|2025-01-23 06:04:55] logging.py:157 >> {'loss': 2.0688, 'learning_rate': 3.6693e-05, 'epoch': 1.55} + +[INFO|2025-01-23 06:05:14] logging.py:157 >> {'loss': 1.9454, 'learning_rate': 3.6416e-05, 'epoch': 1.55} + +[INFO|2025-01-23 06:05:32] logging.py:157 >> {'loss': 1.7624, 'learning_rate': 3.6140e-05, 'epoch': 1.56} + +[INFO|2025-01-23 06:05:51] logging.py:157 >> {'loss': 2.1283, 'learning_rate': 3.5865e-05, 'epoch': 1.56} + +[INFO|2025-01-23 06:06:10] logging.py:157 >> {'loss': 1.8186, 'learning_rate': 3.5591e-05, 'epoch': 1.56} + +[INFO|2025-01-23 06:06:28] logging.py:157 >> {'loss': 1.9517, 'learning_rate': 3.5317e-05, 'epoch': 1.56} + +[INFO|2025-01-23 06:06:47] logging.py:157 >> {'loss': 1.8817, 'learning_rate': 3.5045e-05, 'epoch': 1.56} + +[INFO|2025-01-23 06:07:06] logging.py:157 >> {'loss': 1.8772, 'learning_rate': 3.4773e-05, 'epoch': 1.57} + +[INFO|2025-01-23 06:07:25] logging.py:157 >> {'loss': 1.9036, 'learning_rate': 3.4503e-05, 'epoch': 1.57} + +[INFO|2025-01-23 06:07:43] logging.py:157 >> {'loss': 1.9382, 'learning_rate': 3.4233e-05, 'epoch': 1.57} + +[INFO|2025-01-23 06:08:02] logging.py:157 >> {'loss': 2.0641, 'learning_rate': 3.3965e-05, 'epoch': 1.57} + +[INFO|2025-01-23 06:08:20] logging.py:157 >> {'loss': 1.9334, 'learning_rate': 3.3697e-05, 'epoch': 1.57} + +[INFO|2025-01-23 06:08:39] logging.py:157 >> {'loss': 2.0534, 'learning_rate': 3.3430e-05, 'epoch': 1.57} + +[INFO|2025-01-23 06:08:58] logging.py:157 >> {'loss': 1.8423, 'learning_rate': 3.3164e-05, 'epoch': 1.58} + +[INFO|2025-01-23 06:09:16] logging.py:157 >> {'loss': 2.0224, 'learning_rate': 3.2899e-05, 'epoch': 1.58} + +[INFO|2025-01-23 06:09:35] logging.py:157 >> {'loss': 1.7084, 'learning_rate': 3.2635e-05, 'epoch': 1.58} + +[INFO|2025-01-23 06:09:54] logging.py:157 >> {'loss': 2.0756, 'learning_rate': 3.2372e-05, 'epoch': 1.58} + +[INFO|2025-01-23 06:10:13] logging.py:157 >> {'loss': 1.9036, 'learning_rate': 3.2109e-05, 'epoch': 1.58} + +[INFO|2025-01-23 06:10:31] logging.py:157 >> {'loss': 1.9282, 'learning_rate': 3.1848e-05, 'epoch': 1.58} + +[INFO|2025-01-23 06:10:50] logging.py:157 >> {'loss': 1.9540, 'learning_rate': 3.1588e-05, 'epoch': 1.59} + +[INFO|2025-01-23 06:11:10] logging.py:157 >> {'loss': 1.9688, 'learning_rate': 3.1328e-05, 'epoch': 1.59} + +[INFO|2025-01-23 06:11:28] logging.py:157 >> {'loss': 2.0097, 'learning_rate': 3.1070e-05, 'epoch': 1.59} + +[INFO|2025-01-23 06:11:47] logging.py:157 >> {'loss': 1.9257, 'learning_rate': 3.0813e-05, 'epoch': 1.59} + +[INFO|2025-01-23 06:12:05] logging.py:157 >> {'loss': 1.8154, 'learning_rate': 3.0556e-05, 'epoch': 1.59} + +[INFO|2025-01-23 06:12:23] logging.py:157 >> {'loss': 1.9254, 'learning_rate': 3.0300e-05, 'epoch': 1.60} + +[INFO|2025-01-23 06:12:41] logging.py:157 >> {'loss': 2.0497, 'learning_rate': 3.0046e-05, 'epoch': 1.60} + +[INFO|2025-01-23 06:12:59] logging.py:157 >> {'loss': 1.9846, 'learning_rate': 2.9792e-05, 'epoch': 1.60} + +[INFO|2025-01-23 06:13:18] logging.py:157 >> {'loss': 1.8907, 'learning_rate': 2.9539e-05, 'epoch': 1.60} + +[INFO|2025-01-23 06:13:36] logging.py:157 >> {'loss': 2.0426, 'learning_rate': 2.9288e-05, 'epoch': 1.60} + +[INFO|2025-01-23 06:13:54] logging.py:157 >> {'loss': 2.0135, 'learning_rate': 2.9037e-05, 'epoch': 1.60} + +[INFO|2025-01-23 06:14:12] logging.py:157 >> {'loss': 1.9142, 'learning_rate': 2.8787e-05, 'epoch': 1.61} + +[INFO|2025-01-23 06:14:30] logging.py:157 >> {'loss': 1.9797, 'learning_rate': 2.8538e-05, 'epoch': 1.61} + +[INFO|2025-01-23 06:14:48] logging.py:157 >> {'loss': 1.9388, 'learning_rate': 2.8290e-05, 'epoch': 1.61} + +[INFO|2025-01-23 06:15:06] logging.py:157 >> {'loss': 1.9650, 'learning_rate': 2.8043e-05, 'epoch': 1.61} + +[INFO|2025-01-23 06:15:24] logging.py:157 >> {'loss': 1.8038, 'learning_rate': 2.7797e-05, 'epoch': 1.61} + +[INFO|2025-01-23 06:15:43] logging.py:157 >> {'loss': 1.9147, 'learning_rate': 2.7552e-05, 'epoch': 1.61} + +[INFO|2025-01-23 06:16:01] logging.py:157 >> {'loss': 2.0550, 'learning_rate': 2.7308e-05, 'epoch': 1.62} + +[INFO|2025-01-23 06:16:19] logging.py:157 >> {'loss': 1.9506, 'learning_rate': 2.7065e-05, 'epoch': 1.62} + +[INFO|2025-01-23 06:16:37] logging.py:157 >> {'loss': 2.0466, 'learning_rate': 2.6823e-05, 'epoch': 1.62} + +[INFO|2025-01-23 06:16:55] logging.py:157 >> {'loss': 1.9494, 'learning_rate': 2.6582e-05, 'epoch': 1.62} + +[INFO|2025-01-23 06:17:15] logging.py:157 >> {'loss': 2.0550, 'learning_rate': 2.6342e-05, 'epoch': 1.62} + +[INFO|2025-01-23 06:17:32] logging.py:157 >> {'loss': 1.9353, 'learning_rate': 2.6103e-05, 'epoch': 1.63} + +[INFO|2025-01-23 06:17:50] logging.py:157 >> {'loss': 1.9115, 'learning_rate': 2.5865e-05, 'epoch': 1.63} + +[INFO|2025-01-23 06:18:07] logging.py:157 >> {'loss': 1.9418, 'learning_rate': 2.5628e-05, 'epoch': 1.63} + +[INFO|2025-01-23 06:18:25] logging.py:157 >> {'loss': 1.8673, 'learning_rate': 2.5391e-05, 'epoch': 1.63} + +[INFO|2025-01-23 06:18:42] logging.py:157 >> {'loss': 2.0146, 'learning_rate': 2.5156e-05, 'epoch': 1.63} + +[INFO|2025-01-23 06:19:00] logging.py:157 >> {'loss': 1.9037, 'learning_rate': 2.4922e-05, 'epoch': 1.63} + +[INFO|2025-01-23 06:19:18] logging.py:157 >> {'loss': 1.9452, 'learning_rate': 2.4689e-05, 'epoch': 1.64} + +[INFO|2025-01-23 06:19:35] logging.py:157 >> {'loss': 2.0948, 'learning_rate': 2.4457e-05, 'epoch': 1.64} + +[INFO|2025-01-23 06:19:53] logging.py:157 >> {'loss': 1.9246, 'learning_rate': 2.4226e-05, 'epoch': 1.64} + +[INFO|2025-01-23 06:20:10] logging.py:157 >> {'loss': 2.0536, 'learning_rate': 2.3995e-05, 'epoch': 1.64} + +[INFO|2025-01-23 06:20:28] logging.py:157 >> {'loss': 1.9820, 'learning_rate': 2.3766e-05, 'epoch': 1.64} + +[INFO|2025-01-23 06:20:45] logging.py:157 >> {'loss': 1.9125, 'learning_rate': 2.3538e-05, 'epoch': 1.64} + +[INFO|2025-01-23 06:21:03] logging.py:157 >> {'loss': 1.7465, 'learning_rate': 2.3311e-05, 'epoch': 1.65} + +[INFO|2025-01-23 06:21:20] logging.py:157 >> {'loss': 1.9727, 'learning_rate': 2.3085e-05, 'epoch': 1.65} + +[INFO|2025-01-23 06:21:38] logging.py:157 >> {'loss': 1.7661, 'learning_rate': 2.2860e-05, 'epoch': 1.65} + +[INFO|2025-01-23 06:21:55] logging.py:157 >> {'loss': 1.8944, 'learning_rate': 2.2636e-05, 'epoch': 1.65} + +[INFO|2025-01-23 06:22:13] logging.py:157 >> {'loss': 2.0577, 'learning_rate': 2.2412e-05, 'epoch': 1.65} + +[INFO|2025-01-23 06:22:31] logging.py:157 >> {'loss': 1.8156, 'learning_rate': 2.2190e-05, 'epoch': 1.66} + +[INFO|2025-01-23 06:22:48] logging.py:157 >> {'loss': 1.9527, 'learning_rate': 2.1969e-05, 'epoch': 1.66} + +[INFO|2025-01-23 06:23:07] logging.py:157 >> {'loss': 2.1008, 'learning_rate': 2.1749e-05, 'epoch': 1.66} + +[INFO|2025-01-23 06:23:24] logging.py:157 >> {'loss': 2.1552, 'learning_rate': 2.1530e-05, 'epoch': 1.66} + +[INFO|2025-01-23 06:23:42] logging.py:157 >> {'loss': 1.8806, 'learning_rate': 2.1312e-05, 'epoch': 1.66} + +[INFO|2025-01-23 06:23:59] logging.py:157 >> {'loss': 2.0962, 'learning_rate': 2.1095e-05, 'epoch': 1.66} + +[INFO|2025-01-23 06:24:16] logging.py:157 >> {'loss': 1.9126, 'learning_rate': 2.0879e-05, 'epoch': 1.67} + +[INFO|2025-01-23 06:24:33] logging.py:157 >> {'loss': 1.8034, 'learning_rate': 2.0664e-05, 'epoch': 1.67} + +[INFO|2025-01-23 06:24:50] logging.py:157 >> {'loss': 1.8725, 'learning_rate': 2.0450e-05, 'epoch': 1.67} + +[INFO|2025-01-23 06:25:07] logging.py:157 >> {'loss': 1.8807, 'learning_rate': 2.0238e-05, 'epoch': 1.67} + +[INFO|2025-01-23 06:25:24] logging.py:157 >> {'loss': 1.8714, 'learning_rate': 2.0026e-05, 'epoch': 1.67} + +[INFO|2025-01-23 06:25:41] logging.py:157 >> {'loss': 1.9810, 'learning_rate': 1.9815e-05, 'epoch': 1.67} + +[INFO|2025-01-23 06:25:58] logging.py:157 >> {'loss': 2.0369, 'learning_rate': 1.9605e-05, 'epoch': 1.68} + +[INFO|2025-01-23 06:26:15] logging.py:157 >> {'loss': 2.1063, 'learning_rate': 1.9396e-05, 'epoch': 1.68} + +[INFO|2025-01-23 06:26:32] logging.py:157 >> {'loss': 2.1145, 'learning_rate': 1.9189e-05, 'epoch': 1.68} + +[INFO|2025-01-23 06:26:50] logging.py:157 >> {'loss': 2.0411, 'learning_rate': 1.8982e-05, 'epoch': 1.68} + +[INFO|2025-01-23 06:27:07] logging.py:157 >> {'loss': 2.0615, 'learning_rate': 1.8776e-05, 'epoch': 1.68} + +[INFO|2025-01-23 06:27:24] logging.py:157 >> {'loss': 1.9214, 'learning_rate': 1.8572e-05, 'epoch': 1.69} + +[INFO|2025-01-23 06:27:41] logging.py:157 >> {'loss': 1.6658, 'learning_rate': 1.8368e-05, 'epoch': 1.69} + +[INFO|2025-01-23 06:27:58] logging.py:157 >> {'loss': 1.9385, 'learning_rate': 1.8166e-05, 'epoch': 1.69} + +[INFO|2025-01-23 06:28:15] logging.py:157 >> {'loss': 2.0329, 'learning_rate': 1.7965e-05, 'epoch': 1.69} + +[INFO|2025-01-23 06:28:32] logging.py:157 >> {'loss': 1.9731, 'learning_rate': 1.7764e-05, 'epoch': 1.69} + +[INFO|2025-01-23 06:28:50] logging.py:157 >> {'loss': 2.0341, 'learning_rate': 1.7565e-05, 'epoch': 1.69} + +[INFO|2025-01-23 06:29:07] logging.py:157 >> {'loss': 2.0900, 'learning_rate': 1.7367e-05, 'epoch': 1.70} + +[INFO|2025-01-23 06:29:24] logging.py:157 >> {'loss': 2.1418, 'learning_rate': 1.7169e-05, 'epoch': 1.70} + +[INFO|2025-01-23 06:29:40] logging.py:157 >> {'loss': 2.0008, 'learning_rate': 1.6973e-05, 'epoch': 1.70} + +[INFO|2025-01-23 06:29:57] logging.py:157 >> {'loss': 2.0369, 'learning_rate': 1.6778e-05, 'epoch': 1.70} + +[INFO|2025-01-23 06:30:13] logging.py:157 >> {'loss': 1.9419, 'learning_rate': 1.6584e-05, 'epoch': 1.70} + +[INFO|2025-01-23 06:30:30] logging.py:157 >> {'loss': 2.0578, 'learning_rate': 1.6391e-05, 'epoch': 1.70} + +[INFO|2025-01-23 06:30:46] logging.py:157 >> {'loss': 1.8902, 'learning_rate': 1.6199e-05, 'epoch': 1.71} + +[INFO|2025-01-23 06:31:03] logging.py:157 >> {'loss': 2.0424, 'learning_rate': 1.6009e-05, 'epoch': 1.71} + +[INFO|2025-01-23 06:31:19] logging.py:157 >> {'loss': 1.8735, 'learning_rate': 1.5819e-05, 'epoch': 1.71} + +[INFO|2025-01-23 06:31:36] logging.py:157 >> {'loss': 2.0012, 'learning_rate': 1.5630e-05, 'epoch': 1.71} + +[INFO|2025-01-23 06:31:52] logging.py:157 >> {'loss': 1.9999, 'learning_rate': 1.5443e-05, 'epoch': 1.71} + +[INFO|2025-01-23 06:32:09] logging.py:157 >> {'loss': 1.9460, 'learning_rate': 1.5256e-05, 'epoch': 1.72} + +[INFO|2025-01-23 06:32:25] logging.py:157 >> {'loss': 1.9577, 'learning_rate': 1.5071e-05, 'epoch': 1.72} + +[INFO|2025-01-23 06:32:42] logging.py:157 >> {'loss': 2.0624, 'learning_rate': 1.4886e-05, 'epoch': 1.72} + +[INFO|2025-01-23 06:32:58] logging.py:157 >> {'loss': 1.7530, 'learning_rate': 1.4703e-05, 'epoch': 1.72} + +[INFO|2025-01-23 06:33:15] logging.py:157 >> {'loss': 1.9840, 'learning_rate': 1.4521e-05, 'epoch': 1.72} + +[INFO|2025-01-23 06:33:31] logging.py:157 >> {'loss': 1.8575, 'learning_rate': 1.4339e-05, 'epoch': 1.72} + +[INFO|2025-01-23 06:33:48] logging.py:157 >> {'loss': 1.8484, 'learning_rate': 1.4159e-05, 'epoch': 1.73} + +[INFO|2025-01-23 06:34:04] logging.py:157 >> {'loss': 1.7615, 'learning_rate': 1.3980e-05, 'epoch': 1.73} + +[INFO|2025-01-23 06:34:22] logging.py:157 >> {'loss': 1.8561, 'learning_rate': 1.3802e-05, 'epoch': 1.73} + +[INFO|2025-01-23 06:34:38] logging.py:157 >> {'loss': 2.0643, 'learning_rate': 1.3626e-05, 'epoch': 1.73} + +[INFO|2025-01-23 06:34:54] logging.py:157 >> {'loss': 1.9739, 'learning_rate': 1.3450e-05, 'epoch': 1.73} + +[INFO|2025-01-23 06:35:10] logging.py:157 >> {'loss': 1.9237, 'learning_rate': 1.3275e-05, 'epoch': 1.73} + +[INFO|2025-01-23 06:35:27] logging.py:157 >> {'loss': 1.9980, 'learning_rate': 1.3102e-05, 'epoch': 1.74} + +[INFO|2025-01-23 06:35:43] logging.py:157 >> {'loss': 1.9682, 'learning_rate': 1.2929e-05, 'epoch': 1.74} + +[INFO|2025-01-23 06:35:59] logging.py:157 >> {'loss': 1.8698, 'learning_rate': 1.2758e-05, 'epoch': 1.74} + +[INFO|2025-01-23 06:36:15] logging.py:157 >> {'loss': 1.9138, 'learning_rate': 1.2588e-05, 'epoch': 1.74} + +[INFO|2025-01-23 06:36:31] logging.py:157 >> {'loss': 1.7903, 'learning_rate': 1.2418e-05, 'epoch': 1.74} + +[INFO|2025-01-23 06:36:47] logging.py:157 >> {'loss': 1.9706, 'learning_rate': 1.2250e-05, 'epoch': 1.75} + +[INFO|2025-01-23 06:37:03] logging.py:157 >> {'loss': 1.9454, 'learning_rate': 1.2083e-05, 'epoch': 1.75} + +[INFO|2025-01-23 06:37:19] logging.py:157 >> {'loss': 1.9370, 'learning_rate': 1.1917e-05, 'epoch': 1.75} + +[INFO|2025-01-23 06:37:35] logging.py:157 >> {'loss': 1.8670, 'learning_rate': 1.1752e-05, 'epoch': 1.75} + +[INFO|2025-01-23 06:37:51] logging.py:157 >> {'loss': 2.0516, 'learning_rate': 1.1589e-05, 'epoch': 1.75} + +[INFO|2025-01-23 06:38:07] logging.py:157 >> {'loss': 1.9852, 'learning_rate': 1.1426e-05, 'epoch': 1.75} + +[INFO|2025-01-23 06:38:23] logging.py:157 >> {'loss': 2.0095, 'learning_rate': 1.1265e-05, 'epoch': 1.76} + +[INFO|2025-01-23 06:38:40] logging.py:157 >> {'loss': 1.9413, 'learning_rate': 1.1104e-05, 'epoch': 1.76} + +[INFO|2025-01-23 06:38:56] logging.py:157 >> {'loss': 1.9451, 'learning_rate': 1.0945e-05, 'epoch': 1.76} + +[INFO|2025-01-23 06:39:12] logging.py:157 >> {'loss': 1.9751, 'learning_rate': 1.0787e-05, 'epoch': 1.76} + +[INFO|2025-01-23 06:39:28] logging.py:157 >> {'loss': 2.0106, 'learning_rate': 1.0630e-05, 'epoch': 1.76} + +[INFO|2025-01-23 06:39:45] logging.py:157 >> {'loss': 1.8719, 'learning_rate': 1.0474e-05, 'epoch': 1.76} + +[INFO|2025-01-23 06:39:45] trainer.py:4117 >> +***** Running Evaluation ***** + +[INFO|2025-01-23 06:39:45] trainer.py:4119 >> Num examples = 182 + +[INFO|2025-01-23 06:39:45] trainer.py:4122 >> Batch size = 4 + +[INFO|2025-01-23 06:39:52] trainer.py:3801 >> Saving model checkpoint to saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5000 + +[INFO|2025-01-23 06:39:52] configuration_utils.py:414 >> Configuration saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5000/config.json + +[INFO|2025-01-23 06:39:52] configuration_utils.py:865 >> Configuration saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5000/generation_config.json + +[INFO|2025-01-23 06:39:55] modeling_utils.py:3035 >> Model weights saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5000/model.safetensors + +[INFO|2025-01-23 06:39:55] tokenization_utils_base.py:2646 >> tokenizer config file saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5000/tokenizer_config.json + +[INFO|2025-01-23 06:39:55] tokenization_utils_base.py:2655 >> Special tokens file saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5000/special_tokens_map.json + +[INFO|2025-01-23 06:40:12] logging.py:157 >> {'loss': 1.9646, 'learning_rate': 1.0319e-05, 'epoch': 1.77} + +[INFO|2025-01-23 06:40:28] logging.py:157 >> {'loss': 2.0326, 'learning_rate': 1.0165e-05, 'epoch': 1.77} + +[INFO|2025-01-23 06:40:43] logging.py:157 >> {'loss': 1.9593, 'learning_rate': 1.0013e-05, 'epoch': 1.77} + +[INFO|2025-01-23 06:40:59] logging.py:157 >> {'loss': 2.1090, 'learning_rate': 9.8610e-06, 'epoch': 1.77} + +[INFO|2025-01-23 06:41:14] logging.py:157 >> {'loss': 1.9195, 'learning_rate': 9.7106e-06, 'epoch': 1.77} + +[INFO|2025-01-23 06:41:30] logging.py:157 >> {'loss': 2.0177, 'learning_rate': 9.5614e-06, 'epoch': 1.78} + +[INFO|2025-01-23 06:41:45] logging.py:157 >> {'loss': 1.8478, 'learning_rate': 9.4132e-06, 'epoch': 1.78} + +[INFO|2025-01-23 06:42:01] logging.py:157 >> {'loss': 1.8930, 'learning_rate': 9.2662e-06, 'epoch': 1.78} + +[INFO|2025-01-23 06:42:16] logging.py:157 >> {'loss': 1.9549, 'learning_rate': 9.1203e-06, 'epoch': 1.78} + +[INFO|2025-01-23 06:42:31] logging.py:157 >> {'loss': 1.7915, 'learning_rate': 8.9755e-06, 'epoch': 1.78} + +[INFO|2025-01-23 06:42:47] logging.py:157 >> {'loss': 1.8485, 'learning_rate': 8.8318e-06, 'epoch': 1.78} + +[INFO|2025-01-23 06:43:03] logging.py:157 >> {'loss': 1.9637, 'learning_rate': 8.6892e-06, 'epoch': 1.79} + +[INFO|2025-01-23 06:43:18] logging.py:157 >> {'loss': 1.8866, 'learning_rate': 8.5478e-06, 'epoch': 1.79} + +[INFO|2025-01-23 06:43:33] logging.py:157 >> {'loss': 2.1032, 'learning_rate': 8.4075e-06, 'epoch': 1.79} + +[INFO|2025-01-23 06:43:49] logging.py:157 >> {'loss': 1.9345, 'learning_rate': 8.2683e-06, 'epoch': 1.79} + +[INFO|2025-01-23 06:44:04] logging.py:157 >> {'loss': 1.9074, 'learning_rate': 8.1303e-06, 'epoch': 1.79} + +[INFO|2025-01-23 06:44:20] logging.py:157 >> {'loss': 1.9735, 'learning_rate': 7.9934e-06, 'epoch': 1.79} + +[INFO|2025-01-23 06:44:35] logging.py:157 >> {'loss': 1.8525, 'learning_rate': 7.8576e-06, 'epoch': 1.80} + +[INFO|2025-01-23 06:44:51] logging.py:157 >> {'loss': 2.0201, 'learning_rate': 7.7230e-06, 'epoch': 1.80} + +[INFO|2025-01-23 06:45:08] logging.py:157 >> {'loss': 1.9816, 'learning_rate': 7.5895e-06, 'epoch': 1.80} + +[INFO|2025-01-23 06:45:23] logging.py:157 >> {'loss': 2.0992, 'learning_rate': 7.4571e-06, 'epoch': 1.80} + +[INFO|2025-01-23 06:45:37] logging.py:157 >> {'loss': 1.8921, 'learning_rate': 7.3258e-06, 'epoch': 1.80} + +[INFO|2025-01-23 06:45:52] logging.py:157 >> {'loss': 1.9042, 'learning_rate': 7.1957e-06, 'epoch': 1.81} + +[INFO|2025-01-23 06:46:07] logging.py:157 >> {'loss': 1.9033, 'learning_rate': 7.0668e-06, 'epoch': 1.81} + +[INFO|2025-01-23 06:46:22] logging.py:157 >> {'loss': 1.9908, 'learning_rate': 6.9389e-06, 'epoch': 1.81} + +[INFO|2025-01-23 06:46:37] logging.py:157 >> {'loss': 1.8681, 'learning_rate': 6.8122e-06, 'epoch': 1.81} + +[INFO|2025-01-23 06:46:52] logging.py:157 >> {'loss': 1.9336, 'learning_rate': 6.6867e-06, 'epoch': 1.81} + +[INFO|2025-01-23 06:47:07] logging.py:157 >> {'loss': 1.9743, 'learning_rate': 6.5623e-06, 'epoch': 1.81} + +[INFO|2025-01-23 06:47:22] logging.py:157 >> {'loss': 2.0854, 'learning_rate': 6.4390e-06, 'epoch': 1.82} + +[INFO|2025-01-23 06:47:36] logging.py:157 >> {'loss': 2.0476, 'learning_rate': 6.3169e-06, 'epoch': 1.82} + +[INFO|2025-01-23 06:47:51] logging.py:157 >> {'loss': 2.0668, 'learning_rate': 6.1959e-06, 'epoch': 1.82} + +[INFO|2025-01-23 06:48:06] logging.py:157 >> {'loss': 1.9850, 'learning_rate': 6.0761e-06, 'epoch': 1.82} + +[INFO|2025-01-23 06:48:21] logging.py:157 >> {'loss': 1.8655, 'learning_rate': 5.9574e-06, 'epoch': 1.82} + +[INFO|2025-01-23 06:48:36] logging.py:157 >> {'loss': 2.0488, 'learning_rate': 5.8398e-06, 'epoch': 1.82} + +[INFO|2025-01-23 06:48:51] logging.py:157 >> {'loss': 1.9829, 'learning_rate': 5.7234e-06, 'epoch': 1.83} + +[INFO|2025-01-23 06:49:06] logging.py:157 >> {'loss': 2.0452, 'learning_rate': 5.6082e-06, 'epoch': 1.83} + +[INFO|2025-01-23 06:49:21] logging.py:157 >> {'loss': 2.0024, 'learning_rate': 5.4941e-06, 'epoch': 1.83} + +[INFO|2025-01-23 06:49:36] logging.py:157 >> {'loss': 1.8852, 'learning_rate': 5.3811e-06, 'epoch': 1.83} + +[INFO|2025-01-23 06:49:51] logging.py:157 >> {'loss': 2.0005, 'learning_rate': 5.2694e-06, 'epoch': 1.83} + +[INFO|2025-01-23 06:50:07] logging.py:157 >> {'loss': 2.0418, 'learning_rate': 5.1587e-06, 'epoch': 1.84} + +[INFO|2025-01-23 06:50:21] logging.py:157 >> {'loss': 1.9405, 'learning_rate': 5.0492e-06, 'epoch': 1.84} + +[INFO|2025-01-23 06:50:35] logging.py:157 >> {'loss': 2.0745, 'learning_rate': 4.9409e-06, 'epoch': 1.84} + +[INFO|2025-01-23 06:50:50] logging.py:157 >> {'loss': 1.8338, 'learning_rate': 4.8337e-06, 'epoch': 1.84} + +[INFO|2025-01-23 06:51:04] logging.py:157 >> {'loss': 1.9601, 'learning_rate': 4.7277e-06, 'epoch': 1.84} + +[INFO|2025-01-23 06:51:18] logging.py:157 >> {'loss': 1.8938, 'learning_rate': 4.6228e-06, 'epoch': 1.84} + +[INFO|2025-01-23 06:51:33] logging.py:157 >> {'loss': 1.9992, 'learning_rate': 4.5191e-06, 'epoch': 1.85} + +[INFO|2025-01-23 06:51:47] logging.py:157 >> {'loss': 1.9458, 'learning_rate': 4.4166e-06, 'epoch': 1.85} + +[INFO|2025-01-23 06:52:01] logging.py:157 >> {'loss': 1.8073, 'learning_rate': 4.3152e-06, 'epoch': 1.85} + +[INFO|2025-01-23 06:52:15] logging.py:157 >> {'loss': 2.0060, 'learning_rate': 4.2150e-06, 'epoch': 1.85} + +[INFO|2025-01-23 06:52:30] logging.py:157 >> {'loss': 1.9002, 'learning_rate': 4.1159e-06, 'epoch': 1.85} + +[INFO|2025-01-23 06:52:44] logging.py:157 >> {'loss': 1.9321, 'learning_rate': 4.0180e-06, 'epoch': 1.85} + +[INFO|2025-01-23 06:52:58] logging.py:157 >> {'loss': 1.7990, 'learning_rate': 3.9213e-06, 'epoch': 1.86} + +[INFO|2025-01-23 06:53:13] logging.py:157 >> {'loss': 1.8247, 'learning_rate': 3.8257e-06, 'epoch': 1.86} + +[INFO|2025-01-23 06:53:27] logging.py:157 >> {'loss': 1.8936, 'learning_rate': 3.7313e-06, 'epoch': 1.86} + +[INFO|2025-01-23 06:53:41] logging.py:157 >> {'loss': 2.0703, 'learning_rate': 3.6380e-06, 'epoch': 1.86} + +[INFO|2025-01-23 06:53:56] logging.py:157 >> {'loss': 1.9064, 'learning_rate': 3.5459e-06, 'epoch': 1.86} + +[INFO|2025-01-23 06:54:10] logging.py:157 >> {'loss': 1.9654, 'learning_rate': 3.4550e-06, 'epoch': 1.87} + +[INFO|2025-01-23 06:54:24] logging.py:157 >> {'loss': 1.8933, 'learning_rate': 3.3653e-06, 'epoch': 1.87} + +[INFO|2025-01-23 06:54:38] logging.py:157 >> {'loss': 1.8224, 'learning_rate': 3.2767e-06, 'epoch': 1.87} + +[INFO|2025-01-23 06:54:54] logging.py:157 >> {'loss': 1.7724, 'learning_rate': 3.1893e-06, 'epoch': 1.87} + +[INFO|2025-01-23 06:55:08] logging.py:157 >> {'loss': 1.9049, 'learning_rate': 3.1030e-06, 'epoch': 1.87} + +[INFO|2025-01-23 06:55:21] logging.py:157 >> {'loss': 1.9944, 'learning_rate': 3.0180e-06, 'epoch': 1.87} + +[INFO|2025-01-23 06:55:35] logging.py:157 >> {'loss': 2.0412, 'learning_rate': 2.9341e-06, 'epoch': 1.88} + +[INFO|2025-01-23 06:55:49] logging.py:157 >> {'loss': 2.0387, 'learning_rate': 2.8513e-06, 'epoch': 1.88} + +[INFO|2025-01-23 06:56:03] logging.py:157 >> {'loss': 1.9938, 'learning_rate': 2.7698e-06, 'epoch': 1.88} + +[INFO|2025-01-23 06:56:16] logging.py:157 >> {'loss': 1.9957, 'learning_rate': 2.6894e-06, 'epoch': 1.88} + +[INFO|2025-01-23 06:56:30] logging.py:157 >> {'loss': 2.1756, 'learning_rate': 2.6102e-06, 'epoch': 1.88} + +[INFO|2025-01-23 06:56:44] logging.py:157 >> {'loss': 1.9178, 'learning_rate': 2.5321e-06, 'epoch': 1.88} + +[INFO|2025-01-23 06:56:58] logging.py:157 >> {'loss': 1.8600, 'learning_rate': 2.4553e-06, 'epoch': 1.89} + +[INFO|2025-01-23 06:57:12] logging.py:157 >> {'loss': 2.1114, 'learning_rate': 2.3796e-06, 'epoch': 1.89} + +[INFO|2025-01-23 06:57:26] logging.py:157 >> {'loss': 1.8879, 'learning_rate': 2.3050e-06, 'epoch': 1.89} + +[INFO|2025-01-23 06:57:39] logging.py:157 >> {'loss': 2.0636, 'learning_rate': 2.2317e-06, 'epoch': 1.89} + +[INFO|2025-01-23 06:57:53] logging.py:157 >> {'loss': 1.8624, 'learning_rate': 2.1595e-06, 'epoch': 1.89} + +[INFO|2025-01-23 06:58:07] logging.py:157 >> {'loss': 2.0062, 'learning_rate': 2.0886e-06, 'epoch': 1.90} + +[INFO|2025-01-23 06:58:21] logging.py:157 >> {'loss': 1.8553, 'learning_rate': 2.0188e-06, 'epoch': 1.90} + +[INFO|2025-01-23 06:58:35] logging.py:157 >> {'loss': 2.0964, 'learning_rate': 1.9501e-06, 'epoch': 1.90} + +[INFO|2025-01-23 06:58:49] logging.py:157 >> {'loss': 2.0080, 'learning_rate': 1.8827e-06, 'epoch': 1.90} + +[INFO|2025-01-23 06:59:03] logging.py:157 >> {'loss': 2.0443, 'learning_rate': 1.8164e-06, 'epoch': 1.90} + +[INFO|2025-01-23 06:59:16] logging.py:157 >> {'loss': 1.8145, 'learning_rate': 1.7513e-06, 'epoch': 1.90} + +[INFO|2025-01-23 06:59:31] logging.py:157 >> {'loss': 1.9071, 'learning_rate': 1.6874e-06, 'epoch': 1.91} + +[INFO|2025-01-23 06:59:45] logging.py:157 >> {'loss': 1.9990, 'learning_rate': 1.6247e-06, 'epoch': 1.91} + +[INFO|2025-01-23 06:59:58] logging.py:157 >> {'loss': 2.0992, 'learning_rate': 1.5631e-06, 'epoch': 1.91} + +[INFO|2025-01-23 07:00:11] logging.py:157 >> {'loss': 1.9252, 'learning_rate': 1.5028e-06, 'epoch': 1.91} + +[INFO|2025-01-23 07:00:25] logging.py:157 >> {'loss': 2.0734, 'learning_rate': 1.4436e-06, 'epoch': 1.91} + +[INFO|2025-01-23 07:00:38] logging.py:157 >> {'loss': 1.9187, 'learning_rate': 1.3856e-06, 'epoch': 1.91} + +[INFO|2025-01-23 07:00:51] logging.py:157 >> {'loss': 1.8917, 'learning_rate': 1.3288e-06, 'epoch': 1.92} + +[INFO|2025-01-23 07:01:04] logging.py:157 >> {'loss': 2.1228, 'learning_rate': 1.2732e-06, 'epoch': 1.92} + +[INFO|2025-01-23 07:01:18] logging.py:157 >> {'loss': 2.0537, 'learning_rate': 1.2187e-06, 'epoch': 1.92} + +[INFO|2025-01-23 07:01:31] logging.py:157 >> {'loss': 1.9200, 'learning_rate': 1.1655e-06, 'epoch': 1.92} + +[INFO|2025-01-23 07:01:44] logging.py:157 >> {'loss': 2.1021, 'learning_rate': 1.1134e-06, 'epoch': 1.92} + +[INFO|2025-01-23 07:01:57] logging.py:157 >> {'loss': 1.8913, 'learning_rate': 1.0625e-06, 'epoch': 1.93} + +[INFO|2025-01-23 07:02:11] logging.py:157 >> {'loss': 1.8375, 'learning_rate': 1.0128e-06, 'epoch': 1.93} + +[INFO|2025-01-23 07:02:24] logging.py:157 >> {'loss': 1.8955, 'learning_rate': 9.6427e-07, 'epoch': 1.93} + +[INFO|2025-01-23 07:02:38] logging.py:157 >> {'loss': 1.9922, 'learning_rate': 9.1695e-07, 'epoch': 1.93} + +[INFO|2025-01-23 07:02:51] logging.py:157 >> {'loss': 1.9150, 'learning_rate': 8.7080e-07, 'epoch': 1.93} + +[INFO|2025-01-23 07:03:04] logging.py:157 >> {'loss': 1.9352, 'learning_rate': 8.2585e-07, 'epoch': 1.93} + +[INFO|2025-01-23 07:03:17] logging.py:157 >> {'loss': 2.0887, 'learning_rate': 7.8209e-07, 'epoch': 1.94} + +[INFO|2025-01-23 07:03:31] logging.py:157 >> {'loss': 2.1482, 'learning_rate': 7.3951e-07, 'epoch': 1.94} + +[INFO|2025-01-23 07:03:44] logging.py:157 >> {'loss': 1.9899, 'learning_rate': 6.9812e-07, 'epoch': 1.94} + +[INFO|2025-01-23 07:03:58] logging.py:157 >> {'loss': 2.0606, 'learning_rate': 6.5792e-07, 'epoch': 1.94} + +[INFO|2025-01-23 07:04:11] logging.py:157 >> {'loss': 2.0384, 'learning_rate': 6.1891e-07, 'epoch': 1.94} + +[INFO|2025-01-23 07:04:24] logging.py:157 >> {'loss': 1.8498, 'learning_rate': 5.8109e-07, 'epoch': 1.94} + +[INFO|2025-01-23 07:04:37] logging.py:157 >> {'loss': 1.8390, 'learning_rate': 5.4446e-07, 'epoch': 1.95} + +[INFO|2025-01-23 07:04:50] logging.py:157 >> {'loss': 1.8862, 'learning_rate': 5.0902e-07, 'epoch': 1.95} + +[INFO|2025-01-23 07:05:03] logging.py:157 >> {'loss': 2.1260, 'learning_rate': 4.7477e-07, 'epoch': 1.95} + +[INFO|2025-01-23 07:05:15] logging.py:157 >> {'loss': 1.9376, 'learning_rate': 4.4171e-07, 'epoch': 1.95} + +[INFO|2025-01-23 07:05:28] logging.py:157 >> {'loss': 1.9896, 'learning_rate': 4.0984e-07, 'epoch': 1.95} + +[INFO|2025-01-23 07:05:41] logging.py:157 >> {'loss': 1.8946, 'learning_rate': 3.7917e-07, 'epoch': 1.96} + +[INFO|2025-01-23 07:05:54] logging.py:157 >> {'loss': 2.0307, 'learning_rate': 3.4968e-07, 'epoch': 1.96} + +[INFO|2025-01-23 07:06:06] logging.py:157 >> {'loss': 2.0076, 'learning_rate': 3.2139e-07, 'epoch': 1.96} + +[INFO|2025-01-23 07:06:19] logging.py:157 >> {'loss': 2.0952, 'learning_rate': 2.9429e-07, 'epoch': 1.96} + +[INFO|2025-01-23 07:06:32] logging.py:157 >> {'loss': 2.1741, 'learning_rate': 2.6838e-07, 'epoch': 1.96} + +[INFO|2025-01-23 07:06:45] logging.py:157 >> {'loss': 1.8720, 'learning_rate': 2.4367e-07, 'epoch': 1.96} + +[INFO|2025-01-23 07:06:57] logging.py:157 >> {'loss': 1.8811, 'learning_rate': 2.2015e-07, 'epoch': 1.97} + +[INFO|2025-01-23 07:07:10] logging.py:157 >> {'loss': 1.9227, 'learning_rate': 1.9782e-07, 'epoch': 1.97} + +[INFO|2025-01-23 07:07:22] logging.py:157 >> {'loss': 1.8001, 'learning_rate': 1.7668e-07, 'epoch': 1.97} + +[INFO|2025-01-23 07:07:35] logging.py:157 >> {'loss': 1.9557, 'learning_rate': 1.5674e-07, 'epoch': 1.97} + +[INFO|2025-01-23 07:07:48] logging.py:157 >> {'loss': 1.9966, 'learning_rate': 1.3799e-07, 'epoch': 1.97} + +[INFO|2025-01-23 07:08:01] logging.py:157 >> {'loss': 2.0720, 'learning_rate': 1.2043e-07, 'epoch': 1.97} + +[INFO|2025-01-23 07:08:18] logging.py:157 >> {'loss': 2.0203, 'learning_rate': 1.0407e-07, 'epoch': 1.98} + +[INFO|2025-01-23 07:08:45] logging.py:157 >> {'loss': 2.0843, 'learning_rate': 8.8898e-08, 'epoch': 1.98} + +[INFO|2025-01-23 07:09:13] logging.py:157 >> {'loss': 2.0887, 'learning_rate': 7.4923e-08, 'epoch': 1.98} + +[INFO|2025-01-23 07:09:40] logging.py:157 >> {'loss': 2.0708, 'learning_rate': 6.2142e-08, 'epoch': 1.98} + +[INFO|2025-01-23 07:10:08] logging.py:157 >> {'loss': 2.0345, 'learning_rate': 5.0555e-08, 'epoch': 1.98} + +[INFO|2025-01-23 07:10:35] logging.py:157 >> {'loss': 1.9149, 'learning_rate': 4.0163e-08, 'epoch': 1.99} + +[INFO|2025-01-23 07:11:02] logging.py:157 >> {'loss': 1.9564, 'learning_rate': 3.0965e-08, 'epoch': 1.99} + +[INFO|2025-01-23 07:11:30] logging.py:157 >> {'loss': 2.0475, 'learning_rate': 2.2961e-08, 'epoch': 1.99} + +[INFO|2025-01-23 07:11:57] logging.py:157 >> {'loss': 1.8153, 'learning_rate': 1.6152e-08, 'epoch': 1.99} + +[INFO|2025-01-23 07:12:25] logging.py:157 >> {'loss': 1.8298, 'learning_rate': 1.0537e-08, 'epoch': 1.99} + +[INFO|2025-01-23 07:12:52] logging.py:157 >> {'loss': 1.9217, 'learning_rate': 6.1166e-09, 'epoch': 1.99} + +[INFO|2025-01-23 07:13:19] logging.py:157 >> {'loss': 1.9689, 'learning_rate': 2.8911e-09, 'epoch': 2.00} + +[INFO|2025-01-23 07:13:46] logging.py:157 >> {'loss': 1.7275, 'learning_rate': 8.6015e-10, 'epoch': 2.00} + +[INFO|2025-01-23 07:14:14] logging.py:157 >> {'loss': 2.0460, 'learning_rate': 2.3893e-11, 'epoch': 2.00} + +[INFO|2025-01-23 07:14:20] trainer.py:3801 >> Saving model checkpoint to saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5666 + +[INFO|2025-01-23 07:14:20] configuration_utils.py:414 >> Configuration saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5666/config.json + +[INFO|2025-01-23 07:14:20] configuration_utils.py:865 >> Configuration saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5666/generation_config.json + +[INFO|2025-01-23 07:14:23] modeling_utils.py:3035 >> Model weights saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5666/model.safetensors + +[INFO|2025-01-23 07:14:23] tokenization_utils_base.py:2646 >> tokenizer config file saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5666/tokenizer_config.json + +[INFO|2025-01-23 07:14:23] tokenization_utils_base.py:2655 >> Special tokens file saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/checkpoint-5666/special_tokens_map.json + +[INFO|2025-01-23 07:14:24] trainer.py:2584 >> + +Training completed. Do not forget to share your model on huggingface.co/models =) + + + +[INFO|2025-01-23 07:14:25] trainer.py:3801 >> Saving model checkpoint to saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56 + +[INFO|2025-01-23 07:14:25] configuration_utils.py:414 >> Configuration saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/config.json + +[INFO|2025-01-23 07:14:25] configuration_utils.py:865 >> Configuration saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/generation_config.json + +[INFO|2025-01-23 07:14:27] modeling_utils.py:3035 >> Model weights saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/model.safetensors + +[INFO|2025-01-23 07:14:27] tokenization_utils_base.py:2646 >> tokenizer config file saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/tokenizer_config.json + +[INFO|2025-01-23 07:14:27] tokenization_utils_base.py:2655 >> Special tokens file saved in saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56/special_tokens_map.json + +[WARNING|2025-01-23 07:14:28] logging.py:162 >> No metric eval_accuracy to plot. + +[INFO|2025-01-23 07:14:28] trainer.py:4117 >> +***** Running Evaluation ***** + +[INFO|2025-01-23 07:14:28] trainer.py:4119 >> Num examples = 182 + +[INFO|2025-01-23 07:14:28] trainer.py:4122 >> Batch size = 4 + +[INFO|2025-01-23 07:14:33] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields: +{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}} +