2023-10-25 03:21:07,036 ----------------------------------------------------------------------------------------------------
2023-10-25 03:21:07,036 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-25 03:21:07,037 ----------------------------------------------------------------------------------------------------
2023-10-25 03:21:07,037 MultiCorpus: 5777 train + 722 dev + 723 test sentences
 - NER_ICDAR_EUROPEANA Corpus: 5777 train + 722 dev + 723 test sentences - /home/ubuntu/.flair/datasets/ner_icdar_europeana/nl
2023-10-25 03:21:07,037 ----------------------------------------------------------------------------------------------------
2023-10-25 03:21:07,037 Train: 5777 sentences
2023-10-25 03:21:07,037 (train_with_dev=False, train_with_test=False)
2023-10-25 03:21:07,037 ----------------------------------------------------------------------------------------------------
2023-10-25 03:21:07,037 Training Params:
2023-10-25 03:21:07,037  - learning_rate: "5e-05"
2023-10-25 03:21:07,037  - mini_batch_size: "8"
2023-10-25 03:21:07,037  - max_epochs: "10"
2023-10-25 03:21:07,037  - shuffle: "True"
2023-10-25 03:21:07,037 ----------------------------------------------------------------------------------------------------
2023-10-25 03:21:07,037 Plugins:
2023-10-25 03:21:07,037  - TensorboardLogger
2023-10-25 03:21:07,037  - LinearScheduler | warmup_fraction: '0.1'
2023-10-25 03:21:07,037 ----------------------------------------------------------------------------------------------------
2023-10-25 03:21:07,037 Final evaluation on model from best epoch (best-model.pt)
2023-10-25 03:21:07,037  - metric: "('micro avg', 'f1-score')"
2023-10-25 03:21:07,037 ----------------------------------------------------------------------------------------------------
2023-10-25 03:21:07,037 Computation:
2023-10-25 03:21:07,037  - compute on device: cuda:0
2023-10-25 03:21:07,037  - embedding storage: none
2023-10-25 03:21:07,037 ----------------------------------------------------------------------------------------------------
2023-10-25 03:21:07,037 Model training base path: "hmbench-icdar/nl-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-5"
2023-10-25 03:21:07,037 ----------------------------------------------------------------------------------------------------
2023-10-25 03:21:07,038 ----------------------------------------------------------------------------------------------------
2023-10-25 03:21:07,038 Logging anything other than scalars to TensorBoard is currently not supported.
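The training script itself is not part of this log. A minimal Flair sketch that plausibly reproduces the configuration recorded above is shown next; the dataset loader, model name, hyperparameters, and base path are taken from the log entries, while everything else follows Flair defaults and is an assumption.

# Hypothetical reconstruction of the run recorded in this log (not the original script).
from flair.datasets import NER_ICDAR_EUROPEANA
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# 5777 train / 722 dev / 723 test sentences, cached under ~/.flair/datasets
corpus = NER_ICDAR_EUROPEANA(language="nl")
label_dict = corpus.make_label_dictionary(label_type="ner")

embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
    layers="-1",               # "layers-1" in the base path: last layer only
    subtoken_pooling="first",  # "poolingfirst" in the base path
    fine_tune=True,
)

tagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,  # "crfFalse" in the base path
    use_rnn=False,  # matches the plain Linear(768 -> 13) head in the model dump
)

trainer = ModelTrainer(tagger, corpus)

# fine_tune() applies a linear LR schedule with warmup_fraction 0.1 by default,
# matching the LinearScheduler plugin listed above.
trainer.fine_tune(
    "hmbench-icdar/nl-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-5",
    learning_rate=5e-05,
    mini_batch_size=8,
    max_epochs=10,
)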
2023-10-25 03:21:15,767 epoch 1 - iter 72/723 - loss 1.68766467 - time (sec): 8.73 - samples/sec: 1919.80 - lr: 0.000005 - momentum: 0.000000
2023-10-25 03:21:23,724 epoch 1 - iter 144/723 - loss 0.99031859 - time (sec): 16.69 - samples/sec: 1975.15 - lr: 0.000010 - momentum: 0.000000
2023-10-25 03:21:32,094 epoch 1 - iter 216/723 - loss 0.71808221 - time (sec): 25.06 - samples/sec: 2003.67 - lr: 0.000015 - momentum: 0.000000
2023-10-25 03:21:40,978 epoch 1 - iter 288/723 - loss 0.57309784 - time (sec): 33.94 - samples/sec: 2017.52 - lr: 0.000020 - momentum: 0.000000
2023-10-25 03:21:49,174 epoch 1 - iter 360/723 - loss 0.49129012 - time (sec): 42.14 - samples/sec: 2020.68 - lr: 0.000025 - momentum: 0.000000
2023-10-25 03:21:57,936 epoch 1 - iter 432/723 - loss 0.43005413 - time (sec): 50.90 - samples/sec: 2038.58 - lr: 0.000030 - momentum: 0.000000
2023-10-25 03:22:06,610 epoch 1 - iter 504/723 - loss 0.38879308 - time (sec): 59.57 - samples/sec: 2042.37 - lr: 0.000035 - momentum: 0.000000
2023-10-25 03:22:15,073 epoch 1 - iter 576/723 - loss 0.35796702 - time (sec): 68.03 - samples/sec: 2041.20 - lr: 0.000040 - momentum: 0.000000
2023-10-25 03:22:24,126 epoch 1 - iter 648/723 - loss 0.33078723 - time (sec): 77.09 - samples/sec: 2042.41 - lr: 0.000045 - momentum: 0.000000
2023-10-25 03:22:33,094 epoch 1 - iter 720/723 - loss 0.31081390 - time (sec): 86.06 - samples/sec: 2040.23 - lr: 0.000050 - momentum: 0.000000
2023-10-25 03:22:33,419 ----------------------------------------------------------------------------------------------------
2023-10-25 03:22:33,420 EPOCH 1 done: loss 0.3102 - lr: 0.000050
2023-10-25 03:22:36,444 DEV : loss 0.10029962658882141 - f1-score (micro avg) 0.714
2023-10-25 03:22:36,455 saving best model
2023-10-25 03:22:36,922 ----------------------------------------------------------------------------------------------------
2023-10-25 03:22:45,659 epoch 2 - iter 72/723 - loss 0.12090345 - time (sec): 8.74 - samples/sec: 1958.50 - lr: 0.000049 - momentum: 0.000000
2023-10-25 03:22:53,891 epoch 2 - iter 144/723 - loss 0.11010184 - time (sec): 16.97 - samples/sec: 2023.18 - lr: 0.000049 - momentum: 0.000000
2023-10-25 03:23:02,247 epoch 2 - iter 216/723 - loss 0.10565874 - time (sec): 25.32 - samples/sec: 2041.71 - lr: 0.000048 - momentum: 0.000000
2023-10-25 03:23:10,874 epoch 2 - iter 288/723 - loss 0.10233445 - time (sec): 33.95 - samples/sec: 2038.73 - lr: 0.000048 - momentum: 0.000000
2023-10-25 03:23:19,521 epoch 2 - iter 360/723 - loss 0.09988170 - time (sec): 42.60 - samples/sec: 2031.02 - lr: 0.000047 - momentum: 0.000000
2023-10-25 03:23:28,031 epoch 2 - iter 432/723 - loss 0.09714710 - time (sec): 51.11 - samples/sec: 2031.30 - lr: 0.000047 - momentum: 0.000000
2023-10-25 03:23:36,548 epoch 2 - iter 504/723 - loss 0.09721388 - time (sec): 59.62 - samples/sec: 2030.47 - lr: 0.000046 - momentum: 0.000000
2023-10-25 03:23:44,959 epoch 2 - iter 576/723 - loss 0.09638112 - time (sec): 68.04 - samples/sec: 2035.31 - lr: 0.000046 - momentum: 0.000000
2023-10-25 03:23:54,130 epoch 2 - iter 648/723 - loss 0.09712113 - time (sec): 77.21 - samples/sec: 2032.68 - lr: 0.000045 - momentum: 0.000000
2023-10-25 03:24:03,482 epoch 2 - iter 720/723 - loss 0.09496368 - time (sec): 86.56 - samples/sec: 2029.14 - lr: 0.000044 - momentum: 0.000000
2023-10-25 03:24:03,845 ----------------------------------------------------------------------------------------------------
2023-10-25 03:24:03,845 EPOCH 2 done: loss 0.0951 - lr: 0.000044
2023-10-25 03:24:07,270 DEV : loss 0.08907141536474228 - f1-score (micro avg) 0.7752
2023-10-25 03:24:07,282 saving best model
2023-10-25 03:24:07,866 ----------------------------------------------------------------------------------------------------
2023-10-25 03:24:16,521 epoch 3 - iter 72/723 - loss 0.05697145 - time (sec): 8.65 - samples/sec: 1979.04 - lr: 0.000044 - momentum: 0.000000
2023-10-25 03:24:25,015 epoch 3 - iter 144/723 - loss 0.06365511 - time (sec): 17.15 - samples/sec: 2016.66 - lr: 0.000043 - momentum: 0.000000
2023-10-25 03:24:34,291 epoch 3 - iter 216/723 - loss 0.06006955 - time (sec): 26.42 - samples/sec: 2028.12 - lr: 0.000043 - momentum: 0.000000
2023-10-25 03:24:42,967 epoch 3 - iter 288/723 - loss 0.05978662 - time (sec): 35.10 - samples/sec: 2039.51 - lr: 0.000042 - momentum: 0.000000
2023-10-25 03:24:51,561 epoch 3 - iter 360/723 - loss 0.06048031 - time (sec): 43.69 - samples/sec: 2052.41 - lr: 0.000042 - momentum: 0.000000
2023-10-25 03:24:59,876 epoch 3 - iter 432/723 - loss 0.06265153 - time (sec): 52.01 - samples/sec: 2048.22 - lr: 0.000041 - momentum: 0.000000
2023-10-25 03:25:08,066 epoch 3 - iter 504/723 - loss 0.06140349 - time (sec): 60.20 - samples/sec: 2048.19 - lr: 0.000041 - momentum: 0.000000
2023-10-25 03:25:16,386 epoch 3 - iter 576/723 - loss 0.06149397 - time (sec): 68.52 - samples/sec: 2052.30 - lr: 0.000040 - momentum: 0.000000
2023-10-25 03:25:24,635 epoch 3 - iter 648/723 - loss 0.06212052 - time (sec): 76.77 - samples/sec: 2055.82 - lr: 0.000039 - momentum: 0.000000
2023-10-25 03:25:33,470 epoch 3 - iter 720/723 - loss 0.06192748 - time (sec): 85.60 - samples/sec: 2049.87 - lr: 0.000039 - momentum: 0.000000
2023-10-25 03:25:33,882 ----------------------------------------------------------------------------------------------------
2023-10-25 03:25:33,883 EPOCH 3 done: loss 0.0618 - lr: 0.000039
2023-10-25 03:25:37,591 DEV : loss 0.08058985322713852 - f1-score (micro avg) 0.8289
2023-10-25 03:25:37,603 saving best model
2023-10-25 03:25:38,165 ----------------------------------------------------------------------------------------------------
2023-10-25 03:25:46,598 epoch 4 - iter 72/723 - loss 0.02797752 - time (sec): 8.43 - samples/sec: 1994.47 - lr: 0.000038 - momentum: 0.000000
2023-10-25 03:25:55,309 epoch 4 - iter 144/723 - loss 0.03614415 - time (sec): 17.14 - samples/sec: 2030.17 - lr: 0.000038 - momentum: 0.000000
2023-10-25 03:26:04,448 epoch 4 - iter 216/723 - loss 0.04145400 - time (sec): 26.28 - samples/sec: 2028.97 - lr: 0.000037 - momentum: 0.000000
2023-10-25 03:26:13,754 epoch 4 - iter 288/723 - loss 0.04451492 - time (sec): 35.59 - samples/sec: 1996.38 - lr: 0.000037 - momentum: 0.000000
2023-10-25 03:26:22,376 epoch 4 - iter 360/723 - loss 0.04603311 - time (sec): 44.21 - samples/sec: 2007.61 - lr: 0.000036 - momentum: 0.000000
2023-10-25 03:26:30,692 epoch 4 - iter 432/723 - loss 0.04476476 - time (sec): 52.53 - samples/sec: 2006.73 - lr: 0.000036 - momentum: 0.000000
2023-10-25 03:26:39,615 epoch 4 - iter 504/723 - loss 0.04287179 - time (sec): 61.45 - samples/sec: 2021.07 - lr: 0.000035 - momentum: 0.000000
2023-10-25 03:26:48,061 epoch 4 - iter 576/723 - loss 0.04179019 - time (sec): 69.89 - samples/sec: 2023.56 - lr: 0.000034 - momentum: 0.000000
2023-10-25 03:26:55,925 epoch 4 - iter 648/723 - loss 0.04178069 - time (sec): 77.76 - samples/sec: 2029.52 - lr: 0.000034 - momentum: 0.000000
2023-10-25 03:27:04,356 epoch 4 - iter 720/723 - loss 0.04186433 - time (sec): 86.19 - samples/sec: 2040.59 - lr: 0.000033 - momentum: 0.000000
2023-10-25 03:27:04,610 ----------------------------------------------------------------------------------------------------
2023-10-25 03:27:04,610 EPOCH 4 done: loss 0.0419 - lr: 0.000033
2023-10-25 03:27:08,040 DEV : loss 0.10547219961881638 - f1-score (micro avg) 0.8136
2023-10-25 03:27:08,051 ----------------------------------------------------------------------------------------------------
2023-10-25 03:27:16,658 epoch 5 - iter 72/723 - loss 0.03237456 - time (sec): 8.61 - samples/sec: 2057.62 - lr: 0.000033 - momentum: 0.000000
2023-10-25 03:27:25,600 epoch 5 - iter 144/723 - loss 0.03452148 - time (sec): 17.55 - samples/sec: 2018.99 - lr: 0.000032 - momentum: 0.000000
2023-10-25 03:27:33,985 epoch 5 - iter 216/723 - loss 0.03349648 - time (sec): 25.93 - samples/sec: 2037.16 - lr: 0.000032 - momentum: 0.000000
2023-10-25 03:27:42,171 epoch 5 - iter 288/723 - loss 0.03304803 - time (sec): 34.12 - samples/sec: 2027.60 - lr: 0.000031 - momentum: 0.000000
2023-10-25 03:27:50,434 epoch 5 - iter 360/723 - loss 0.03166254 - time (sec): 42.38 - samples/sec: 2034.44 - lr: 0.000031 - momentum: 0.000000
2023-10-25 03:27:59,929 epoch 5 - iter 432/723 - loss 0.02996305 - time (sec): 51.88 - samples/sec: 2016.91 - lr: 0.000030 - momentum: 0.000000
2023-10-25 03:28:08,267 epoch 5 - iter 504/723 - loss 0.03108088 - time (sec): 60.21 - samples/sec: 2017.39 - lr: 0.000029 - momentum: 0.000000
2023-10-25 03:28:17,112 epoch 5 - iter 576/723 - loss 0.03441065 - time (sec): 69.06 - samples/sec: 2021.48 - lr: 0.000029 - momentum: 0.000000
2023-10-25 03:28:26,189 epoch 5 - iter 648/723 - loss 0.03309049 - time (sec): 78.14 - samples/sec: 2025.80 - lr: 0.000028 - momentum: 0.000000
2023-10-25 03:28:34,845 epoch 5 - iter 720/723 - loss 0.03290640 - time (sec): 86.79 - samples/sec: 2025.49 - lr: 0.000028 - momentum: 0.000000
2023-10-25 03:28:35,078 ----------------------------------------------------------------------------------------------------
2023-10-25 03:28:35,078 EPOCH 5 done: loss 0.0329 - lr: 0.000028
2023-10-25 03:28:38,506 DEV : loss 0.10743129253387451 - f1-score (micro avg) 0.8259
2023-10-25 03:28:38,518 ----------------------------------------------------------------------------------------------------
2023-10-25 03:28:47,146 epoch 6 - iter 72/723 - loss 0.01950180 - time (sec): 8.63 - samples/sec: 2088.78 - lr: 0.000027 - momentum: 0.000000
2023-10-25 03:28:54,838 epoch 6 - iter 144/723 - loss 0.02431246 - time (sec): 16.32 - samples/sec: 2088.92 - lr: 0.000027 - momentum: 0.000000
2023-10-25 03:29:03,341 epoch 6 - iter 216/723 - loss 0.02554660 - time (sec): 24.82 - samples/sec: 2097.06 - lr: 0.000026 - momentum: 0.000000
2023-10-25 03:29:12,960 epoch 6 - iter 288/723 - loss 0.02489555 - time (sec): 34.44 - samples/sec: 2067.63 - lr: 0.000026 - momentum: 0.000000
2023-10-25 03:29:21,612 epoch 6 - iter 360/723 - loss 0.02519088 - time (sec): 43.09 - samples/sec: 2065.88 - lr: 0.000025 - momentum: 0.000000
2023-10-25 03:29:30,520 epoch 6 - iter 432/723 - loss 0.02472334 - time (sec): 52.00 - samples/sec: 2055.69 - lr: 0.000024 - momentum: 0.000000
2023-10-25 03:29:39,539 epoch 6 - iter 504/723 - loss 0.02448090 - time (sec): 61.02 - samples/sec: 2041.79 - lr: 0.000024 - momentum: 0.000000
2023-10-25 03:29:47,981 epoch 6 - iter 576/723 - loss 0.02586243 - time (sec): 69.46 - samples/sec: 2041.93 - lr: 0.000023 - momentum: 0.000000
2023-10-25 03:29:55,961 epoch 6 - iter 648/723 - loss 0.02606734 - time (sec): 77.44 - samples/sec: 2044.90 - lr: 0.000023 - momentum: 0.000000
2023-10-25 03:30:04,986 epoch 6 - iter 720/723 - loss 0.02562324 - time (sec): 86.47 - samples/sec: 2032.59 - lr: 0.000022 - momentum: 0.000000
2023-10-25 03:30:05,288 ----------------------------------------------------------------------------------------------------
2023-10-25 03:30:05,288 EPOCH 6 done: loss 0.0257 - lr: 0.000022
2023-10-25 03:30:09,014 DEV : loss 0.14865967631340027 - f1-score (micro avg) 0.8278
2023-10-25 03:30:09,026 ----------------------------------------------------------------------------------------------------
2023-10-25 03:30:17,847 epoch 7 - iter 72/723 - loss 0.01296776 - time (sec): 8.82 - samples/sec: 2066.52 - lr: 0.000022 - momentum: 0.000000
2023-10-25 03:30:26,902 epoch 7 - iter 144/723 - loss 0.01515821 - time (sec): 17.87 - samples/sec: 2044.78 - lr: 0.000021 - momentum: 0.000000
2023-10-25 03:30:35,247 epoch 7 - iter 216/723 - loss 0.01484702 - time (sec): 26.22 - samples/sec: 2036.37 - lr: 0.000021 - momentum: 0.000000
2023-10-25 03:30:44,046 epoch 7 - iter 288/723 - loss 0.01579797 - time (sec): 35.02 - samples/sec: 2029.75 - lr: 0.000020 - momentum: 0.000000
2023-10-25 03:30:52,631 epoch 7 - iter 360/723 - loss 0.01628311 - time (sec): 43.60 - samples/sec: 2030.46 - lr: 0.000019 - momentum: 0.000000
2023-10-25 03:31:01,153 epoch 7 - iter 432/723 - loss 0.01613784 - time (sec): 52.13 - samples/sec: 2033.21 - lr: 0.000019 - momentum: 0.000000
2023-10-25 03:31:09,837 epoch 7 - iter 504/723 - loss 0.01677860 - time (sec): 60.81 - samples/sec: 2024.48 - lr: 0.000018 - momentum: 0.000000
2023-10-25 03:31:18,928 epoch 7 - iter 576/723 - loss 0.01663651 - time (sec): 69.90 - samples/sec: 2025.46 - lr: 0.000018 - momentum: 0.000000
2023-10-25 03:31:27,367 epoch 7 - iter 648/723 - loss 0.01651969 - time (sec): 78.34 - samples/sec: 2028.03 - lr: 0.000017 - momentum: 0.000000
2023-10-25 03:31:35,458 epoch 7 - iter 720/723 - loss 0.01681181 - time (sec): 86.43 - samples/sec: 2030.51 - lr: 0.000017 - momentum: 0.000000
2023-10-25 03:31:35,920 ----------------------------------------------------------------------------------------------------
2023-10-25 03:31:35,920 EPOCH 7 done: loss 0.0168 - lr: 0.000017
2023-10-25 03:31:39,682 DEV : loss 0.15617409348487854 - f1-score (micro avg) 0.8255
2023-10-25 03:31:39,694 ----------------------------------------------------------------------------------------------------
2023-10-25 03:31:48,026 epoch 8 - iter 72/723 - loss 0.01544069 - time (sec): 8.33 - samples/sec: 2031.15 - lr: 0.000016 - momentum: 0.000000
2023-10-25 03:31:56,092 epoch 8 - iter 144/723 - loss 0.01094618 - time (sec): 16.40 - samples/sec: 2069.54 - lr: 0.000016 - momentum: 0.000000
2023-10-25 03:32:04,253 epoch 8 - iter 216/723 - loss 0.01140394 - time (sec): 24.56 - samples/sec: 2055.24 - lr: 0.000015 - momentum: 0.000000
2023-10-25 03:32:12,929 epoch 8 - iter 288/723 - loss 0.01145852 - time (sec): 33.23 - samples/sec: 2030.80 - lr: 0.000014 - momentum: 0.000000
2023-10-25 03:32:21,548 epoch 8 - iter 360/723 - loss 0.01128244 - time (sec): 41.85 - samples/sec: 2028.57 - lr: 0.000014 - momentum: 0.000000
2023-10-25 03:32:30,258 epoch 8 - iter 432/723 - loss 0.01132223 - time (sec): 50.56 - samples/sec: 2029.34 - lr: 0.000013 - momentum: 0.000000
2023-10-25 03:32:39,319 epoch 8 - iter 504/723 - loss 0.01152129 - time (sec): 59.62 - samples/sec: 2012.44 - lr: 0.000013 - momentum: 0.000000
2023-10-25 03:32:47,863 epoch 8 - iter 576/723 - loss 0.01234181 - time (sec): 68.17 - samples/sec: 2017.87 - lr: 0.000012 - momentum: 0.000000
2023-10-25 03:32:56,530 epoch 8 - iter 648/723 - loss 0.01271235 - time (sec): 76.83 - samples/sec: 2033.67 - lr: 0.000012 - momentum: 0.000000
2023-10-25 03:33:06,090 epoch 8 - iter 720/723 - loss 0.01201023 - time (sec): 86.40 - samples/sec: 2032.91 - lr: 0.000011 - momentum: 0.000000
2023-10-25 03:33:06,338 ----------------------------------------------------------------------------------------------------
2023-10-25 03:33:06,338 EPOCH 8 done: loss 0.0120 - lr: 0.000011
2023-10-25 03:33:09,773 DEV : loss 0.17202885448932648 - f1-score (micro avg) 0.8299
2023-10-25 03:33:09,785 saving best model
2023-10-25 03:33:10,374 ----------------------------------------------------------------------------------------------------
2023-10-25 03:33:19,969 epoch 9 - iter 72/723 - loss 0.01007116 - time (sec): 9.59 - samples/sec: 1891.90 - lr: 0.000011 - momentum: 0.000000
2023-10-25 03:33:28,571 epoch 9 - iter 144/723 - loss 0.01044210 - time (sec): 18.20 - samples/sec: 1982.50 - lr: 0.000010 - momentum: 0.000000
2023-10-25 03:33:37,499 epoch 9 - iter 216/723 - loss 0.01026889 - time (sec): 27.12 - samples/sec: 2014.27 - lr: 0.000009 - momentum: 0.000000
2023-10-25 03:33:46,186 epoch 9 - iter 288/723 - loss 0.00973665 - time (sec): 35.81 - samples/sec: 2022.23 - lr: 0.000009 - momentum: 0.000000
2023-10-25 03:33:54,191 epoch 9 - iter 360/723 - loss 0.00955647 - time (sec): 43.82 - samples/sec: 2027.79 - lr: 0.000008 - momentum: 0.000000
2023-10-25 03:34:02,428 epoch 9 - iter 432/723 - loss 0.00922189 - time (sec): 52.05 - samples/sec: 2024.17 - lr: 0.000008 - momentum: 0.000000
2023-10-25 03:34:11,130 epoch 9 - iter 504/723 - loss 0.00851347 - time (sec): 60.76 - samples/sec: 2033.68 - lr: 0.000007 - momentum: 0.000000
2023-10-25 03:34:19,732 epoch 9 - iter 576/723 - loss 0.00799039 - time (sec): 69.36 - samples/sec: 2034.77 - lr: 0.000007 - momentum: 0.000000
2023-10-25 03:34:28,576 epoch 9 - iter 648/723 - loss 0.00810368 - time (sec): 78.20 - samples/sec: 2031.04 - lr: 0.000006 - momentum: 0.000000
2023-10-25 03:34:36,912 epoch 9 - iter 720/723 - loss 0.00785735 - time (sec): 86.54 - samples/sec: 2031.17 - lr: 0.000006 - momentum: 0.000000
2023-10-25 03:34:37,188 ----------------------------------------------------------------------------------------------------
2023-10-25 03:34:37,188 EPOCH 9 done: loss 0.0078 - lr: 0.000006
2023-10-25 03:34:40,919 DEV : loss 0.17869263887405396 - f1-score (micro avg) 0.8368
2023-10-25 03:34:40,931 saving best model
2023-10-25 03:34:41,550 ----------------------------------------------------------------------------------------------------
2023-10-25 03:34:50,569 epoch 10 - iter 72/723 - loss 0.00494847 - time (sec): 9.02 - samples/sec: 1973.49 - lr: 0.000005 - momentum: 0.000000
2023-10-25 03:34:59,482 epoch 10 - iter 144/723 - loss 0.00333403 - time (sec): 17.93 - samples/sec: 1997.15 - lr: 0.000004 - momentum: 0.000000
2023-10-25 03:35:07,959 epoch 10 - iter 216/723 - loss 0.00471172 - time (sec): 26.41 - samples/sec: 1988.70 - lr: 0.000004 - momentum: 0.000000
2023-10-25 03:35:16,347 epoch 10 - iter 288/723 - loss 0.00424818 - time (sec): 34.80 - samples/sec: 1996.50 - lr: 0.000003 - momentum: 0.000000
2023-10-25 03:35:25,001 epoch 10 - iter 360/723 - loss 0.00450474 - time (sec): 43.45 - samples/sec: 1992.80 - lr: 0.000003 - momentum: 0.000000
2023-10-25 03:35:33,557 epoch 10 - iter 432/723 - loss 0.00435067 - time (sec): 52.01 - samples/sec: 2004.72 - lr: 0.000002 - momentum: 0.000000
2023-10-25 03:35:42,304 epoch 10 - iter 504/723 - loss 0.00425906 - time (sec): 60.75 - samples/sec: 1998.60 - lr: 0.000002 - momentum: 0.000000
2023-10-25 03:35:51,059 epoch 10 - iter 576/723 - loss 0.00465530 - time (sec): 69.51 - samples/sec: 1988.71 - lr: 0.000001 - momentum: 0.000000
2023-10-25 03:36:00,064 epoch 10 - iter 648/723 - loss 0.00486187 - time (sec): 78.51 - samples/sec: 1994.27 - lr: 0.000001 - momentum: 0.000000
2023-10-25 03:36:09,002 epoch 10 - iter 720/723 - loss 0.00532142 - time (sec): 87.45 - samples/sec: 2007.26 - lr: 0.000000 - momentum: 0.000000
2023-10-25 03:36:09,280 ----------------------------------------------------------------------------------------------------
2023-10-25 03:36:09,281 EPOCH 10 done: loss 0.0053 - lr: 0.000000
2023-10-25 03:36:12,707 DEV : loss 0.18416795134544373 - f1-score (micro avg) 0.834
2023-10-25 03:36:13,192 ----------------------------------------------------------------------------------------------------
2023-10-25 03:36:13,193 Loading model from best epoch ...
2023-10-25 03:36:15,267 SequenceTagger predicts: Dictionary with 13 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-ORG, B-ORG, E-ORG, I-ORG
2023-10-25 03:36:18,515
Results:
- F-score (micro) 0.8085
- F-score (macro) 0.7291
- Accuracy 0.69

By class:
              precision    recall  f1-score   support

         PER     0.8450    0.8029    0.8234       482
         LOC     0.8802    0.7860    0.8304       458
         ORG     0.6275    0.4638    0.5333        69

   micro avg     0.8486    0.7721    0.8085      1009
   macro avg     0.7842    0.6842    0.7291      1009
weighted avg     0.8461    0.7721    0.8068      1009

2023-10-25 03:36:18,515 ----------------------------------------------------------------------------------------------------
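The final model predicts BIOES-encoded PER/LOC/ORG spans, and the log names the saved checkpoint (best-model.pt under the base path above). A short, assumed usage sketch for that checkpoint follows; the example sentence is illustrative, not from the corpus.

# Hypothetical usage of the saved checkpoint; the path is derived from the base path in the log.
from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load(
    "hmbench-icdar/nl-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-5/best-model.pt"
)

# Illustrative Dutch sentence (not taken from the ICDAR-Europeana corpus).
sentence = Sentence("Vincent van Gogh werd geboren te Zundert .")
tagger.predict(sentence)

# Print each predicted entity span with its label and confidence.
for span in sentence.get_spans("ner"):
    label = span.get_label("ner")
    print(f"{span.text}\t{label.value}\t{label.score:.2f}")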