2023-10-25 17:25:12,830 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:25:12,831 Model: "SequenceTagger( (embeddings): TransformerWordEmbeddings( (model): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(64001, 768) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) ) (locked_dropout): LockedDropout(p=0.5) (linear): Linear(in_features=768, out_features=13, bias=True) (loss_function): CrossEntropyLoss() )" 2023-10-25 17:25:12,831 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:25:12,832 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences - NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator 2023-10-25 17:25:12,832 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:25:12,832 Train: 14465 sentences 2023-10-25 17:25:12,832 (train_with_dev=False, train_with_test=False) 2023-10-25 17:25:12,832 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:25:12,832 Training Params: 2023-10-25 17:25:12,832 - learning_rate: "5e-05" 2023-10-25 17:25:12,832 - mini_batch_size: "8" 2023-10-25 17:25:12,832 - max_epochs: "10" 2023-10-25 17:25:12,832 - shuffle: "True" 2023-10-25 17:25:12,832 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:25:12,832 Plugins: 2023-10-25 17:25:12,832 - TensorboardLogger 2023-10-25 17:25:12,832 - LinearScheduler | warmup_fraction: '0.1' 2023-10-25 17:25:12,832 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:25:12,832 Final evaluation on model from best epoch (best-model.pt) 2023-10-25 17:25:12,832 - metric: "('micro avg', 'f1-score')" 2023-10-25 17:25:12,832 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:25:12,832 Computation: 2023-10-25 17:25:12,832 - compute on device: cuda:0 2023-10-25 17:25:12,832 - embedding storage: none 2023-10-25 17:25:12,832 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:25:12,832 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-5" 2023-10-25 17:25:12,832 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:25:12,832 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:25:12,832 Logging anything other than scalars to TensorBoard is currently not supported. 2023-10-25 17:25:28,883 epoch 1 - iter 180/1809 - loss 1.08867170 - time (sec): 16.05 - samples/sec: 2407.65 - lr: 0.000005 - momentum: 0.000000 2023-10-25 17:25:44,521 epoch 1 - iter 360/1809 - loss 0.65004327 - time (sec): 31.69 - samples/sec: 2416.98 - lr: 0.000010 - momentum: 0.000000 2023-10-25 17:26:00,346 epoch 1 - iter 540/1809 - loss 0.48753569 - time (sec): 47.51 - samples/sec: 2407.41 - lr: 0.000015 - momentum: 0.000000 2023-10-25 17:26:15,761 epoch 1 - iter 720/1809 - loss 0.39766425 - time (sec): 62.93 - samples/sec: 2421.92 - lr: 0.000020 - momentum: 0.000000 2023-10-25 17:26:31,459 epoch 1 - iter 900/1809 - loss 0.34169738 - time (sec): 78.63 - samples/sec: 2408.69 - lr: 0.000025 - momentum: 0.000000 2023-10-25 17:26:47,241 epoch 1 - iter 1080/1809 - loss 0.30530543 - time (sec): 94.41 - samples/sec: 2400.17 - lr: 0.000030 - momentum: 0.000000 2023-10-25 17:27:03,504 epoch 1 - iter 1260/1809 - loss 0.27539034 - time (sec): 110.67 - samples/sec: 2397.75 - lr: 0.000035 - momentum: 0.000000 2023-10-25 17:27:19,006 epoch 1 - iter 1440/1809 - loss 0.25487372 - time (sec): 126.17 - samples/sec: 2394.96 - lr: 0.000040 - momentum: 0.000000 2023-10-25 17:27:35,047 epoch 1 - iter 1620/1809 - loss 0.23807391 - time (sec): 142.21 - samples/sec: 2389.53 - lr: 0.000045 - momentum: 0.000000 2023-10-25 17:27:51,142 epoch 1 - iter 1800/1809 - loss 0.22438103 - time (sec): 158.31 - samples/sec: 2389.71 - lr: 0.000050 - momentum: 0.000000 2023-10-25 17:27:51,843 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:27:51,843 EPOCH 1 done: loss 0.2237 - lr: 0.000050 2023-10-25 17:27:56,354 DEV : loss 0.11131972819566727 - f1-score (micro avg) 0.5998 2023-10-25 17:27:56,377 saving best model 2023-10-25 17:27:56,928 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:28:12,813 epoch 2 - iter 180/1809 - loss 0.08088650 - time (sec): 15.88 - samples/sec: 2358.09 - lr: 0.000049 - momentum: 0.000000 2023-10-25 17:28:28,596 epoch 2 - iter 360/1809 - loss 0.08543964 - time (sec): 31.67 - samples/sec: 2357.98 - lr: 0.000049 - momentum: 0.000000 2023-10-25 17:28:44,265 epoch 2 - iter 540/1809 - loss 0.08544855 - time (sec): 47.34 - samples/sec: 2378.99 - lr: 0.000048 - momentum: 0.000000 2023-10-25 17:28:59,984 epoch 2 - iter 720/1809 - loss 0.08766095 - time (sec): 63.06 - samples/sec: 2384.45 - lr: 0.000048 - momentum: 0.000000 2023-10-25 17:29:16,164 epoch 2 - iter 900/1809 - loss 0.08687484 - time (sec): 79.23 - samples/sec: 2399.84 - lr: 0.000047 - momentum: 0.000000 2023-10-25 17:29:31,913 epoch 2 - iter 1080/1809 - loss 0.08737062 - time (sec): 94.98 - samples/sec: 2401.58 - lr: 0.000047 - momentum: 0.000000 2023-10-25 17:29:47,549 epoch 2 - iter 1260/1809 - loss 0.08575598 - time (sec): 110.62 - samples/sec: 2399.66 - lr: 0.000046 - momentum: 0.000000 2023-10-25 17:30:03,603 epoch 2 - iter 1440/1809 - loss 0.08550450 - time (sec): 126.67 - samples/sec: 2395.66 - lr: 0.000046 - momentum: 0.000000 2023-10-25 17:30:19,491 epoch 2 - iter 1620/1809 - loss 0.08602069 - time (sec): 142.56 - samples/sec: 2392.68 - lr: 0.000045 - momentum: 0.000000 2023-10-25 17:30:35,371 epoch 2 - iter 1800/1809 - loss 0.08584415 - time (sec): 158.44 - samples/sec: 2389.42 - lr: 0.000044 - momentum: 0.000000 2023-10-25 17:30:36,066 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:30:36,066 EPOCH 2 done: loss 0.0859 - lr: 0.000044 2023-10-25 17:30:41,320 DEV : loss 0.14367857575416565 - f1-score (micro avg) 0.621 2023-10-25 17:30:41,343 saving best model 2023-10-25 17:30:42,046 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:30:57,826 epoch 3 - iter 180/1809 - loss 0.06105910 - time (sec): 15.78 - samples/sec: 2444.97 - lr: 0.000044 - momentum: 0.000000 2023-10-25 17:31:14,553 epoch 3 - iter 360/1809 - loss 0.06291567 - time (sec): 32.51 - samples/sec: 2454.20 - lr: 0.000043 - momentum: 0.000000 2023-10-25 17:31:30,567 epoch 3 - iter 540/1809 - loss 0.05939069 - time (sec): 48.52 - samples/sec: 2434.77 - lr: 0.000043 - momentum: 0.000000 2023-10-25 17:31:46,426 epoch 3 - iter 720/1809 - loss 0.06189798 - time (sec): 64.38 - samples/sec: 2420.88 - lr: 0.000042 - momentum: 0.000000 2023-10-25 17:32:02,072 epoch 3 - iter 900/1809 - loss 0.06226376 - time (sec): 80.02 - samples/sec: 2408.22 - lr: 0.000042 - momentum: 0.000000 2023-10-25 17:32:17,628 epoch 3 - iter 1080/1809 - loss 0.06229431 - time (sec): 95.58 - samples/sec: 2403.38 - lr: 0.000041 - momentum: 0.000000 2023-10-25 17:32:33,396 epoch 3 - iter 1260/1809 - loss 0.06309156 - time (sec): 111.35 - samples/sec: 2401.39 - lr: 0.000041 - momentum: 0.000000 2023-10-25 17:32:49,252 epoch 3 - iter 1440/1809 - loss 0.06334011 - time (sec): 127.20 - samples/sec: 2389.92 - lr: 0.000040 - momentum: 0.000000 2023-10-25 17:33:05,170 epoch 3 - iter 1620/1809 - loss 0.06299930 - time (sec): 143.12 - samples/sec: 2384.04 - lr: 0.000039 - momentum: 0.000000 2023-10-25 17:33:20,601 epoch 3 - iter 1800/1809 - loss 0.06268902 - time (sec): 158.55 - samples/sec: 2385.99 - lr: 0.000039 - momentum: 0.000000 2023-10-25 17:33:21,442 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:33:21,442 EPOCH 3 done: loss 0.0626 - lr: 0.000039 2023-10-25 17:33:26,209 DEV : loss 0.16750071942806244 - f1-score (micro avg) 0.6271 2023-10-25 17:33:26,232 saving best model 2023-10-25 17:33:27,448 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:33:43,590 epoch 4 - iter 180/1809 - loss 0.04454260 - time (sec): 16.14 - samples/sec: 2370.51 - lr: 0.000038 - momentum: 0.000000 2023-10-25 17:33:59,289 epoch 4 - iter 360/1809 - loss 0.04376843 - time (sec): 31.84 - samples/sec: 2384.31 - lr: 0.000038 - momentum: 0.000000 2023-10-25 17:34:15,317 epoch 4 - iter 540/1809 - loss 0.04208892 - time (sec): 47.87 - samples/sec: 2401.85 - lr: 0.000037 - momentum: 0.000000 2023-10-25 17:34:31,148 epoch 4 - iter 720/1809 - loss 0.04219370 - time (sec): 63.70 - samples/sec: 2412.85 - lr: 0.000037 - momentum: 0.000000 2023-10-25 17:34:46,949 epoch 4 - iter 900/1809 - loss 0.04300031 - time (sec): 79.50 - samples/sec: 2412.83 - lr: 0.000036 - momentum: 0.000000 2023-10-25 17:35:02,757 epoch 4 - iter 1080/1809 - loss 0.04422396 - time (sec): 95.31 - samples/sec: 2397.75 - lr: 0.000036 - momentum: 0.000000 2023-10-25 17:35:18,399 epoch 4 - iter 1260/1809 - loss 0.04338815 - time (sec): 110.95 - samples/sec: 2391.32 - lr: 0.000035 - momentum: 0.000000 2023-10-25 17:35:34,237 epoch 4 - iter 1440/1809 - loss 0.04365612 - time (sec): 126.79 - samples/sec: 2388.41 - lr: 0.000034 - momentum: 0.000000 2023-10-25 17:35:49,818 epoch 4 - iter 1620/1809 - loss 0.04436943 - time (sec): 142.37 - samples/sec: 2386.02 - lr: 0.000034 - momentum: 0.000000 2023-10-25 17:36:05,768 epoch 4 - iter 1800/1809 - loss 0.04471372 - time (sec): 158.32 - samples/sec: 2388.70 - lr: 0.000033 - momentum: 0.000000 2023-10-25 17:36:06,615 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:36:06,615 EPOCH 4 done: loss 0.0448 - lr: 0.000033 2023-10-25 17:36:11,377 DEV : loss 0.21365126967430115 - f1-score (micro avg) 0.6119 2023-10-25 17:36:11,400 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:36:27,342 epoch 5 - iter 180/1809 - loss 0.02797797 - time (sec): 15.94 - samples/sec: 2446.04 - lr: 0.000033 - momentum: 0.000000 2023-10-25 17:36:43,491 epoch 5 - iter 360/1809 - loss 0.02861473 - time (sec): 32.09 - samples/sec: 2410.53 - lr: 0.000032 - momentum: 0.000000 2023-10-25 17:36:59,285 epoch 5 - iter 540/1809 - loss 0.02970454 - time (sec): 47.88 - samples/sec: 2406.82 - lr: 0.000032 - momentum: 0.000000 2023-10-25 17:37:14,595 epoch 5 - iter 720/1809 - loss 0.03017154 - time (sec): 63.19 - samples/sec: 2383.00 - lr: 0.000031 - momentum: 0.000000 2023-10-25 17:37:30,823 epoch 5 - iter 900/1809 - loss 0.02993842 - time (sec): 79.42 - samples/sec: 2390.07 - lr: 0.000031 - momentum: 0.000000 2023-10-25 17:37:47,071 epoch 5 - iter 1080/1809 - loss 0.03036229 - time (sec): 95.67 - samples/sec: 2389.07 - lr: 0.000030 - momentum: 0.000000 2023-10-25 17:38:02,683 epoch 5 - iter 1260/1809 - loss 0.03032083 - time (sec): 111.28 - samples/sec: 2380.89 - lr: 0.000029 - momentum: 0.000000 2023-10-25 17:38:18,242 epoch 5 - iter 1440/1809 - loss 0.03017265 - time (sec): 126.84 - samples/sec: 2381.62 - lr: 0.000029 - momentum: 0.000000 2023-10-25 17:38:34,319 epoch 5 - iter 1620/1809 - loss 0.03017571 - time (sec): 142.92 - samples/sec: 2368.45 - lr: 0.000028 - momentum: 0.000000 2023-10-25 17:38:50,611 epoch 5 - iter 1800/1809 - loss 0.02982099 - time (sec): 159.21 - samples/sec: 2376.76 - lr: 0.000028 - momentum: 0.000000 2023-10-25 17:38:51,342 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:38:51,342 EPOCH 5 done: loss 0.0299 - lr: 0.000028 2023-10-25 17:38:56,115 DEV : loss 0.2786865830421448 - f1-score (micro avg) 0.6472 2023-10-25 17:38:56,139 saving best model 2023-10-25 17:38:56,860 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:39:12,505 epoch 6 - iter 180/1809 - loss 0.01698871 - time (sec): 15.64 - samples/sec: 2353.61 - lr: 0.000027 - momentum: 0.000000 2023-10-25 17:39:28,137 epoch 6 - iter 360/1809 - loss 0.01883337 - time (sec): 31.28 - samples/sec: 2382.83 - lr: 0.000027 - momentum: 0.000000 2023-10-25 17:39:44,244 epoch 6 - iter 540/1809 - loss 0.02197274 - time (sec): 47.38 - samples/sec: 2391.47 - lr: 0.000026 - momentum: 0.000000 2023-10-25 17:39:59,902 epoch 6 - iter 720/1809 - loss 0.02290012 - time (sec): 63.04 - samples/sec: 2395.80 - lr: 0.000026 - momentum: 0.000000 2023-10-25 17:40:16,174 epoch 6 - iter 900/1809 - loss 0.02254507 - time (sec): 79.31 - samples/sec: 2378.20 - lr: 0.000025 - momentum: 0.000000 2023-10-25 17:40:31,791 epoch 6 - iter 1080/1809 - loss 0.02233300 - time (sec): 94.93 - samples/sec: 2375.99 - lr: 0.000024 - momentum: 0.000000 2023-10-25 17:40:47,376 epoch 6 - iter 1260/1809 - loss 0.02223982 - time (sec): 110.51 - samples/sec: 2371.40 - lr: 0.000024 - momentum: 0.000000 2023-10-25 17:41:03,399 epoch 6 - iter 1440/1809 - loss 0.02230768 - time (sec): 126.54 - samples/sec: 2379.75 - lr: 0.000023 - momentum: 0.000000 2023-10-25 17:41:19,391 epoch 6 - iter 1620/1809 - loss 0.02169893 - time (sec): 142.53 - samples/sec: 2382.79 - lr: 0.000023 - momentum: 0.000000 2023-10-25 17:41:35,589 epoch 6 - iter 1800/1809 - loss 0.02156452 - time (sec): 158.73 - samples/sec: 2382.82 - lr: 0.000022 - momentum: 0.000000 2023-10-25 17:41:36,334 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:41:36,334 EPOCH 6 done: loss 0.0216 - lr: 0.000022 2023-10-25 17:41:41,614 DEV : loss 0.36510854959487915 - f1-score (micro avg) 0.6298 2023-10-25 17:41:41,637 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:41:57,608 epoch 7 - iter 180/1809 - loss 0.02022287 - time (sec): 15.97 - samples/sec: 2400.72 - lr: 0.000022 - momentum: 0.000000 2023-10-25 17:42:13,475 epoch 7 - iter 360/1809 - loss 0.02033749 - time (sec): 31.84 - samples/sec: 2400.51 - lr: 0.000021 - momentum: 0.000000 2023-10-25 17:42:29,613 epoch 7 - iter 540/1809 - loss 0.02001745 - time (sec): 47.98 - samples/sec: 2407.10 - lr: 0.000021 - momentum: 0.000000 2023-10-25 17:42:45,451 epoch 7 - iter 720/1809 - loss 0.01848045 - time (sec): 63.81 - samples/sec: 2415.31 - lr: 0.000020 - momentum: 0.000000 2023-10-25 17:43:00,702 epoch 7 - iter 900/1809 - loss 0.01690827 - time (sec): 79.06 - samples/sec: 2403.90 - lr: 0.000019 - momentum: 0.000000 2023-10-25 17:43:16,683 epoch 7 - iter 1080/1809 - loss 0.01639987 - time (sec): 95.05 - samples/sec: 2384.14 - lr: 0.000019 - momentum: 0.000000 2023-10-25 17:43:32,462 epoch 7 - iter 1260/1809 - loss 0.01694441 - time (sec): 110.82 - samples/sec: 2375.78 - lr: 0.000018 - momentum: 0.000000 2023-10-25 17:43:48,583 epoch 7 - iter 1440/1809 - loss 0.01604881 - time (sec): 126.95 - samples/sec: 2373.46 - lr: 0.000018 - momentum: 0.000000 2023-10-25 17:44:04,373 epoch 7 - iter 1620/1809 - loss 0.01596925 - time (sec): 142.74 - samples/sec: 2380.20 - lr: 0.000017 - momentum: 0.000000 2023-10-25 17:44:20,312 epoch 7 - iter 1800/1809 - loss 0.01571324 - time (sec): 158.67 - samples/sec: 2384.33 - lr: 0.000017 - momentum: 0.000000 2023-10-25 17:44:21,026 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:44:21,026 EPOCH 7 done: loss 0.0157 - lr: 0.000017 2023-10-25 17:44:26,345 DEV : loss 0.3604075312614441 - f1-score (micro avg) 0.6238 2023-10-25 17:44:26,368 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:44:42,092 epoch 8 - iter 180/1809 - loss 0.00997516 - time (sec): 15.72 - samples/sec: 2339.27 - lr: 0.000016 - momentum: 0.000000 2023-10-25 17:44:57,918 epoch 8 - iter 360/1809 - loss 0.01105999 - time (sec): 31.55 - samples/sec: 2358.40 - lr: 0.000016 - momentum: 0.000000 2023-10-25 17:45:13,718 epoch 8 - iter 540/1809 - loss 0.01121835 - time (sec): 47.35 - samples/sec: 2355.37 - lr: 0.000015 - momentum: 0.000000 2023-10-25 17:45:29,687 epoch 8 - iter 720/1809 - loss 0.01185460 - time (sec): 63.32 - samples/sec: 2359.55 - lr: 0.000014 - momentum: 0.000000 2023-10-25 17:45:45,573 epoch 8 - iter 900/1809 - loss 0.01093216 - time (sec): 79.20 - samples/sec: 2365.07 - lr: 0.000014 - momentum: 0.000000 2023-10-25 17:46:01,155 epoch 8 - iter 1080/1809 - loss 0.01026454 - time (sec): 94.79 - samples/sec: 2366.17 - lr: 0.000013 - momentum: 0.000000 2023-10-25 17:46:16,917 epoch 8 - iter 1260/1809 - loss 0.00974729 - time (sec): 110.55 - samples/sec: 2370.48 - lr: 0.000013 - momentum: 0.000000 2023-10-25 17:46:32,383 epoch 8 - iter 1440/1809 - loss 0.00942034 - time (sec): 126.01 - samples/sec: 2370.26 - lr: 0.000012 - momentum: 0.000000 2023-10-25 17:46:48,687 epoch 8 - iter 1620/1809 - loss 0.01013871 - time (sec): 142.32 - samples/sec: 2373.81 - lr: 0.000012 - momentum: 0.000000 2023-10-25 17:47:05,389 epoch 8 - iter 1800/1809 - loss 0.01001323 - time (sec): 159.02 - samples/sec: 2378.22 - lr: 0.000011 - momentum: 0.000000 2023-10-25 17:47:06,147 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:47:06,147 EPOCH 8 done: loss 0.0101 - lr: 0.000011 2023-10-25 17:47:11,459 DEV : loss 0.3776438534259796 - f1-score (micro avg) 0.649 2023-10-25 17:47:11,482 saving best model 2023-10-25 17:47:12,162 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:47:28,015 epoch 9 - iter 180/1809 - loss 0.00806189 - time (sec): 15.85 - samples/sec: 2396.31 - lr: 0.000011 - momentum: 0.000000 2023-10-25 17:47:43,744 epoch 9 - iter 360/1809 - loss 0.00786294 - time (sec): 31.58 - samples/sec: 2414.19 - lr: 0.000010 - momentum: 0.000000 2023-10-25 17:47:59,740 epoch 9 - iter 540/1809 - loss 0.00779681 - time (sec): 47.58 - samples/sec: 2404.36 - lr: 0.000009 - momentum: 0.000000 2023-10-25 17:48:15,611 epoch 9 - iter 720/1809 - loss 0.00824623 - time (sec): 63.45 - samples/sec: 2391.40 - lr: 0.000009 - momentum: 0.000000 2023-10-25 17:48:31,872 epoch 9 - iter 900/1809 - loss 0.00749187 - time (sec): 79.71 - samples/sec: 2379.55 - lr: 0.000008 - momentum: 0.000000 2023-10-25 17:48:47,559 epoch 9 - iter 1080/1809 - loss 0.00729148 - time (sec): 95.40 - samples/sec: 2381.02 - lr: 0.000008 - momentum: 0.000000 2023-10-25 17:49:03,417 epoch 9 - iter 1260/1809 - loss 0.00706207 - time (sec): 111.25 - samples/sec: 2374.16 - lr: 0.000007 - momentum: 0.000000 2023-10-25 17:49:19,068 epoch 9 - iter 1440/1809 - loss 0.00684409 - time (sec): 126.90 - samples/sec: 2373.51 - lr: 0.000007 - momentum: 0.000000 2023-10-25 17:49:34,617 epoch 9 - iter 1620/1809 - loss 0.00687108 - time (sec): 142.45 - samples/sec: 2378.28 - lr: 0.000006 - momentum: 0.000000 2023-10-25 17:49:50,728 epoch 9 - iter 1800/1809 - loss 0.00658565 - time (sec): 158.56 - samples/sec: 2381.77 - lr: 0.000006 - momentum: 0.000000 2023-10-25 17:49:51,725 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:49:51,725 EPOCH 9 done: loss 0.0067 - lr: 0.000006 2023-10-25 17:49:56,514 DEV : loss 0.3927135467529297 - f1-score (micro avg) 0.6399 2023-10-25 17:49:56,538 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:50:12,657 epoch 10 - iter 180/1809 - loss 0.00422793 - time (sec): 16.12 - samples/sec: 2350.66 - lr: 0.000005 - momentum: 0.000000 2023-10-25 17:50:28,618 epoch 10 - iter 360/1809 - loss 0.00339541 - time (sec): 32.08 - samples/sec: 2385.59 - lr: 0.000004 - momentum: 0.000000 2023-10-25 17:50:44,793 epoch 10 - iter 540/1809 - loss 0.00334334 - time (sec): 48.25 - samples/sec: 2373.68 - lr: 0.000004 - momentum: 0.000000 2023-10-25 17:51:00,624 epoch 10 - iter 720/1809 - loss 0.00349222 - time (sec): 64.09 - samples/sec: 2368.92 - lr: 0.000003 - momentum: 0.000000 2023-10-25 17:51:16,681 epoch 10 - iter 900/1809 - loss 0.00336080 - time (sec): 80.14 - samples/sec: 2376.57 - lr: 0.000003 - momentum: 0.000000 2023-10-25 17:51:32,495 epoch 10 - iter 1080/1809 - loss 0.00320549 - time (sec): 95.96 - samples/sec: 2379.97 - lr: 0.000002 - momentum: 0.000000 2023-10-25 17:51:48,151 epoch 10 - iter 1260/1809 - loss 0.00327382 - time (sec): 111.61 - samples/sec: 2376.20 - lr: 0.000002 - momentum: 0.000000 2023-10-25 17:52:04,075 epoch 10 - iter 1440/1809 - loss 0.00335080 - time (sec): 127.54 - samples/sec: 2378.32 - lr: 0.000001 - momentum: 0.000000 2023-10-25 17:52:20,274 epoch 10 - iter 1620/1809 - loss 0.00332977 - time (sec): 143.74 - samples/sec: 2380.65 - lr: 0.000001 - momentum: 0.000000 2023-10-25 17:52:35,875 epoch 10 - iter 1800/1809 - loss 0.00380800 - time (sec): 159.34 - samples/sec: 2375.32 - lr: 0.000000 - momentum: 0.000000 2023-10-25 17:52:36,617 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:52:36,617 EPOCH 10 done: loss 0.0038 - lr: 0.000000 2023-10-25 17:52:41,399 DEV : loss 0.4173244535923004 - f1-score (micro avg) 0.6413 2023-10-25 17:52:41,977 ---------------------------------------------------------------------------------------------------- 2023-10-25 17:52:41,978 Loading model from best epoch ... 2023-10-25 17:52:43,747 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org 2023-10-25 17:52:50,008 Results: - F-score (micro) 0.6409 - F-score (macro) 0.4973 - Accuracy 0.4862 By class: precision recall f1-score support loc 0.6403 0.7259 0.6804 591 pers 0.5885 0.7451 0.6576 357 org 0.1961 0.1266 0.1538 79 micro avg 0.6010 0.6865 0.6409 1027 macro avg 0.4750 0.5325 0.4973 1027 weighted avg 0.5881 0.6865 0.6320 1027 2023-10-25 17:52:50,008 ----------------------------------------------------------------------------------------------------