2023-03-09 09:25:13,819 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:25:13,823 Model: "MultitaskModel( (Task_0): TextClassifier( (embeddings): TransformerDocumentEmbeddings( (model): XLMRobertaModel( (embeddings): RobertaEmbeddings( (word_embeddings): Embedding(250003, 1024) (position_embeddings): Embedding(514, 1024, padding_idx=1) (token_type_embeddings): Embedding(1, 1024) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): RobertaEncoder( (layer): ModuleList( (0): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (12): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (13): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (14): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (15): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (16): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (17): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (18): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (19): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (20): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (21): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (22): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (23): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): RobertaPooler( (dense): Linear(in_features=1024, out_features=1024, bias=True) (activation): Tanh() ) ) ) (decoder): Linear(in_features=2048, out_features=2, bias=True) (dropout): Dropout(p=0.0, inplace=False) (locked_dropout): LockedDropout(p=0.0) (word_dropout): WordDropout(p=0.0) (loss_function): CrossEntropyLoss() ) (Task_1): TextClassifier( (embeddings): TransformerDocumentEmbeddings( (model): XLMRobertaModel( (embeddings): RobertaEmbeddings( (word_embeddings): Embedding(250003, 1024) (position_embeddings): Embedding(514, 1024, padding_idx=1) (token_type_embeddings): Embedding(1, 1024) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): RobertaEncoder( (layer): ModuleList( (0): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (12): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (13): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (14): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (15): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (16): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (17): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (18): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (19): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (20): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (21): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (22): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (23): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): RobertaPooler( (dense): Linear(in_features=1024, out_features=1024, bias=True) (activation): Tanh() ) ) ) (decoder): Linear(in_features=2048, out_features=2, bias=True) (dropout): Dropout(p=0.0, inplace=False) (locked_dropout): LockedDropout(p=0.0) (word_dropout): WordDropout(p=0.0) (loss_function): CrossEntropyLoss() ) (Task_2): TextClassifier( (embeddings): TransformerDocumentEmbeddings( (model): XLMRobertaModel( (embeddings): RobertaEmbeddings( (word_embeddings): Embedding(250003, 1024) (position_embeddings): Embedding(514, 1024, padding_idx=1) (token_type_embeddings): Embedding(1, 1024) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): RobertaEncoder( (layer): ModuleList( (0): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (12): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (13): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (14): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (15): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (16): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (17): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (18): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (19): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (20): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (21): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (22): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (23): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=1024, out_features=1024, bias=True) (key): Linear(in_features=1024, out_features=1024, bias=True) (value): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=1024, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=1024, out_features=4096, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=4096, out_features=1024, bias=True) (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): RobertaPooler( (dense): Linear(in_features=1024, out_features=1024, bias=True) (activation): Tanh() ) ) ) (decoder): Linear(in_features=2048, out_features=3, bias=True) (dropout): Dropout(p=0.0, inplace=False) (locked_dropout): LockedDropout(p=0.0) (word_dropout): WordDropout(p=0.0) (loss_function): CrossEntropyLoss() ) )" 2023-03-09 09:25:13,833 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:25:13,833 Corpus: "MultiCorpus: 20547 train + 2282 dev + 6159 test sentences - CSVClassificationCorpus Corpus: 12690 train + 1410 dev + 3887 test sentences - csv_corpus - CSVClassificationCorpus Corpus: 4176 train + 464 dev + 1422 test sentences - csv_corpus - CSVClassificationCorpus Corpus: 3681 train + 408 dev + 850 test sentences - csv_corpus" 2023-03-09 09:25:13,833 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:25:13,833 Parameters: 2023-03-09 09:25:13,833 - learning_rate: "0.000005" 2023-03-09 09:25:13,833 - mini_batch_size: "4" 2023-03-09 09:25:13,833 - patience: "3" 2023-03-09 09:25:13,833 - anneal_factor: "0.5" 2023-03-09 09:25:13,833 - max_epochs: "6" 2023-03-09 09:25:13,833 - shuffle: "True" 2023-03-09 09:25:13,833 - train_with_dev: "False" 2023-03-09 09:25:13,834 - batch_growth_annealing: "False" 2023-03-09 09:25:13,834 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:25:13,834 Model training base path: "models/olid_multitask" 2023-03-09 09:25:13,834 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:25:13,834 Device: cuda:0 2023-03-09 09:25:13,834 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:25:13,834 Embeddings storage mode: none 2023-03-09 09:25:13,834 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:26:22,861 epoch 1 - iter 513/5137 - loss 0.89968743 - time (sec): 69.03 - samples/sec: 29.73 - lr: 0.000001 2023-03-09 09:27:31,973 epoch 1 - iter 1026/5137 - loss 0.80337151 - time (sec): 138.14 - samples/sec: 29.71 - lr: 0.000002 2023-03-09 09:28:41,817 epoch 1 - iter 1539/5137 - loss 0.76202047 - time (sec): 207.98 - samples/sec: 29.60 - lr: 0.000002 2023-03-09 09:29:51,692 epoch 1 - iter 2052/5137 - loss 0.76442478 - time (sec): 277.86 - samples/sec: 29.54 - lr: 0.000003 2023-03-09 09:31:01,198 epoch 1 - iter 2565/5137 - loss 0.77306474 - time (sec): 347.36 - samples/sec: 29.54 - lr: 0.000004 2023-03-09 09:32:11,580 epoch 1 - iter 3078/5137 - loss 0.76748787 - time (sec): 417.75 - samples/sec: 29.47 - lr: 0.000005 2023-03-09 09:33:21,511 epoch 1 - iter 3591/5137 - loss 0.94002634 - time (sec): 487.68 - samples/sec: 29.45 - lr: 0.000005 2023-03-09 09:34:31,164 epoch 1 - iter 4104/5137 - loss 1.04345380 - time (sec): 557.33 - samples/sec: 29.45 - lr: 0.000005 2023-03-09 09:35:40,934 epoch 1 - iter 4617/5137 - loss 1.07841374 - time (sec): 627.10 - samples/sec: 29.45 - lr: 0.000005 2023-03-09 09:36:51,001 epoch 1 - iter 5130/5137 - loss 1.08190428 - time (sec): 697.17 - samples/sec: 29.43 - lr: 0.000005 2023-03-09 09:36:51,917 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:36:51,918 EPOCH 1 done: loss 1.0819 - lr 0.000005 2023-03-09 09:36:59,228 Evaluating as a multi-label problem: False 2023-03-09 09:36:59,236 Task_0 - TextClassifier - loss: 0.7644573450088501 - f1-score (micro avg) 0.4752 2023-03-09 09:37:01,363 Evaluating as a multi-label problem: False 2023-03-09 09:37:01,369 Task_1 - TextClassifier - loss: 0.886383593082428 - f1-score (micro avg) 0.8901 2023-03-09 09:37:03,247 Evaluating as a multi-label problem: False 2023-03-09 09:37:03,252 Task_2 - TextClassifier - loss: 1.427789568901062 - f1-score (micro avg) 0.7157 2023-03-09 09:37:03,252 DEV : loss 1.0262101491292317 - f1-score (micro avg) 0.6936 2023-03-09 09:37:04,122 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:38:31,533 epoch 2 - iter 513/5137 - loss 0.90357270 - time (sec): 87.41 - samples/sec: 23.48 - lr: 0.000005 2023-03-09 09:39:59,070 epoch 2 - iter 1026/5137 - loss 0.94510332 - time (sec): 174.95 - samples/sec: 23.46 - lr: 0.000004 2023-03-09 09:41:26,296 epoch 2 - iter 1539/5137 - loss 0.95755981 - time (sec): 262.17 - samples/sec: 23.48 - lr: 0.000004 2023-03-09 09:42:53,800 epoch 2 - iter 2052/5137 - loss 0.97994334 - time (sec): 349.68 - samples/sec: 23.47 - lr: 0.000004 2023-03-09 09:44:19,905 epoch 2 - iter 2565/5137 - loss 0.97328010 - time (sec): 435.78 - samples/sec: 23.54 - lr: 0.000004 2023-03-09 09:45:47,221 epoch 2 - iter 3078/5137 - loss 0.96453966 - time (sec): 523.10 - samples/sec: 23.54 - lr: 0.000004 2023-03-09 09:47:14,713 epoch 2 - iter 3591/5137 - loss 0.97875627 - time (sec): 610.59 - samples/sec: 23.52 - lr: 0.000004 2023-03-09 09:48:41,874 epoch 2 - iter 4104/5137 - loss 0.98685362 - time (sec): 697.75 - samples/sec: 23.53 - lr: 0.000004 2023-03-09 09:50:09,542 epoch 2 - iter 4617/5137 - loss 0.98525325 - time (sec): 785.42 - samples/sec: 23.51 - lr: 0.000004 2023-03-09 09:51:36,166 epoch 2 - iter 5130/5137 - loss 0.98617675 - time (sec): 872.04 - samples/sec: 23.53 - lr: 0.000004 2023-03-09 09:51:37,402 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:51:37,403 EPOCH 2 done: loss 0.9871 - lr 0.000004 2023-03-09 09:51:44,627 Evaluating as a multi-label problem: False 2023-03-09 09:51:44,636 Task_0 - TextClassifier - loss: 0.8539988398551941 - f1-score (micro avg) 0.8326 2023-03-09 09:51:46,874 Evaluating as a multi-label problem: False 2023-03-09 09:51:46,880 Task_1 - TextClassifier - loss: 1.2971891164779663 - f1-score (micro avg) 0.8944 2023-03-09 09:51:48,761 Evaluating as a multi-label problem: False 2023-03-09 09:51:48,767 Task_2 - TextClassifier - loss: 1.7388358116149902 - f1-score (micro avg) 0.6863 2023-03-09 09:51:48,767 DEV : loss 1.2966745694478352 - f1-score (micro avg) 0.8044 2023-03-09 09:51:49,584 ---------------------------------------------------------------------------------------------------- 2023-03-09 09:53:16,823 epoch 3 - iter 513/5137 - loss 0.93951453 - time (sec): 87.24 - samples/sec: 23.52 - lr: 0.000004 2023-03-09 09:54:44,783 epoch 3 - iter 1026/5137 - loss 0.98191897 - time (sec): 175.20 - samples/sec: 23.42 - lr: 0.000004 2023-03-09 09:56:11,742 epoch 3 - iter 1539/5137 - loss 0.98822068 - time (sec): 262.16 - samples/sec: 23.48 - lr: 0.000003 2023-03-09 09:57:39,400 epoch 3 - iter 2052/5137 - loss 0.97989690 - time (sec): 349.82 - samples/sec: 23.46 - lr: 0.000003 2023-03-09 09:59:06,932 epoch 3 - iter 2565/5137 - loss 0.97280398 - time (sec): 437.35 - samples/sec: 23.46 - lr: 0.000003 2023-03-09 10:00:34,045 epoch 3 - iter 3078/5137 - loss 0.95220877 - time (sec): 524.46 - samples/sec: 23.48 - lr: 0.000003 2023-03-09 10:02:01,083 epoch 3 - iter 3591/5137 - loss 0.96090609 - time (sec): 611.50 - samples/sec: 23.49 - lr: 0.000003 2023-03-09 10:03:28,086 epoch 3 - iter 4104/5137 - loss 0.96310866 - time (sec): 698.50 - samples/sec: 23.50 - lr: 0.000003 2023-03-09 10:04:53,980 epoch 3 - iter 4617/5137 - loss 0.96264001 - time (sec): 784.40 - samples/sec: 23.54 - lr: 0.000003 2023-03-09 10:06:19,861 epoch 3 - iter 5130/5137 - loss 0.96874570 - time (sec): 870.28 - samples/sec: 23.58 - lr: 0.000003 2023-03-09 10:06:21,083 ---------------------------------------------------------------------------------------------------- 2023-03-09 10:06:21,083 EPOCH 3 done: loss 0.9685 - lr 0.000003 2023-03-09 10:06:28,382 Evaluating as a multi-label problem: False 2023-03-09 10:06:28,391 Task_0 - TextClassifier - loss: 0.6750268340110779 - f1-score (micro avg) 0.8262 2023-03-09 10:06:30,516 Evaluating as a multi-label problem: False 2023-03-09 10:06:30,522 Task_1 - TextClassifier - loss: 1.4912396669387817 - f1-score (micro avg) 0.903 2023-03-09 10:06:32,531 Evaluating as a multi-label problem: False 2023-03-09 10:06:32,536 Task_2 - TextClassifier - loss: 1.5878006219863892 - f1-score (micro avg) 0.6936 2023-03-09 10:06:32,536 DEV : loss 1.2513556480407715 - f1-score (micro avg) 0.8076 2023-03-09 10:06:33,348 ---------------------------------------------------------------------------------------------------- 2023-03-09 10:07:59,480 epoch 4 - iter 513/5137 - loss 0.86635708 - time (sec): 86.13 - samples/sec: 23.82 - lr: 0.000003 2023-03-09 10:09:25,258 epoch 4 - iter 1026/5137 - loss 0.85837489 - time (sec): 171.91 - samples/sec: 23.87 - lr: 0.000003 2023-03-09 10:10:50,330 epoch 4 - iter 1539/5137 - loss 0.83713003 - time (sec): 256.98 - samples/sec: 23.96 - lr: 0.000003 2023-03-09 10:12:15,894 epoch 4 - iter 2052/5137 - loss 0.83851480 - time (sec): 342.55 - samples/sec: 23.96 - lr: 0.000002 2023-03-09 10:13:42,217 epoch 4 - iter 2565/5137 - loss 0.85732759 - time (sec): 428.87 - samples/sec: 23.92 - lr: 0.000002 2023-03-09 10:15:07,724 epoch 4 - iter 3078/5137 - loss 0.86813140 - time (sec): 514.38 - samples/sec: 23.94 - lr: 0.000002 2023-03-09 10:16:34,955 epoch 4 - iter 3591/5137 - loss 0.87352673 - time (sec): 601.61 - samples/sec: 23.88 - lr: 0.000002 2023-03-09 10:18:00,724 epoch 4 - iter 4104/5137 - loss 0.87163753 - time (sec): 687.38 - samples/sec: 23.88 - lr: 0.000002 2023-03-09 10:19:26,823 epoch 4 - iter 4617/5137 - loss 0.87379546 - time (sec): 773.48 - samples/sec: 23.88 - lr: 0.000002 2023-03-09 10:20:52,600 epoch 4 - iter 5130/5137 - loss 0.87853938 - time (sec): 859.25 - samples/sec: 23.88 - lr: 0.000002 2023-03-09 10:20:53,858 ---------------------------------------------------------------------------------------------------- 2023-03-09 10:20:53,858 EPOCH 4 done: loss 0.8782 - lr 0.000002 2023-03-09 10:21:01,077 Evaluating as a multi-label problem: False 2023-03-09 10:21:01,085 Task_0 - TextClassifier - loss: 0.8997748494148254 - f1-score (micro avg) 0.8255 2023-03-09 10:21:03,319 Evaluating as a multi-label problem: False 2023-03-09 10:21:03,325 Task_1 - TextClassifier - loss: 1.6563892364501953 - f1-score (micro avg) 0.8944 2023-03-09 10:21:05,206 Evaluating as a multi-label problem: False 2023-03-09 10:21:05,211 Task_2 - TextClassifier - loss: 1.9594365358352661 - f1-score (micro avg) 0.7059 2023-03-09 10:21:05,211 DEV : loss 1.5052002271016438 - f1-score (micro avg) 0.8086 2023-03-09 10:21:06,011 ---------------------------------------------------------------------------------------------------- 2023-03-09 10:22:34,002 epoch 5 - iter 513/5137 - loss 0.74250209 - time (sec): 87.99 - samples/sec: 23.32 - lr: 0.000002 2023-03-09 10:24:00,753 epoch 5 - iter 1026/5137 - loss 0.77652037 - time (sec): 174.74 - samples/sec: 23.49 - lr: 0.000002 2023-03-09 10:25:28,429 epoch 5 - iter 1539/5137 - loss 0.77736529 - time (sec): 262.42 - samples/sec: 23.46 - lr: 0.000002 2023-03-09 10:26:55,110 epoch 5 - iter 2052/5137 - loss 0.76620240 - time (sec): 349.10 - samples/sec: 23.51 - lr: 0.000001 2023-03-09 10:28:22,234 epoch 5 - iter 2565/5137 - loss 0.77413676 - time (sec): 436.22 - samples/sec: 23.52 - lr: 0.000001 2023-03-09 10:29:48,768 epoch 5 - iter 3078/5137 - loss 0.80168968 - time (sec): 522.76 - samples/sec: 23.55 - lr: 0.000001 2023-03-09 10:31:16,786 epoch 5 - iter 3591/5137 - loss 0.80222358 - time (sec): 610.77 - samples/sec: 23.52 - lr: 0.000001 2023-03-09 10:32:44,098 epoch 5 - iter 4104/5137 - loss 0.80434418 - time (sec): 698.09 - samples/sec: 23.52 - lr: 0.000001 2023-03-09 10:34:11,572 epoch 5 - iter 4617/5137 - loss 0.79665919 - time (sec): 785.56 - samples/sec: 23.51 - lr: 0.000001 2023-03-09 10:35:38,908 epoch 5 - iter 5130/5137 - loss 0.79652909 - time (sec): 872.90 - samples/sec: 23.51 - lr: 0.000001 2023-03-09 10:35:40,076 ---------------------------------------------------------------------------------------------------- 2023-03-09 10:35:40,076 EPOCH 5 done: loss 0.7972 - lr 0.000001 2023-03-09 10:35:47,384 Evaluating as a multi-label problem: False 2023-03-09 10:35:47,393 Task_0 - TextClassifier - loss: 0.9911175966262817 - f1-score (micro avg) 0.8305 2023-03-09 10:35:49,521 Evaluating as a multi-label problem: False 2023-03-09 10:35:49,527 Task_1 - TextClassifier - loss: 1.8957191705703735 - f1-score (micro avg) 0.8987 2023-03-09 10:35:51,411 Evaluating as a multi-label problem: False 2023-03-09 10:35:51,417 Task_2 - TextClassifier - loss: 1.965000033378601 - f1-score (micro avg) 0.6985 2023-03-09 10:35:51,417 DEV : loss 1.6172788937886555 - f1-score (micro avg) 0.8092 2023-03-09 10:35:52,285 ---------------------------------------------------------------------------------------------------- 2023-03-09 10:37:19,157 epoch 6 - iter 513/5137 - loss 0.67655701 - time (sec): 86.87 - samples/sec: 23.62 - lr: 0.000001 2023-03-09 10:38:46,908 epoch 6 - iter 1026/5137 - loss 0.71497110 - time (sec): 174.62 - samples/sec: 23.50 - lr: 0.000001 2023-03-09 10:40:14,833 epoch 6 - iter 1539/5137 - loss 0.72622619 - time (sec): 262.55 - samples/sec: 23.45 - lr: 0.000001 2023-03-09 10:41:42,346 epoch 6 - iter 2052/5137 - loss 0.70362521 - time (sec): 350.06 - samples/sec: 23.45 - lr: 0.000001 2023-03-09 10:43:08,949 epoch 6 - iter 2565/5137 - loss 0.69521474 - time (sec): 436.66 - samples/sec: 23.50 - lr: 0.000000 2023-03-09 10:44:36,179 epoch 6 - iter 3078/5137 - loss 0.70019985 - time (sec): 523.89 - samples/sec: 23.50 - lr: 0.000000 2023-03-09 10:46:04,186 epoch 6 - iter 3591/5137 - loss 0.70011303 - time (sec): 611.90 - samples/sec: 23.47 - lr: 0.000000 2023-03-09 10:47:31,875 epoch 6 - iter 4104/5137 - loss 0.70173858 - time (sec): 699.59 - samples/sec: 23.47 - lr: 0.000000 2023-03-09 10:49:00,248 epoch 6 - iter 4617/5137 - loss 0.70728641 - time (sec): 787.96 - samples/sec: 23.44 - lr: 0.000000 2023-03-09 10:50:27,929 epoch 6 - iter 5130/5137 - loss 0.71045816 - time (sec): 875.64 - samples/sec: 23.43 - lr: 0.000000 2023-03-09 10:50:29,119 ---------------------------------------------------------------------------------------------------- 2023-03-09 10:50:29,119 EPOCH 6 done: loss 0.7103 - lr 0.000000 2023-03-09 10:50:36,428 Evaluating as a multi-label problem: False 2023-03-09 10:50:36,437 Task_0 - TextClassifier - loss: 1.0861172676086426 - f1-score (micro avg) 0.8348 2023-03-09 10:50:38,565 Evaluating as a multi-label problem: False 2023-03-09 10:50:38,570 Task_1 - TextClassifier - loss: 1.648613452911377 - f1-score (micro avg) 0.8922 2023-03-09 10:50:40,452 Evaluating as a multi-label problem: False 2023-03-09 10:50:40,458 Task_2 - TextClassifier - loss: 2.3406741619110107 - f1-score (micro avg) 0.6838 2023-03-09 10:50:40,458 DEV : loss 1.6918017069498699 - f1-score (micro avg) 0.8036 2023-03-09 10:50:44,160 ---------------------------------------------------------------------------------------------------- 2023-03-09 10:50:44,165 Testing using last state of model ... 2023-03-09 10:50:57,200 Evaluating as a multi-label problem: False 2023-03-09 10:50:57,216 Task_0 - TextClassifier - loss: 0.0 - f1-score (micro avg) 0.9256 2023-03-09 10:51:01,701 Evaluating as a multi-label problem: False 2023-03-09 10:51:01,709 Task_1 - TextClassifier - loss: 0.0 - f1-score (micro avg) 0.7138 2023-03-09 10:51:04,575 Evaluating as a multi-label problem: False 2023-03-09 10:51:04,581 Task_2 - TextClassifier - loss: 0.0 - f1-score (micro avg) 0.8318 2023-03-09 10:51:04,581 2023-03-09 10:51:04,581 -------------------------------------------------- Task_0 - Label type: subtask_a Results: - F-score (micro) 0.9256 - F-score (macro) 0.9131 - Accuracy 0.9256 By class: precision recall f1-score support NOT 0.9922 0.9042 0.9461 2807 OFF 0.7976 0.9815 0.8800 1080 accuracy 0.9256 3887 macro avg 0.8949 0.9428 0.9131 3887 weighted avg 0.9381 0.9256 0.9278 3887 -------------------------------------------------- Task_1 - Label type: subtask_b Results: - F-score (micro) 0.7138 - F-score (macro) 0.6408 - Accuracy 0.7138 By class: precision recall f1-score support TIN 0.6826 0.9741 0.8027 850 UNT 0.8947 0.3269 0.4789 572 accuracy 0.7138 1422 macro avg 0.7887 0.6505 0.6408 1422 weighted avg 0.7679 0.7138 0.6724 1422 -------------------------------------------------- Task_2 - Label type: subtask_c Results: - F-score (micro) 0.8318 - F-score (macro) 0.6978 - Accuracy 0.8318 By class: precision recall f1-score support IND 0.8703 0.9483 0.9076 580 GRP 0.7216 0.6684 0.6940 190 OTH 0.7143 0.3750 0.4918 80 accuracy 0.8318 850 macro avg 0.7687 0.6639 0.6978 850 weighted avg 0.8223 0.8318 0.8207 850 2023-03-09 10:51:04,581 ----------------------------------------------------------------------------------------------------