{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/noobhocai/train-pairwise-stage2"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:34:51.417747Z","iopub.status.busy":"2023-06-29T01:34:51.417032Z","iopub.status.idle":"2023-06-29T01:35:11.748112Z","shell.execute_reply":"2023-06-29T01:35:11.746819Z","shell.execute_reply.started":"2023-06-29T01:34:51.417706Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"]}],"source":["!pip install sentence_transformers pyvi -q\n","# !pip install --upgrade tensorflow-io==0.32.0"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:11.750592Z","iopub.status.busy":"2023-06-29T01:35:11.750272Z","iopub.status.idle":"2023-06-29T01:35:11.755063Z","shell.execute_reply":"2023-06-29T01:35:11.754177Z","shell.execute_reply.started":"2023-06-29T01:35:11.750563Z"},"trusted":true},"outputs":[],"source":["# !pip install gsutil -q\n","# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py\n","# !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev"]},{"cell_type":"code","execution_count":3,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2023-06-29T01:35:11.760109Z","iopub.status.busy":"2023-06-29T01:35:11.759784Z","iopub.status.idle":"2023-06-29T01:35:46.341332Z","shell.execute_reply":"2023-06-29T01:35:46.339954Z","shell.execute_reply.started":"2023-06-29T01:35:11.760071Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n"," from .autonotebook import tqdm as notebook_tqdm\n"]}],"source":["import os\n","import pandas as pd\n","from transformers import AutoModel, AutoTokenizer\n","import torch\n","from torch.utils.data import DataLoader\n","from sklearn.metrics.pairwise import cosine_similarity\n","import numpy as np\n","from pyvi.ViTokenizer import tokenize\n","from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup\n","from transformers import DataCollatorWithPadding\n","from scipy.stats import pearsonr, spearmanr\n","import math\n","from sklearn.metrics import *\n","from sklearn.model_selection import GroupKFold, KFold"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:46.343395Z","iopub.status.busy":"2023-06-29T01:35:46.342831Z","iopub.status.idle":"2023-06-29T01:35:46.348021Z","shell.execute_reply":"2023-06-29T01:35:46.347052Z","shell.execute_reply.started":"2023-06-29T01:35:46.343363Z"},"trusted":true},"outputs":[],"source":["AUTH_TOKEN = \"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\""]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:46.349601Z","iopub.status.busy":"2023-06-29T01:35:46.349307Z","iopub.status.idle":"2023-06-29T01:35:47.904138Z","shell.execute_reply":"2023-06-29T01:35:47.903274Z","shell.execute_reply.started":"2023-06-29T01:35:46.349575Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["Downloading (…)okenizer_config.json: 100%|██████████| 398/398 [00:00<00:00, 47.5kB/s]\n","Downloading (…)/main/tokenizer.json: 100%|██████████| 9.08M/9.08M [00:00<00:00, 70.8MB/s]\n","Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 133kB/s]\n"]},{"name":"stdout","output_type":"stream","text":[" sinh viên đại học khoa học tự nhiên\n"]}],"source":["tokenizer = AutoTokenizer.from_pretrained('nguyenvulebinh/vi-mrc-base', use_auth_token=AUTH_TOKEN)\n","print(tokenizer.decode(tokenizer.encode(\"sinh viên đại học khoa học tự nhiên \")))"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:47.905567Z","iopub.status.busy":"2023-06-29T01:35:47.905232Z","iopub.status.idle":"2023-06-29T01:35:47.909874Z","shell.execute_reply":"2023-06-29T01:35:47.909125Z","shell.execute_reply.started":"2023-06-29T01:35:47.905538Z"},"trusted":true},"outputs":[],"source":["import os\n","os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:47.911132Z","iopub.status.busy":"2023-06-29T01:35:47.910848Z","iopub.status.idle":"2023-06-29T01:35:58.020338Z","shell.execute_reply":"2023-06-29T01:35:58.019355Z","shell.execute_reply.started":"2023-06-29T01:35:47.911107Z"},"trusted":true},"outputs":[],"source":["df = pd.read_csv(\"/kaggle/input/e2eqa-wiki-zalo-ai/processed/train_stage2_ranking.csv\")"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:58.021880Z","iopub.status.busy":"2023-06-29T01:35:58.021545Z","iopub.status.idle":"2023-06-29T01:35:58.046072Z","shell.execute_reply":"2023-06-29T01:35:58.045164Z","shell.execute_reply.started":"2023-06-29T01:35:58.021853Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
questionanswertitlecandidatelabelgroup
0Đất nước nào không có quân độiCosta Rica, Iceland, Panama, Micronesia, Quần ...Costa RicaCosta Rica Costa Rica (Phiên âm: Cô-xta Ri-ca)...00
1Đất nước nào không có quân độiCosta Rica, Iceland, Panama, Micronesia, Quần ...Quần đảo MarshallQuần đảo Marshall Quần đảo Marshall, tên chính...00
2Đất nước nào không có quân độiCosta Rica, Iceland, Panama, Micronesia, Quần ...Montezuma, Costa RicaMontezuma, Costa Rica Montezuma là một thị xã ...00
3Đất nước nào không có quân độiCosta Rica, Iceland, Panama, Micronesia, Quần ...Tamarindo, Costa RicaTamarindo, Costa Rica Tamarindo là một thị xã ...00
4Đất nước nào không có quân độiCosta Rica, Iceland, Panama, Micronesia, Quần ...MicronesiaMicronesia Micronesia (, ), còn gọi là Tiểu Đả...00
.....................
46075trong thần thoại hy lạp vị thần tình yêu có tê...ErosEros phaleratusEros phaleratus Eros phaleratus là một loài bọ...04607
46076trong thần thoại hy lạp vị thần tình yêu có tê...ErosEros melanurusEros melanurus Eros melanurus là một loài bọ c...04607
46077trong thần thoại hy lạp vị thần tình yêu có tê...ErosEros melanopterusEros melanopterus Eros melanopterus là một loà...04607
46078trong thần thoại hy lạp vị thần tình yêu có tê...ErosEros humeralisEros humeralis Eros humeralis là một loài bọ c...04607
46079trong thần thoại hy lạp vị thần tình yêu có tê...ErosEros patruelisEros patruelis Eros patruelis là một loài bọ c...04607
\n","

46080 rows × 6 columns

\n","
"],"text/plain":[" question \\\n","0 Đất nước nào không có quân đội \n","1 Đất nước nào không có quân đội \n","2 Đất nước nào không có quân đội \n","3 Đất nước nào không có quân đội \n","4 Đất nước nào không có quân đội \n","... ... \n","46075 trong thần thoại hy lạp vị thần tình yêu có tê... \n","46076 trong thần thoại hy lạp vị thần tình yêu có tê... \n","46077 trong thần thoại hy lạp vị thần tình yêu có tê... \n","46078 trong thần thoại hy lạp vị thần tình yêu có tê... \n","46079 trong thần thoại hy lạp vị thần tình yêu có tê... \n","\n"," answer \\\n","0 Costa Rica, Iceland, Panama, Micronesia, Quần ... \n","1 Costa Rica, Iceland, Panama, Micronesia, Quần ... \n","2 Costa Rica, Iceland, Panama, Micronesia, Quần ... \n","3 Costa Rica, Iceland, Panama, Micronesia, Quần ... \n","4 Costa Rica, Iceland, Panama, Micronesia, Quần ... \n","... ... \n","46075 Eros \n","46076 Eros \n","46077 Eros \n","46078 Eros \n","46079 Eros \n","\n"," title \\\n","0 Costa Rica \n","1 Quần đảo Marshall \n","2 Montezuma, Costa Rica \n","3 Tamarindo, Costa Rica \n","4 Micronesia \n","... ... \n","46075 Eros phaleratus \n","46076 Eros melanurus \n","46077 Eros melanopterus \n","46078 Eros humeralis \n","46079 Eros patruelis \n","\n"," candidate label group \n","0 Costa Rica Costa Rica (Phiên âm: Cô-xta Ri-ca)... 0 0 \n","1 Quần đảo Marshall Quần đảo Marshall, tên chính... 0 0 \n","2 Montezuma, Costa Rica Montezuma là một thị xã ... 0 0 \n","3 Tamarindo, Costa Rica Tamarindo là một thị xã ... 0 0 \n","4 Micronesia Micronesia (, ), còn gọi là Tiểu Đả... 0 0 \n","... ... ... ... \n","46075 Eros phaleratus Eros phaleratus là một loài bọ... 0 4607 \n","46076 Eros melanurus Eros melanurus là một loài bọ c... 0 4607 \n","46077 Eros melanopterus Eros melanopterus là một loà... 0 4607 \n","46078 Eros humeralis Eros humeralis là một loài bọ c... 0 4607 \n","46079 Eros patruelis Eros patruelis là một loài bọ c... 
0 4607 \n","\n","[46080 rows x 6 columns]"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["df"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:58.049946Z","iopub.status.busy":"2023-06-29T01:35:58.049516Z","iopub.status.idle":"2023-06-29T01:35:58.056766Z","shell.execute_reply":"2023-06-29T01:35:58.055691Z","shell.execute_reply.started":"2023-06-29T01:35:58.049902Z"},"trusted":true},"outputs":[{"data":{"text/plain":["''"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["tokenizer.sep_token"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:58.058453Z","iopub.status.busy":"2023-06-29T01:35:58.058155Z","iopub.status.idle":"2023-06-29T01:35:58.065931Z","shell.execute_reply":"2023-06-29T01:35:58.064879Z","shell.execute_reply.started":"2023-06-29T01:35:58.058427Z"},"trusted":true},"outputs":[],"source":["# !pip install torch"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:58.067617Z","iopub.status.busy":"2023-06-29T01:35:58.067324Z","iopub.status.idle":"2023-06-29T01:35:58.076734Z","shell.execute_reply":"2023-06-29T01:35:58.075869Z","shell.execute_reply.started":"2023-06-29T01:35:58.067591Z"},"trusted":true},"outputs":[],"source":["import torch.nn as nn\n","from transformers import AutoModel, AutoConfig\n","\n","class PairwiseModel(nn.Module):\n"," def __init__(self, model_name):\n"," super(PairwiseModel, self).__init__()\n"," self.model = AutoModel.from_pretrained(model_name,use_auth_token=AUTH_TOKEN)\n"," self.config = AutoConfig.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)\n"," self.drop = nn.Dropout(p=0.2)\n"," self.fc = nn.Linear(768, 1)\n"," \n"," def forward(self, ids, masks):\n"," out = self.model(input_ids=ids,\n"," attention_mask=masks,\n"," output_hidden_states=False).last_hidden_state\n"," out = out[:,0]\n"," outputs = self.fc(out)\n"," return outputs"]},{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:58.078100Z","iopub.status.busy":"2023-06-29T01:35:58.077801Z","iopub.status.idle":"2023-06-29T01:36:03.750905Z","shell.execute_reply":"2023-06-29T01:36:03.749622Z","shell.execute_reply.started":"2023-06-29T01:35:58.078073Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Requirement already satisfied: tqdm in /usr/local/lib/python3.8/site-packages (4.65.0)\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"]}],"source":["!pip install tqdm"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:36:03.752804Z","iopub.status.busy":"2023-06-29T01:36:03.752475Z","iopub.status.idle":"2023-06-29T01:36:03.772087Z","shell.execute_reply":"2023-06-29T01:36:03.771300Z","shell.execute_reply.started":"2023-06-29T01:36:03.752773Z"},"trusted":true},"outputs":[],"source":["from torch.utils.data import Dataset\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","\n","class SiameseDataset(Dataset):\n","\n"," def __init__(self, df, tokenizer, max_length):\n"," self.df = df\n"," self.max_length = max_length\n"," self.tokenizer = tokenizer\n"," self.df[\"content1\"] = self.df.apply(lambda row: row.question+f\" {tokenizer.sep_token} \"+row.answer,axis=1)\n"," self.df[\"content2\"] = self.df.apply(lambda row: row.title+f\" {tokenizer.sep_token} \"+row.candidate,axis=1)\n"," self.content1 = tokenizer.batch_encode_plus(list(df.content1.apply(lambda x: x.replace(\"_\",\" \")).values), max_length=max_length, truncation=True)[\"input_ids\"]\n"," self.content2 = tokenizer.batch_encode_plus(list(df.content2.apply(lambda x: x.replace(\"_\",\" \")).values), max_length=max_length, truncation=True)[\"input_ids\"]\n"," self.targets = self.df.label\n"," \n"," def __len__(self):\n"," return len(self.df)\n","\n"," def __getitem__(self, index):\n"," return {\n"," 'ids1': torch.tensor(self.content1[index], dtype=torch.long),\n"," 'ids2': torch.tensor(self.content2[index][1:], dtype=torch.long),\n"," 'target': torch.tensor(self.targets[index], dtype=torch.float)\n"," }\n","pad_token_id = tokenizer.pad_token_id\n","def collate_fn(batch):\n"," ids = [torch.cat([x[\"ids1\"], x[\"ids2\"]]) for x in batch]\n"," targets = [x[\"target\"] for x in batch]\n"," max_len = np.max([len(x) for x in ids])\n"," masks = []\n"," for i in range(len(ids)):\n"," if len(ids[i]) < max_len:\n"," ids[i]= torch.cat((ids[i], torch.tensor([pad_token_id,]*(max_len - len(ids[i])),dtype=torch.long)))\n"," masks.append(ids[i] != pad_token_id)\n"," # print(tokenizer.decode(ids[0]))\n"," outputs = {\n"," \"ids\": torch.vstack(ids),\n"," \"masks\": torch.vstack(masks),\n"," \"target\": torch.vstack(targets).view(-1)\n"," }\n"," return outputs"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:36:03.773416Z","iopub.status.busy":"2023-06-29T01:36:03.773148Z","iopub.status.idle":"2023-06-29T01:36:03.789151Z","shell.execute_reply":"2023-06-29T01:36:03.788258Z","shell.execute_reply.started":"2023-06-29T01:36:03.773393Z"},"trusted":true},"outputs":[],"source":["def optimizer_scheduler(model, num_train_steps):\n"," param_optimizer = list(model.named_parameters())\n"," no_decay = [\"bias\", \"LayerNorm.weight\"]\n"," optimizer_parameters = [\n"," {\n"," \"params\": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],\n"," \"weight_decay\": 0.001,\n"," },\n"," {\n"," \"params\": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],\n"," 
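# bias and LayerNorm weights are matched by no_decay above, so this group\n","            # gets weight_decay 0.0, while every other parameter uses 0.001\n","            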
\"weight_decay\": 0.0,\n"," },\n"," ]\n","\n"," opt = AdamW(optimizer_parameters, lr=3e-5)\n"," sch = get_linear_schedule_with_warmup(\n"," opt,\n"," num_warmup_steps=int(0.05*num_train_steps),\n"," num_training_steps=num_train_steps,\n"," last_epoch=-1,\n"," )\n"," return opt, sch"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:36:03.790608Z","iopub.status.busy":"2023-06-29T01:36:03.790351Z","iopub.status.idle":"2023-06-29T08:54:33.816427Z","shell.execute_reply":"2023-06-29T08:54:33.813840Z","shell.execute_reply.started":"2023-06-29T01:36:03.790586Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\n","INFO:tensorflow:Initializing the TPU system: local\n","INFO:tensorflow:Finished initializing TPU system.\n","INFO:tensorflow:Found TPU system:\n","INFO:tensorflow:*** Num TPU Cores: 8\n","INFO:tensorflow:*** Num TPU Workers: 1\n","INFO:tensorflow:*** Num TPU Cores Per Worker: 8\n","INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n","INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\n","INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\n","INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\n","INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\n","INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\n","INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)\n","INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\n","INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\n","INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\n"]},{"name":"stderr","output_type":"stream","text":["Downloading (…)lve/main/config.json: 100%|██████████| 688/688 [00:00<00:00, 97.6kB/s]\n","Downloading pytorch_model.bin: 100%|██████████| 1.11G/1.11G [00:14<00:00, 77.9MB/s]\n","Some weights of the model checkpoint at nguyenvulebinh/vi-mrc-base were not used when initializing RobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']\n","- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n","- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n","Some weights of RobertaModel were not initialized from the model checkpoint at nguyenvulebinh/vi-mrc-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n","You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n","/usr/local/lib/python3.8/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n","  warnings.warn(\n","  0%|          | 0/5 [00:00<?, ?it/s]"]},
{"ename":"KeyboardInterrupt","evalue":"","output_type":"error","traceback":["KeyboardInterrupt Traceback (most recent call last)\n","Cell In[15], line 46\n","---> 46 preds = model(ids, masks)\n","Cell In[11], line 13, in PairwiseModel.forward(self, ids, masks)\n","---> 13 out = self.model(input_ids=ids,\n",
"File /usr/local/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py:852, in RobertaModel.forward(...)\n","File /usr/local/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py:527, in RobertaEncoder.forward(...)\n","File /usr/local/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py:411, in RobertaLayer.forward(...)\n","File /usr/local/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py:338, in RobertaAttention.forward(...)\n","File /usr/local/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py:218, in RobertaSelfAttention.forward(...)\n",
"File \u001b[0;32m/usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, 
**kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n","File \u001b[0;32m/usr/local/lib/python3.8/site-packages/torch/nn/modules/linear.py:114\u001b[0m, in \u001b[0;36mLinear.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n","\u001b[0;31mKeyboardInterrupt\u001b[0m: "]}],"source":["from sklearn.model_selection import GroupKFold\n","kfold = GroupKFold(n_splits=5)\n","import tensorflow as tf\n","\n","loss_fn = nn.BCEWithLogitsLoss()\n","epochs = 5\n","accumulation_steps = 8\n","error_ids = None\n","\n","# detect and init the TPU\n","tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()\n","\n","# instantiate a distribution strategy\n","tpu_strategy = tf.distribute.TPUStrategy(tpu)\n","\n","# adjust the batch size and learning rate\n","BATCH_SIZE = 16 * tpu_strategy.num_replicas_in_sync # 128 per core\n","LEARNING_RATE = 0.001 * tpu_strategy.num_replicas_in_sync # scale up\n","\n","for fold, (train_index, test_index) in enumerate(kfold.split(df, df.label, df.group)):\n"," # instantiating the model in the strategy scope creates the model on the TPU\n"," with tpu_strategy.scope():\n"," model = PairwiseModel('nguyenvulebinh/vi-mrc-base')\n"," train_df = df\n"," val_df = df.iloc[test_index].reset_index(drop=True)\n"," \n"," train_dataset = SiameseDataset(train_df, tokenizer, 
256)\n"," valid_dataset = SiameseDataset(val_df, tokenizer, 256)\n"," train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn,\n"," num_workers=2, shuffle=True, pin_memory=True, drop_last=True)\n"," valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn,\n"," num_workers=2, shuffle=False, pin_memory=True)\n"," \n"," num_train_steps = len(train_loader) * epochs // accumulation_steps\n"," \n"," # use your custom optimizer and scheduler\n"," optimizer, scheduler = optimizer_scheduler(model, num_train_steps)\n"," \n"," for epoch in tqdm(range(epochs)):\n"," model.train()\n"," bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)\n"," for step, data in bar:\n"," ids = data[\"ids\"]\n"," masks = data[\"masks\"]\n"," target = data[\"target\"]\n"," preds = model(ids, masks)\n"," loss = loss_fn(preds.view(-1), target.view(-1))\n"," loss /= accumulation_steps\n"," loss.backward()\n"," if (step + 1) % accumulation_steps == 0:\n","# optimizer.apply_gradients(zip(model.trainable_variables, model.gradients()))\n"," optimizer.step()\n"," optimizer.zero_grad()\n"," scheduler.step()\n"," bar.set_postfix(loss=loss.item())\n"," model.eval()\n"," with torch.no_grad():\n"," bar = tqdm(enumerate(valid_loader), total=len(valid_loader), leave=False)\n"," targets = []\n"," all_preds = []\n"," for step, data in bar:\n"," ids = data[\"ids\"]\n"," masks = data[\"masks\"]\n"," target = data[\"target\"]\n"," preds = torch.sigmoid(model(ids, masks))\n"," all_preds.extend(preds.cpu().view(-1).numpy())\n"," targets.extend(target.cpu().view(-1).numpy())\n"," all_preds = np.array(all_preds)\n"," targets = np.array(targets)\n"," total = 0\n"," val_df[\"preds\"] = all_preds\n"," for group in val_df.group.unique():\n"," tmp = val_df[val_df.group == group]\n"," if np.argmax(tmp.label.values) == np.argmax(tmp.preds.values):\n"," total += 1\n"," print(total/len(val_df.group.unique()))\n"," break\n"]},{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T08:54:41.373753Z","iopub.status.busy":"2023-06-29T08:54:41.373191Z","iopub.status.idle":"2023-06-29T08:54:42.698384Z","shell.execute_reply":"2023-06-29T08:54:42.696862Z","shell.execute_reply.started":"2023-06-29T08:54:41.373706Z"},"trusted":true},"outputs":[],"source":["torch.save(model.state_dict(), f\"/kaggle/working/pairwise_stage2_seed0.bin\")"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.17"}},"nbformat":4,"nbformat_minor":4}