Spaces:

foxxy-hm
/

e2eqa-wiki

Build error

File size: 31,521 Bytes

c5d2283

{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/noobhocai/train-pairwise-stage1"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:52:30.450145Z","iopub.status.busy":"2023-06-28T02:52:30.449601Z","iopub.status.idle":"2023-06-28T02:52:48.306606Z","shell.execute_reply":"2023-06-28T02:52:48.305298Z","shell.execute_reply.started":"2023-06-28T02:52:30.450111Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install sentence_transformers pyvi -q"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:52:48.310777Z","iopub.status.busy":"2023-06-28T02:52:48.309711Z","iopub.status.idle":"2023-06-28T02:53:01.669646Z","shell.execute_reply":"2023-06-28T02:53:01.668372Z","shell.execute_reply.started":"2023-06-28T02:52:48.310744Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Collecting tensorflow-io==0.32.0\n","  Downloading tensorflow_io-0.32.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (28.0 MB)\n","\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m28.0/28.0 MB\u001b[0m \u001b[31m40.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n","\u001b[?25hCollecting tensorflow-io-gcs-filesystem==0.32.0 (from tensorflow-io==0.32.0)\n","  Downloading tensorflow_io_gcs_filesystem-0.32.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.4 MB)\n","\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m58.1 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n","\u001b[?25hInstalling collected packages: tensorflow-io-gcs-filesystem, tensorflow-io\n","  Attempting uninstall: tensorflow-io-gcs-filesystem\n","    Found existing installation: tensorflow-io-gcs-filesystem 0.31.0\n","    Uninstalling tensorflow-io-gcs-filesystem-0.31.0:\n","      Successfully uninstalled tensorflow-io-gcs-filesystem-0.31.0\n","  Attempting uninstall: tensorflow-io\n","    Found existing installation: tensorflow-io 0.31.0\n","    Uninstalling tensorflow-io-0.31.0:\n","      Successfully uninstalled tensorflow-io-0.31.0\n","Successfully installed tensorflow-io-0.32.0 tensorflow-io-gcs-filesystem-0.32.0\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install --upgrade tensorflow-io==0.32.0"]},{"cell_type":"code","execution_count":3,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2023-06-28T02:53:01.672246Z","iopub.status.busy":"2023-06-28T02:53:01.671610Z","iopub.status.idle":"2023-06-28T02:53:15.771561Z","shell.execute_reply":"2023-06-28T02:53:15.770593Z","shell.execute_reply.started":"2023-06-28T02:53:01.672210Z"},"trusted":true},"outputs":[],"source":["import os\n","import pandas as pd\n","import json\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","from transformers import AutoModel, AutoTokenizer\n","import torch\n","from torch.utils.data import DataLoader\n","import pandas as pd\n","from sklearn.metrics.pairwise import cosine_similarity\n","import numpy as np\n","from sentence_transformers import SentenceTransformer\n","from pyvi.ViTokenizer import tokenize\n","from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup\n","from 
transformers import DataCollatorWithPadding\n","from scipy.stats import pearsonr, spearmanr\n","import math\n","from sklearn.metrics import *"]},
{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:15.775041Z","iopub.status.busy":"2023-06-28T02:53:15.774278Z","iopub.status.idle":"2023-06-28T02:53:15.780313Z","shell.execute_reply":"2023-06-28T02:53:15.778919Z","shell.execute_reply.started":"2023-06-28T02:53:15.774985Z"},"trusted":true},"outputs":[],"source":[
"# Hugging Face access token. It is read from the HF_AUTH_TOKEN environment variable so that no\n",
"# credential is stored in the notebook; os.environ.get yields None when the variable is unset.\n",
"AUTH_TOKEN = os.environ.get(\"HF_AUTH_TOKEN\")"]},
{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:15.782302Z","iopub.status.busy":"2023-06-28T02:53:15.781713Z","iopub.status.idle":"2023-06-28T02:53:18.884084Z","shell.execute_reply":"2023-06-28T02:53:18.882908Z","shell.execute_reply.started":"2023-06-28T02:53:15.782268Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"6118765cb5854ac7b8f4c68eb0c654a2","version_major":2,"version_minor":0},"text/plain":["Downloading (…)okenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"9d4cfe6e1330443eb6775a558657bbfb","version_major":2,"version_minor":0},"text/plain":["Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"cf5607811ef54e528e24d6a66eceb758","version_major":2,"version_minor":0},"text/plain":["Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["<s> sinh viên đại học khoa học tự nhiên</s>\n"]}],"source":["tokenizer = AutoTokenizer.from_pretrained('nguyenvulebinh/vi-mrc-base', 
use_auth_token=AUTH_TOKEN)\n","print(tokenizer.decode(tokenizer.encode(\"sinh viên đại học khoa học tự nhiên\")))"]},
{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:18.886122Z","iopub.status.busy":"2023-06-28T02:53:18.885497Z","iopub.status.idle":"2023-06-28T02:53:18.897285Z","shell.execute_reply":"2023-06-28T02:53:18.896274Z","shell.execute_reply.started":"2023-06-28T02:53:18.886088Z"},"trusted":true},"outputs":[],"source":[
"import json\n",
"from glob import glob\n",
"import re\n",
"from nltk import word_tokenize as lib_tokenizer\n",
"\n",
"# Cache shared by every word_tokenize call: whitespace-delimited piece -> normalized form.\n",
"dict_map = dict({})\n",
"\n",
"def word_tokenize(text):\n",
"    \"\"\"Split text on whitespace and normalize every piece.\n",
"\n",
"    Each piece is passed through lib_tokenizer, its tokens are re-joined with\n",
"    single spaces, and the `` and '' tokens are replaced by a double-quote\n",
"    character. Normalized pieces are memoized in the module-level dict_map.\n",
"\n",
"    Returns:\n",
"        A list holding one normalized string per whitespace-delimited piece.\n",
"    \"\"\"\n",
"    words_norm = []\n",
"    for w in text.split():\n",
"        if w not in dict_map:\n",
"            dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"')\n",
"        words_norm.append(dict_map[w])\n",
"    return words_norm\n",
"\n",
"def strip_answer_string(text):\n",
"    \"\"\"Trim punctuation from both ends of an answer string.\n",
"\n",
"    Characters of the punctuation set are removed one at a time, first from\n",
"    the end and then from the start. Trimming at the end stops early for a\n",
"    closing ')' when the text contains '(' but does not start with it, and for\n",
"    a closing double quote when the text does not start with one and contains\n",
"    more than one. Trimming at the start stops early for an opening double\n",
"    quote under the mirrored condition.\n",
"\n",
"    Returns:\n",
"        The trimmed string. The result is '' when the input is empty or made\n",
"        up only of strippable characters: both loops check that text is\n",
"        non-empty before indexing text[-1] / text[0], which would otherwise\n",
"        raise IndexError.\n",
"    \"\"\"\n",
"    punctuation = '.,/><;:\\'\"[]{}+=-_)(*&^!~`'\n",
"    text = text.strip()\n",
"    while text and text[-1] in punctuation:\n",
"        if text[0] != '(' and text[-1] == ')' and '(' in text:\n",
"            break\n",
"        if text[-1] == '\"' and text[0] != '\"' and text.count('\"') > 1:\n",
"            break\n",
"        text = text[:-1].strip()\n",
"    while text and text[0] in punctuation:\n",
"        if text[0] == '\"' and text[-1] != '\"' and text.count('\"') > 1:\n",
"            break\n",
"        text = text[1:].strip()\n",
"    return text.strip()\n",
"\n",
"def strip_context(text):\n",
"    # Turn newlines into spaces, collapse every whitespace run to one space, trim both ends.\n",
"    text = text.replace('\\n', ' ')\n",
"    text = re.sub(r'\\s+', ' ', text)\n",
"    text = text.strip()\n",
"    return 
text"]},
{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:18.899048Z","iopub.status.busy":"2023-06-28T02:53:18.898414Z","iopub.status.idle":"2023-06-28T02:53:28.963083Z","shell.execute_reply":"2023-06-28T02:53:28.962126Z","shell.execute_reply.started":"2023-06-28T02:53:18.899002Z"},"trusted":true},"outputs":[],"source":[
"train1 = pd.read_csv(\"/kaggle/input/e2eqa-wiki-zalo-ai/processed/train_stage1_ranking.csv\")\n",
"# Same cleanup for both text columns: strip_context first, then re-join the word_tokenize pieces.\n",
"for column in ('text', 'question'):\n",
"    train1[column] = train1[column].apply(lambda x: \" \".join(word_tokenize(strip_context(x))))\n",
"df = train1"]},
{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.965037Z","iopub.status.busy":"2023-06-28T02:53:28.964656Z","iopub.status.idle":"2023-06-28T02:53:28.976243Z","shell.execute_reply":"2023-06-28T02:53:28.974943Z","shell.execute_reply.started":"2023-06-28T02:53:28.964989Z"},"trusted":true},"outputs":[],"source":[
"import torch.nn as nn\n",
"from transformers import AutoModel, AutoConfig\n",
"\n",
"class MeanPooling(nn.Module):\n",
"    \"\"\"Average last_hidden_state along dim 1, weighted by the attention mask.\"\"\"\n",
"\n",
"    def __init__(self):\n",
"        super().__init__()\n",
"\n",
"    def forward(self, last_hidden_state, attention_mask):\n",
"        # Expand the mask over the last dimension so masked-out positions contribute zero.\n",
"        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()\n",
"        summed = torch.sum(last_hidden_state * mask, 1)\n",
"        # Clamp the mask total so an all-zero mask cannot cause a division by zero.\n",
"        counts = torch.clamp(mask.sum(1), min=1e-9)\n",
"        return summed / counts\n",
"\n",
"class PairwiseModel(nn.Module):\n",
"    \"\"\"Pretrained encoder plus a linear layer that maps the first position's hidden state to one logit.\"\"\"\n",
"\n",
"    def __init__(self, model_name):\n",
"        super().__init__()\n",
"        self.model = AutoModel.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)\n",
"        self.config = AutoConfig.from_pretrained(model_name, 
use_auth_token=AUTH_TOKEN)\n",
"        # Not applied in forward(); kept so the module's attributes stay unchanged.\n",
"        self.drop = nn.Dropout(p=0.2)\n",
"        # Size the head from the encoder config instead of hard-coding 768.\n",
"        self.fc = nn.Linear(self.config.hidden_size, 1)\n",
"\n",
"    def forward(self, ids, masks):\n",
"        # Encode, keep the hidden state at position 0 of every row, project it to one logit.\n",
"        hidden = self.model(input_ids=ids,\n",
"                            attention_mask=masks,\n",
"                            output_hidden_states=False).last_hidden_state\n",
"        return self.fc(hidden[:, 0])"]},
{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.978164Z","iopub.status.busy":"2023-06-28T02:53:28.977797Z","iopub.status.idle":"2023-06-28T02:53:28.990859Z","shell.execute_reply":"2023-06-28T02:53:28.989964Z","shell.execute_reply.started":"2023-06-28T02:53:28.978133Z"},"trusted":true},"outputs":[],"source":[
"from torch.utils.data import Dataset\n",
"\n",
"class SiameseDataset(Dataset):\n",
"    \"\"\"Pre-tokenized (question, text, label) rows of a DataFrame, addressed by position.\n",
"\n",
"    The question and text columns are tokenized separately when the dataset is\n",
"    built, after replacing '_' with a space; each encoding is truncated to\n",
"    max_length on its own.\n",
"    NOTE(review): the two encodings are concatenated later, so a combined row\n",
"    can be longer than max_length - confirm this stays within the encoder's limit.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, df, tokenizer, max_length):\n",
"        self.df = df\n",
"        self.max_length = max_length\n",
"        self.tokenizer = tokenizer\n",
"        questions = list(df.question.apply(lambda x: x.replace(\"_\",\" \")).values)\n",
"        texts = list(df.text.apply(lambda x: x.replace(\"_\",\" \")).values)\n",
"        self.content1 = tokenizer.batch_encode_plus(questions, max_length=max_length, truncation=True)[\"input_ids\"]\n",
"        self.content2 = tokenizer.batch_encode_plus(texts, max_length=max_length, truncation=True)[\"input_ids\"]\n",
"        # Store labels as a positional array: indexing the pandas Series with [index] is label-based\n",
"        # and fails (or returns another row) when df does not carry a 0..n-1 index.\n",
"        self.targets = self.df.label.values\n",
"\n",
"    def __len__(self):\n",
"        return len(self.df)\n",
"\n",
"    def __getitem__(self, index):\n",
"        return {\n",
"            'ids1': torch.tensor(self.content1[index], dtype=torch.long),\n",
"            # [1:] drops the first id of the text encoding (the tokenizer output printed earlier starts with <s>).\n",
"            'ids2': torch.tensor(self.content2[index][1:], dtype=torch.long),\n",
"            'target': torch.tensor(self.targets[index], dtype=torch.float)\n",
"        
}"]},
{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.995179Z","iopub.status.busy":"2023-06-28T02:53:28.994892Z","iopub.status.idle":"2023-06-28T02:53:29.004203Z","shell.execute_reply":"2023-06-28T02:53:29.003189Z","shell.execute_reply.started":"2023-06-28T02:53:28.995156Z"},"trusted":true},"outputs":[],"source":[
"pad_token_id = tokenizer.pad_token_id\n",
"\n",
"def collate_fn(batch):\n",
"    \"\"\"Merge dataset items into one right-padded batch.\n",
"\n",
"    For every item, 'ids1' and 'ids2' are concatenated; the result is padded\n",
"    with pad_token_id up to the longest sequence in the batch.\n",
"\n",
"    Returns:\n",
"        A dict with 'ids' (the stacked padded sequences), 'masks' (True where\n",
"        an id differs from pad_token_id) and 'target' (the targets flattened\n",
"        to one dimension).\n",
"    \"\"\"\n",
"    sequences = [torch.cat([item[\"ids1\"], item[\"ids2\"]]) for item in batch]\n",
"    longest = max(len(seq) for seq in sequences)\n",
"    padded = []\n",
"    for seq in sequences:\n",
"        missing = longest - len(seq)\n",
"        if missing > 0:\n",
"            filler = torch.tensor([pad_token_id] * missing, dtype=torch.long)\n",
"            seq = torch.cat((seq, filler))\n",
"        padded.append(seq)\n",
"    ids = torch.vstack(padded)\n",
"    return {\n",
"        \"ids\": ids,\n",
"        \"masks\": ids != pad_token_id,\n",
"        \"target\": torch.vstack([item[\"target\"] for item in batch]).view(-1)\n",
"    }"]},
{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.006085Z","iopub.status.busy":"2023-06-28T02:53:29.005583Z","iopub.status.idle":"2023-06-28T02:53:29.013002Z","shell.execute_reply":"2023-06-28T02:53:29.011919Z","shell.execute_reply.started":"2023-06-28T02:53:29.006052Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import GroupKFold, KFold"]},
{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.014999Z","iopub.status.busy":"2023-06-28T02:53:29.014493Z","iopub.status.idle":"2023-06-28T02:53:29.023324Z","shell.execute_reply":"2023-06-28T02:53:29.022211Z","shell.execute_reply.started":"2023-06-28T02:53:29.014969Z"},"trusted":true},"outputs":[],"source":[
"def optimizer_scheduler(model, num_train_steps):\n",
"    \"\"\"Build the optimizer and learning-rate schedule for model.\n",
"\n",
"    AdamW with lr 3e-5; weight decay is 0.001 except for parameters whose name\n",
"    contains 'bias' or 'LayerNorm.weight', which get 0.0. The schedule is linear\n",
"    with a warmup of 5% of num_train_steps.\n",
"\n",
"    Returns:\n",
"        The (optimizer, scheduler) pair.\n",
"    \"\"\"\n",
"    param_optimizer = 
list(model.named_parameters())\n",
"    no_decay = [\"bias\", \"LayerNorm.weight\"]\n",
"\n",
"    def skips_decay(name):\n",
"        # True when the parameter name contains one of the no_decay markers.\n",
"        return any(marker in name for marker in no_decay)\n",
"\n",
"    optimizer_parameters = [\n",
"        {\"params\": [p for n, p in param_optimizer if not skips_decay(n)], \"weight_decay\": 0.001},\n",
"        {\"params\": [p for n, p in param_optimizer if skips_decay(n)], \"weight_decay\": 0.0},\n",
"    ]\n",
"\n",
"    # torch.optim.AdamW replaces transformers.AdamW, which is deprecated (see the FutureWarning in this\n",
"    # notebook's recorded output). eps=1e-6 keeps the epsilon the transformers implementation defaulted to.\n",
"    opt = torch.optim.AdamW(optimizer_parameters, lr=3e-5, eps=1e-6)\n",
"    sch = get_linear_schedule_with_warmup(\n",
"        opt,\n",
"        num_warmup_steps=int(0.05 * num_train_steps),\n",
"        num_training_steps=num_train_steps,\n",
"        last_epoch=-1,\n",
"    )\n",
"    return opt, sch"]},
{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.025531Z","iopub.status.busy":"2023-06-28T02:53:29.025091Z","iopub.status.idle":"2023-06-28T02:53:29.036112Z","shell.execute_reply":"2023-06-28T02:53:29.034921Z","shell.execute_reply.started":"2023-06-28T02:53:29.025496Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import KFold\n","kfold = KFold(n_splits=5, shuffle=True, random_state=42)"]},
{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.038723Z","iopub.status.busy":"2023-06-28T02:53:29.037722Z","iopub.status.idle":"2023-06-28T04:54:29.777596Z","shell.execute_reply":"2023-06-28T04:54:29.776341Z","shell.execute_reply.started":"2023-06-28T02:53:29.038691Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["[    3     6    17 ... 
20845 20850 20855]\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"f0f58e6b09914620903bd80dae3eca59","version_major":2,"version_minor":0},"text/plain":["Downloading (…)lve/main/config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"2b7bfe1d5b504e8f8ec03f19cd7bfba5","version_major":2,"version_minor":0},"text/plain":["Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["Some weights of the model checkpoint at nguyenvulebinh/vi-mrc-base were not used when initializing RobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']\n","- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n","- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n","Some weights of RobertaModel were not initialized from the model checkpoint at nguyenvulebinh/vi-mrc-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n","You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n","/opt/conda/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n","  warnings.warn(\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"3a999992c57e44838063b744088409f8","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/5 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9280047017337644\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9665513264129183\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9868613138686132\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9889341875364007\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9926921952645424\n"]}],"source":[
"from tqdm.auto import tqdm\n",
"\n",
"loss_fn = nn.BCEWithLogitsLoss()\n",
"epochs = 5\n",
"accumulation_steps = 8\n",
"# Use the GPU when one is visible and fall back to the CPU otherwise, instead of failing on .cuda().\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"for fold, (train_index, test_index) in enumerate(kfold.split(df, df.label)):\n",
"    # Only the first fold is run.\n",
"    if fold != 0:\n",
"        break\n",
"    print(test_index)\n",
"    model = PairwiseModel('nguyenvulebinh/vi-mrc-base')\n",
"    model.to(device)\n",
"    # NOTE: the model is fitted on the full frame, which contains the rows of val_df, so the F1 printed\n",
"    # after each epoch is not a held-out estimate. Use df.iloc[train_index].reset_index(drop=True)\n",
"    # here to train on the fold's training split only.\n",
"    train_df = df\n",
"    val_df = df.iloc[test_index].reset_index(drop=True)\n",
"\n",
"    train_dataset = SiameseDataset(train_df, tokenizer, 384)\n",
"    valid_dataset = SiameseDataset(val_df, tokenizer, 384)\n",
"    train_loader = DataLoader(train_dataset, batch_size=4, collate_fn=collate_fn,\n",
"                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)\n",
"    valid_loader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn,\n",
"                              num_workers=2, shuffle=False, pin_memory=True)\n",
"\n",
"    num_train_steps = len(train_loader) * epochs // accumulation_steps\n",
"    optimizer, scheduler = optimizer_scheduler(model, num_train_steps)\n",
"\n",
"    for epoch in tqdm(range(epochs)):\n",
"        model.train()\n",
"        bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)\n",
"        for step, data in bar:\n",
"            ids = data[\"ids\"].to(device)\n",
"            masks = data[\"masks\"].to(device)\n",
"            target = data[\"target\"].to(device)\n",
"            preds = model(ids, masks)\n",
"            # Divide by accumulation_steps so the gradients summed over that many batches form an average.\n",
"            loss = loss_fn(preds.view(-1), target.view(-1)) / accumulation_steps\n",
"            loss.backward()\n",
"            # NOTE: step restarts at 0 every epoch, so when len(train_loader) is not a multiple of\n",
"            # accumulation_steps the trailing batches' gradients carry over into the next epoch's first update.\n",
"            if (step + 1) % accumulation_steps == 0:\n",
"                optimizer.step()\n",
"                optimizer.zero_grad()\n",
"                scheduler.step()\n",
"            bar.set_postfix(loss=loss.item())\n",
"\n",
"        model.eval()\n",
"        with torch.no_grad():\n",
"            bar = tqdm(enumerate(valid_loader), total=len(valid_loader), leave=False)\n",
"            targets = []\n",
"            all_preds = []\n",
"            for step, data in bar:\n",
"                ids = data[\"ids\"].to(device)\n",
"                masks = data[\"masks\"].to(device)\n",
"                preds = torch.sigmoid(model(ids, masks))\n",
"                all_preds.extend(preds.cpu().view(-1).numpy())\n",
"                targets.extend(data[\"target\"].view(-1).numpy())\n",
"            all_preds = np.array(all_preds)\n",
"            targets = np.array(targets)\n",
"            print(f\"F1 {f1_score(targets, all_preds > 0.5)}\")"]},
{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T04:54:29.780731Z","iopub.status.busy":"2023-06-28T04:54:29.779924Z","iopub.status.idle":"2023-06-28T04:54:29.791869Z","shell.execute_reply":"2023-06-28T04:54:29.790613Z","shell.execute_reply.started":"2023-06-28T04:54:29.780691Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Recall 0.9953106682297772\n"]}],"source":[
"# Recall of the last epoch's validation predictions at the 0.5 threshold.\n",
"print(f\"Recall {recall_score(np.array(targets), np.array(all_preds) > 0.5)}\")"]},
{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T04:54:29.796171Z","iopub.status.busy":"2023-06-28T04:54:29.795752Z","iopub.status.idle":"2023-06-28T04:54:31.811461Z","shell.execute_reply":"2023-06-28T04:54:31.810427Z","shell.execute_reply.started":"2023-06-28T04:54:29.796143Z"},"trusted":true},"outputs":[],"source":["torch.save(model.state_dict(), f\"/kaggle/working/pairwise_v2.bin\")\n"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"}},"nbformat":4,"nbformat_minor":4}