Spaces:
Build error
Build error
File size: 31,521 Bytes
c5d2283 |
1 2 |
{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/noobhocai/train-pairwise-stage1"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:52:30.450145Z","iopub.status.busy":"2023-06-28T02:52:30.449601Z","iopub.status.idle":"2023-06-28T02:52:48.306606Z","shell.execute_reply":"2023-06-28T02:52:48.305298Z","shell.execute_reply.started":"2023-06-28T02:52:30.450111Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install sentence_transformers pyvi -q"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:52:48.310777Z","iopub.status.busy":"2023-06-28T02:52:48.309711Z","iopub.status.idle":"2023-06-28T02:53:01.669646Z","shell.execute_reply":"2023-06-28T02:53:01.668372Z","shell.execute_reply.started":"2023-06-28T02:52:48.310744Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Collecting tensorflow-io==0.32.0\n"," Downloading tensorflow_io-0.32.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (28.0 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m28.0/28.0 MB\u001b[0m \u001b[31m40.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n","\u001b[?25hCollecting tensorflow-io-gcs-filesystem==0.32.0 (from tensorflow-io==0.32.0)\n"," Downloading tensorflow_io_gcs_filesystem-0.32.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.4 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m58.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n","\u001b[?25hInstalling collected packages: tensorflow-io-gcs-filesystem, tensorflow-io\n"," Attempting uninstall: tensorflow-io-gcs-filesystem\n"," Found existing installation: tensorflow-io-gcs-filesystem 0.31.0\n"," Uninstalling tensorflow-io-gcs-filesystem-0.31.0:\n"," Successfully uninstalled tensorflow-io-gcs-filesystem-0.31.0\n"," Attempting uninstall: tensorflow-io\n"," Found existing installation: tensorflow-io 0.31.0\n"," Uninstalling tensorflow-io-0.31.0:\n"," Successfully uninstalled tensorflow-io-0.31.0\n","Successfully installed tensorflow-io-0.32.0 tensorflow-io-gcs-filesystem-0.32.0\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install --upgrade tensorflow-io==0.32.0"]},{"cell_type":"code","execution_count":3,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2023-06-28T02:53:01.672246Z","iopub.status.busy":"2023-06-28T02:53:01.671610Z","iopub.status.idle":"2023-06-28T02:53:15.771561Z","shell.execute_reply":"2023-06-28T02:53:15.770593Z","shell.execute_reply.started":"2023-06-28T02:53:01.672210Z"},"trusted":true},"outputs":[],"source":["import os\n","import pandas as pd\n","import json\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","from transformers import AutoModel, AutoTokenizer\n","import torch\n","from torch.utils.data import DataLoader\n","import pandas as pd\n","from sklearn.metrics.pairwise import cosine_similarity\n","import numpy as np\n","from sentence_transformers import SentenceTransformer\n","from pyvi.ViTokenizer import tokenize\n","from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup\n","from transformers import DataCollatorWithPadding\n","from scipy.stats import pearsonr, spearmanr\n","import math\n","from sklearn.metrics import *"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:15.775041Z","iopub.status.busy":"2023-06-28T02:53:15.774278Z","iopub.status.idle":"2023-06-28T02:53:15.780313Z","shell.execute_reply":"2023-06-28T02:53:15.778919Z","shell.execute_reply.started":"2023-06-28T02:53:15.774985Z"},"trusted":true},"outputs":[],"source":["AUTH_TOKEN = \"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\""]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:15.782302Z","iopub.status.busy":"2023-06-28T02:53:15.781713Z","iopub.status.idle":"2023-06-28T02:53:18.884084Z","shell.execute_reply":"2023-06-28T02:53:18.882908Z","shell.execute_reply.started":"2023-06-28T02:53:15.782268Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"6118765cb5854ac7b8f4c68eb0c654a2","version_major":2,"version_minor":0},"text/plain":["Downloading (…)okenizer_config.json: 0%| | 0.00/398 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"9d4cfe6e1330443eb6775a558657bbfb","version_major":2,"version_minor":0},"text/plain":["Downloading (…)/main/tokenizer.json: 0%| | 0.00/9.08M [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"cf5607811ef54e528e24d6a66eceb758","version_major":2,"version_minor":0},"text/plain":["Downloading (…)cial_tokens_map.json: 0%| | 0.00/239 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["<s> sinh viên đại học khoa học tự nhiên</s>\n"]}],"source":["tokenizer = AutoTokenizer.from_pretrained('nguyenvulebinh/vi-mrc-base', use_auth_token=AUTH_TOKEN)\n","print(tokenizer.decode(tokenizer.encode(\"sinh viên đại học khoa học tự nhiên\")))"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:18.886122Z","iopub.status.busy":"2023-06-28T02:53:18.885497Z","iopub.status.idle":"2023-06-28T02:53:18.897285Z","shell.execute_reply":"2023-06-28T02:53:18.896274Z","shell.execute_reply.started":"2023-06-28T02:53:18.886088Z"},"trusted":true},"outputs":[],"source":["import json \n","from glob import glob \n","import re \n","from nltk import word_tokenize as lib_tokenizer \n"," \n","dict_map = dict({}) \n"," \n","def word_tokenize(text): \n"," global dict_map \n"," words = text.split() \n"," words_norm = [] \n"," for w in words: \n"," if dict_map.get(w, None) is None: \n"," dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"') \n"," words_norm.append(dict_map[w]) \n"," return words_norm \n"," \n","def strip_answer_string(text): \n"," text = text.strip() \n"," while text[-1] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] != '(' and text[-1] == ')' and '(' in text: \n"," break \n"," if text[-1] == '\"' and text[0] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[:-1].strip() \n"," while text[0] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] == '\"' and text[-1] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[1:].strip() \n"," text = text.strip() \n"," return text \n"," \n","def strip_context(text): \n"," text = text.replace('\\n', ' ') \n"," text = re.sub(r'\\s+', ' ', text) \n"," text = text.strip() \n"," return text"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:18.899048Z","iopub.status.busy":"2023-06-28T02:53:18.898414Z","iopub.status.idle":"2023-06-28T02:53:28.963083Z","shell.execute_reply":"2023-06-28T02:53:28.962126Z","shell.execute_reply.started":"2023-06-28T02:53:18.899002Z"},"trusted":true},"outputs":[],"source":["train1 = pd.read_csv(\"/kaggle/input/e2eqa-wiki-zalo-ai/processed/train_stage1_ranking.csv\")\n","train1['text'] = train1['text'].apply(lambda x: \" \".join(word_tokenize(strip_context(x))))\n","train1['question'] = train1['question'].apply(lambda x: \" \".join(word_tokenize(strip_context(x))))\n","df = train1"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.965037Z","iopub.status.busy":"2023-06-28T02:53:28.964656Z","iopub.status.idle":"2023-06-28T02:53:28.976243Z","shell.execute_reply":"2023-06-28T02:53:28.974943Z","shell.execute_reply.started":"2023-06-28T02:53:28.964989Z"},"trusted":true},"outputs":[],"source":["import torch.nn as nn\n","from transformers import AutoModel, AutoConfig\n","\n","class MeanPooling(nn.Module):\n"," def __init__(self):\n"," super(MeanPooling, self).__init__()\n","\n"," def forward(self, last_hidden_state, attention_mask):\n"," input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()\n"," sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)\n"," sum_mask = input_mask_expanded.sum(1)\n"," sum_mask = torch.clamp(sum_mask, min=1e-9)\n"," mean_embeddings = sum_embeddings / sum_mask\n"," return mean_embeddings\n","\n","class PairwiseModel(nn.Module):\n"," def __init__(self, model_name):\n"," super(PairwiseModel, self).__init__()\n"," self.model = AutoModel.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)\n"," self.config = AutoConfig.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)\n"," self.drop = nn.Dropout(p=0.2)\n"," self.fc = nn.Linear(768, 1)\n"," \n"," def forward(self, ids, masks):\n"," out = self.model(input_ids=ids,\n"," attention_mask=masks,\n"," output_hidden_states=False).last_hidden_state\n"," out = out[:,0]\n"," outputs = self.fc(out)\n"," return outputs"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.978164Z","iopub.status.busy":"2023-06-28T02:53:28.977797Z","iopub.status.idle":"2023-06-28T02:53:28.990859Z","shell.execute_reply":"2023-06-28T02:53:28.989964Z","shell.execute_reply.started":"2023-06-28T02:53:28.978133Z"},"trusted":true},"outputs":[],"source":["from torch.utils.data import Dataset\n","\n","class SiameseDataset(Dataset):\n","\n"," def __init__(self, df, tokenizer, max_length):\n"," self.df = df\n"," self.max_length = max_length\n"," self.tokenizer = tokenizer\n"," self.content1 = tokenizer.batch_encode_plus(list(df.question.apply(lambda x: x.replace(\"_\",\" \")).values), max_length=max_length, truncation=True)[\"input_ids\"]\n"," self.content2 = tokenizer.batch_encode_plus(list(df.text.apply(lambda x: x.replace(\"_\",\" \")).values), max_length=max_length, truncation=True)[\"input_ids\"]\n"," self.targets = self.df.label\n"," \n"," def __len__(self):\n"," return len(self.df)\n","\n"," def __getitem__(self, index):\n"," return {\n"," 'ids1': torch.tensor(self.content1[index], dtype=torch.long),\n"," 'ids2': torch.tensor(self.content2[index][1:], dtype=torch.long),\n"," 'target': torch.tensor(self.targets[index], dtype=torch.float)\n"," }"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.995179Z","iopub.status.busy":"2023-06-28T02:53:28.994892Z","iopub.status.idle":"2023-06-28T02:53:29.004203Z","shell.execute_reply":"2023-06-28T02:53:29.003189Z","shell.execute_reply.started":"2023-06-28T02:53:28.995156Z"},"trusted":true},"outputs":[],"source":["pad_token_id = tokenizer.pad_token_id\n","def collate_fn(batch):\n"," ids = [torch.cat([x[\"ids1\"], x[\"ids2\"]]) for x in batch]\n"," targets = [x[\"target\"] for x in batch]\n"," max_len = np.max([len(x) for x in ids])\n"," masks = []\n"," for i in range(len(ids)):\n"," if len(ids[i]) < max_len:\n"," ids[i]= torch.cat((ids[i], torch.tensor([pad_token_id,]*(max_len - len(ids[i])),dtype=torch.long)))\n"," masks.append(ids[i] != pad_token_id)\n"," # print(tokenizer.decode(ids[0]))\n"," outputs = {\n"," \"ids\": torch.vstack(ids),\n"," \"masks\": torch.vstack(masks),\n"," \"target\": torch.vstack(targets).view(-1)\n"," }\n"," return outputs"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.006085Z","iopub.status.busy":"2023-06-28T02:53:29.005583Z","iopub.status.idle":"2023-06-28T02:53:29.013002Z","shell.execute_reply":"2023-06-28T02:53:29.011919Z","shell.execute_reply.started":"2023-06-28T02:53:29.006052Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import GroupKFold, KFold"]},{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.014999Z","iopub.status.busy":"2023-06-28T02:53:29.014493Z","iopub.status.idle":"2023-06-28T02:53:29.023324Z","shell.execute_reply":"2023-06-28T02:53:29.022211Z","shell.execute_reply.started":"2023-06-28T02:53:29.014969Z"},"trusted":true},"outputs":[],"source":["def optimizer_scheduler(model, num_train_steps):\n"," param_optimizer = list(model.named_parameters())\n"," no_decay = [\"bias\", \"LayerNorm.weight\"]\n"," optimizer_parameters = [\n"," {\n"," \"params\": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],\n"," \"weight_decay\": 0.001,\n"," },\n"," {\n"," \"params\": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],\n"," \"weight_decay\": 0.0,\n"," },\n"," ]\n","\n"," opt = AdamW(optimizer_parameters, lr=3e-5)\n"," sch = get_linear_schedule_with_warmup(\n"," opt,\n"," num_warmup_steps=int(0.05*num_train_steps),\n"," num_training_steps=num_train_steps,\n"," last_epoch=-1,\n"," )\n"," return opt, sch"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.025531Z","iopub.status.busy":"2023-06-28T02:53:29.025091Z","iopub.status.idle":"2023-06-28T02:53:29.036112Z","shell.execute_reply":"2023-06-28T02:53:29.034921Z","shell.execute_reply.started":"2023-06-28T02:53:29.025496Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import KFold\n","kfold = KFold(n_splits=5, shuffle=True, random_state=42)"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.038723Z","iopub.status.busy":"2023-06-28T02:53:29.037722Z","iopub.status.idle":"2023-06-28T04:54:29.777596Z","shell.execute_reply":"2023-06-28T04:54:29.776341Z","shell.execute_reply.started":"2023-06-28T02:53:29.038691Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["[ 3 6 17 ... 20845 20850 20855]\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"f0f58e6b09914620903bd80dae3eca59","version_major":2,"version_minor":0},"text/plain":["Downloading (…)lve/main/config.json: 0%| | 0.00/688 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"2b7bfe1d5b504e8f8ec03f19cd7bfba5","version_major":2,"version_minor":0},"text/plain":["Downloading pytorch_model.bin: 0%| | 0.00/1.11G [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["Some weights of the model checkpoint at nguyenvulebinh/vi-mrc-base were not used when initializing RobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']\n","- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n","- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n","Some weights of RobertaModel were not initialized from the model checkpoint at nguyenvulebinh/vi-mrc-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n","You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n","/opt/conda/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n"," warnings.warn(\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"3a999992c57e44838063b744088409f8","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9280047017337644\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9665513264129183\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9868613138686132\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9889341875364007\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/5214 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n","To disable this warning, you can either:\n","\t- Avoid using `tokenizers` before the fork if possible\n","\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/131 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["F1 0.9926921952645424\n"]}],"source":["from tqdm.auto import tqdm\n","loss_fn = nn.BCEWithLogitsLoss()\n","epochs = 5\n","accumulation_steps = 8\n","scaler = torch.cuda.amp.GradScaler()\n","error_ids = None\n","for fold, (train_index, test_index) in enumerate(kfold.split(df, df.label)):\n"," if fold != 0:\n"," break\n"," print(test_index)\n"," model = PairwiseModel('nguyenvulebinh/vi-mrc-base')\n"," # model.load_state_dict(torch.load(f\"./outputs/pairwise_v2.bin\"))\n"," model.cuda()\n"," train_df = df\n"," # train_df = df.iloc[train_index].reset_index(drop=True)\n"," val_df = df.iloc[test_index].reset_index(drop=True)\n"," \n"," train_dataset = SiameseDataset(train_df, tokenizer, 384)\n"," valid_dataset = SiameseDataset(val_df, tokenizer, 384)\n"," train_loader = DataLoader(train_dataset, batch_size=4, collate_fn=collate_fn,\n"," num_workers=2, shuffle=True, pin_memory=True, drop_last=True)\n"," valid_loader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn,\n"," num_workers=2, shuffle=False, pin_memory=True)\n"," \n"," num_train_steps = len(train_loader) * epochs // accumulation_steps\n"," optimizer, scheduler = optimizer_scheduler(model, num_train_steps)\n"," \n"," for epoch in tqdm(range(epochs)):\n"," model.train()\n"," bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)\n"," for step, data in bar:\n"," ids = data[\"ids\"].cuda()\n"," # for x in ids:\n"," # print(tokenizer.decode(x))\n"," masks = data[\"masks\"].cuda()\n"," target = data[\"target\"].cuda()\n"," # with torch.cuda.amp.autocast():\n"," preds = model(ids, masks)\n"," # print(preds.view(-1))\n"," loss = loss_fn(preds.view(-1), target.view(-1))\n"," loss /= accumulation_steps\n"," loss.backward()\n"," if (step + 1) % accumulation_steps == 0:\n"," optimizer.step()\n"," # scaler.update()\n"," optimizer.zero_grad()\n"," scheduler.step()\n"," bar.set_postfix(loss=loss.item())\n","\n"," model.eval()\n"," with torch.no_grad():\n"," bar = tqdm(enumerate(valid_loader), total=len(valid_loader), leave=False)\n"," targets = []\n"," all_preds = []\n"," for step, data in bar:\n"," ids = data[\"ids\"].cuda()\n"," masks = data[\"masks\"].cuda()\n"," target = data[\"target\"].cuda()\n"," preds = torch.sigmoid(model(ids, masks))\n"," all_preds.extend(preds.cpu().view(-1).numpy())\n"," targets.extend(target.cpu().view(-1).numpy())\n"," all_preds = np.array(all_preds)\n"," targets = np.array(targets)\n"," print(f\"F1 {f1_score(targets, all_preds > 0.5)}\")"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T04:54:29.780731Z","iopub.status.busy":"2023-06-28T04:54:29.779924Z","iopub.status.idle":"2023-06-28T04:54:29.791869Z","shell.execute_reply":"2023-06-28T04:54:29.790613Z","shell.execute_reply.started":"2023-06-28T04:54:29.780691Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["F1 0.9953106682297772\n"]}],"source":["print(f\"F1 {recall_score(np.array(targets), np.array(all_preds) > 0.5)}\")"]},{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T04:54:29.796171Z","iopub.status.busy":"2023-06-28T04:54:29.795752Z","iopub.status.idle":"2023-06-28T04:54:31.811461Z","shell.execute_reply":"2023-06-28T04:54:31.810427Z","shell.execute_reply.started":"2023-06-28T04:54:29.796143Z"},"trusted":true},"outputs":[],"source":["torch.save(model.state_dict(), f\"/kaggle/working/pairwise_v2.bin\")\n"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"}},"nbformat":4,"nbformat_minor":4}
|