{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/noobhocai/train-pairwise-stage1"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:52:30.450145Z","iopub.status.busy":"2023-06-28T02:52:30.449601Z","iopub.status.idle":"2023-06-28T02:52:48.306606Z","shell.execute_reply":"2023-06-28T02:52:48.305298Z","shell.execute_reply.started":"2023-06-28T02:52:30.450111Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install sentence_transformers pyvi -q"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:52:48.310777Z","iopub.status.busy":"2023-06-28T02:52:48.309711Z","iopub.status.idle":"2023-06-28T02:53:01.669646Z","shell.execute_reply":"2023-06-28T02:53:01.668372Z","shell.execute_reply.started":"2023-06-28T02:52:48.310744Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Collecting tensorflow-io==0.32.0\n"," Downloading tensorflow_io-0.32.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (28.0 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m28.0/28.0 MB\u001b[0m \u001b[31m40.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n","\u001b[?25hCollecting tensorflow-io-gcs-filesystem==0.32.0 (from tensorflow-io==0.32.0)\n"," Downloading tensorflow_io_gcs_filesystem-0.32.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.4 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m58.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n","\u001b[?25hInstalling collected packages: tensorflow-io-gcs-filesystem, tensorflow-io\n"," Attempting uninstall: tensorflow-io-gcs-filesystem\n"," Found existing installation: tensorflow-io-gcs-filesystem 0.31.0\n"," Uninstalling tensorflow-io-gcs-filesystem-0.31.0:\n"," Successfully uninstalled tensorflow-io-gcs-filesystem-0.31.0\n"," Attempting uninstall: tensorflow-io\n"," Found existing installation: tensorflow-io 0.31.0\n"," Uninstalling tensorflow-io-0.31.0:\n"," Successfully uninstalled tensorflow-io-0.31.0\n","Successfully installed tensorflow-io-0.32.0 tensorflow-io-gcs-filesystem-0.32.0\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install --upgrade tensorflow-io==0.32.0"]},{"cell_type":"code","execution_count":3,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2023-06-28T02:53:01.672246Z","iopub.status.busy":"2023-06-28T02:53:01.671610Z","iopub.status.idle":"2023-06-28T02:53:15.771561Z","shell.execute_reply":"2023-06-28T02:53:15.770593Z","shell.execute_reply.started":"2023-06-28T02:53:01.672210Z"},"trusted":true},"outputs":[],"source":["import os\n","import pandas as pd\n","import json\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","from transformers import AutoModel, AutoTokenizer\n","import torch\n","from torch.utils.data import DataLoader\n","import pandas as pd\n","from sklearn.metrics.pairwise import cosine_similarity\n","import numpy as np\n","from sentence_transformers import SentenceTransformer\n","from pyvi.ViTokenizer import tokenize\n","from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup\n","from transformers import DataCollatorWithPadding\n","from scipy.stats import pearsonr, spearmanr\n","import math\n","from sklearn.metrics import *"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:15.775041Z","iopub.status.busy":"2023-06-28T02:53:15.774278Z","iopub.status.idle":"2023-06-28T02:53:15.780313Z","shell.execute_reply":"2023-06-28T02:53:15.778919Z","shell.execute_reply.started":"2023-06-28T02:53:15.774985Z"},"trusted":true},"outputs":[],"source":["AUTH_TOKEN = \"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\""]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:15.782302Z","iopub.status.busy":"2023-06-28T02:53:15.781713Z","iopub.status.idle":"2023-06-28T02:53:18.884084Z","shell.execute_reply":"2023-06-28T02:53:18.882908Z","shell.execute_reply.started":"2023-06-28T02:53:15.782268Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"6118765cb5854ac7b8f4c68eb0c654a2","version_major":2,"version_minor":0},"text/plain":["Downloading (…)okenizer_config.json: 0%| | 0.00/398 [00:00 sinh viên đại học khoa học tự nhiên\n"]}],"source":["tokenizer = AutoTokenizer.from_pretrained('nguyenvulebinh/vi-mrc-base', use_auth_token=AUTH_TOKEN)\n","print(tokenizer.decode(tokenizer.encode(\"sinh viên đại học khoa học tự nhiên\")))"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:18.886122Z","iopub.status.busy":"2023-06-28T02:53:18.885497Z","iopub.status.idle":"2023-06-28T02:53:18.897285Z","shell.execute_reply":"2023-06-28T02:53:18.896274Z","shell.execute_reply.started":"2023-06-28T02:53:18.886088Z"},"trusted":true},"outputs":[],"source":["import json \n","from glob import glob \n","import re \n","from nltk import word_tokenize as lib_tokenizer \n"," \n","dict_map = dict({}) \n"," \n","def word_tokenize(text): \n"," global dict_map \n"," words = text.split() \n"," words_norm = [] \n"," for w in words: \n"," if dict_map.get(w, None) is None: \n"," dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"') \n"," words_norm.append(dict_map[w]) \n"," return words_norm \n"," \n","def strip_answer_string(text): \n"," text = text.strip() \n"," while text[-1] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] != '(' and text[-1] == ')' and '(' in text: \n"," break \n"," if text[-1] == '\"' and text[0] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[:-1].strip() \n"," while text[0] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] == '\"' and text[-1] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[1:].strip() \n"," text = text.strip() \n"," return text \n"," \n","def strip_context(text): \n"," text = text.replace('\\n', ' ') \n"," text = re.sub(r'\\s+', ' ', text) \n"," text = text.strip() \n"," return text"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:18.899048Z","iopub.status.busy":"2023-06-28T02:53:18.898414Z","iopub.status.idle":"2023-06-28T02:53:28.963083Z","shell.execute_reply":"2023-06-28T02:53:28.962126Z","shell.execute_reply.started":"2023-06-28T02:53:18.899002Z"},"trusted":true},"outputs":[],"source":["train1 = pd.read_csv(\"/kaggle/input/e2eqa-wiki-zalo-ai/processed/train_stage1_ranking.csv\")\n","train1['text'] = train1['text'].apply(lambda x: \" \".join(word_tokenize(strip_context(x))))\n","train1['question'] = train1['question'].apply(lambda x: \" \".join(word_tokenize(strip_context(x))))\n","df = train1"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.965037Z","iopub.status.busy":"2023-06-28T02:53:28.964656Z","iopub.status.idle":"2023-06-28T02:53:28.976243Z","shell.execute_reply":"2023-06-28T02:53:28.974943Z","shell.execute_reply.started":"2023-06-28T02:53:28.964989Z"},"trusted":true},"outputs":[],"source":["import torch.nn as nn\n","from transformers import AutoModel, AutoConfig\n","\n","class MeanPooling(nn.Module):\n"," def __init__(self):\n"," super(MeanPooling, self).__init__()\n","\n"," def forward(self, last_hidden_state, attention_mask):\n"," input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()\n"," sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)\n"," sum_mask = input_mask_expanded.sum(1)\n"," sum_mask = torch.clamp(sum_mask, min=1e-9)\n"," mean_embeddings = sum_embeddings / sum_mask\n"," return mean_embeddings\n","\n","class PairwiseModel(nn.Module):\n"," def __init__(self, model_name):\n"," super(PairwiseModel, self).__init__()\n"," self.model = AutoModel.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)\n"," self.config = AutoConfig.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)\n"," self.drop = nn.Dropout(p=0.2)\n"," self.fc = nn.Linear(768, 1)\n"," \n"," def forward(self, ids, masks):\n"," out = self.model(input_ids=ids,\n"," attention_mask=masks,\n"," output_hidden_states=False).last_hidden_state\n"," out = out[:,0]\n"," outputs = self.fc(out)\n"," return outputs"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.978164Z","iopub.status.busy":"2023-06-28T02:53:28.977797Z","iopub.status.idle":"2023-06-28T02:53:28.990859Z","shell.execute_reply":"2023-06-28T02:53:28.989964Z","shell.execute_reply.started":"2023-06-28T02:53:28.978133Z"},"trusted":true},"outputs":[],"source":["from torch.utils.data import Dataset\n","\n","class SiameseDataset(Dataset):\n","\n"," def __init__(self, df, tokenizer, max_length):\n"," self.df = df\n"," self.max_length = max_length\n"," self.tokenizer = tokenizer\n"," self.content1 = tokenizer.batch_encode_plus(list(df.question.apply(lambda x: x.replace(\"_\",\" \")).values), max_length=max_length, truncation=True)[\"input_ids\"]\n"," self.content2 = tokenizer.batch_encode_plus(list(df.text.apply(lambda x: x.replace(\"_\",\" \")).values), max_length=max_length, truncation=True)[\"input_ids\"]\n"," self.targets = self.df.label\n"," \n"," def __len__(self):\n"," return len(self.df)\n","\n"," def __getitem__(self, index):\n"," return {\n"," 'ids1': torch.tensor(self.content1[index], dtype=torch.long),\n"," 'ids2': torch.tensor(self.content2[index][1:], dtype=torch.long),\n"," 'target': torch.tensor(self.targets[index], dtype=torch.float)\n"," }"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:28.995179Z","iopub.status.busy":"2023-06-28T02:53:28.994892Z","iopub.status.idle":"2023-06-28T02:53:29.004203Z","shell.execute_reply":"2023-06-28T02:53:29.003189Z","shell.execute_reply.started":"2023-06-28T02:53:28.995156Z"},"trusted":true},"outputs":[],"source":["pad_token_id = tokenizer.pad_token_id\n","def collate_fn(batch):\n"," ids = [torch.cat([x[\"ids1\"], x[\"ids2\"]]) for x in batch]\n"," targets = [x[\"target\"] for x in batch]\n"," max_len = np.max([len(x) for x in ids])\n"," masks = []\n"," for i in range(len(ids)):\n"," if len(ids[i]) < max_len:\n"," ids[i]= torch.cat((ids[i], torch.tensor([pad_token_id,]*(max_len - len(ids[i])),dtype=torch.long)))\n"," masks.append(ids[i] != pad_token_id)\n"," # print(tokenizer.decode(ids[0]))\n"," outputs = {\n"," \"ids\": torch.vstack(ids),\n"," \"masks\": torch.vstack(masks),\n"," \"target\": torch.vstack(targets).view(-1)\n"," }\n"," return outputs"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.006085Z","iopub.status.busy":"2023-06-28T02:53:29.005583Z","iopub.status.idle":"2023-06-28T02:53:29.013002Z","shell.execute_reply":"2023-06-28T02:53:29.011919Z","shell.execute_reply.started":"2023-06-28T02:53:29.006052Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import GroupKFold, KFold"]},{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.014999Z","iopub.status.busy":"2023-06-28T02:53:29.014493Z","iopub.status.idle":"2023-06-28T02:53:29.023324Z","shell.execute_reply":"2023-06-28T02:53:29.022211Z","shell.execute_reply.started":"2023-06-28T02:53:29.014969Z"},"trusted":true},"outputs":[],"source":["def optimizer_scheduler(model, num_train_steps):\n"," param_optimizer = list(model.named_parameters())\n"," no_decay = [\"bias\", \"LayerNorm.weight\"]\n"," optimizer_parameters = [\n"," {\n"," \"params\": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],\n"," \"weight_decay\": 0.001,\n"," },\n"," {\n"," \"params\": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],\n"," \"weight_decay\": 0.0,\n"," },\n"," ]\n","\n"," opt = AdamW(optimizer_parameters, lr=3e-5)\n"," sch = get_linear_schedule_with_warmup(\n"," opt,\n"," num_warmup_steps=int(0.05*num_train_steps),\n"," num_training_steps=num_train_steps,\n"," last_epoch=-1,\n"," )\n"," return opt, sch"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.025531Z","iopub.status.busy":"2023-06-28T02:53:29.025091Z","iopub.status.idle":"2023-06-28T02:53:29.036112Z","shell.execute_reply":"2023-06-28T02:53:29.034921Z","shell.execute_reply.started":"2023-06-28T02:53:29.025496Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import KFold\n","kfold = KFold(n_splits=5, shuffle=True, random_state=42)"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T02:53:29.038723Z","iopub.status.busy":"2023-06-28T02:53:29.037722Z","iopub.status.idle":"2023-06-28T04:54:29.777596Z","shell.execute_reply":"2023-06-28T04:54:29.776341Z","shell.execute_reply.started":"2023-06-28T02:53:29.038691Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["[ 3 6 17 ... 20845 20850 20855]\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"f0f58e6b09914620903bd80dae3eca59","version_major":2,"version_minor":0},"text/plain":["Downloading (…)lve/main/config.json: 0%| | 0.00/688 [00:00 0.5)}\")"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T04:54:29.780731Z","iopub.status.busy":"2023-06-28T04:54:29.779924Z","iopub.status.idle":"2023-06-28T04:54:29.791869Z","shell.execute_reply":"2023-06-28T04:54:29.790613Z","shell.execute_reply.started":"2023-06-28T04:54:29.780691Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["F1 0.9953106682297772\n"]}],"source":["print(f\"F1 {recall_score(np.array(targets), np.array(all_preds) > 0.5)}\")"]},{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2023-06-28T04:54:29.796171Z","iopub.status.busy":"2023-06-28T04:54:29.795752Z","iopub.status.idle":"2023-06-28T04:54:31.811461Z","shell.execute_reply":"2023-06-28T04:54:31.810427Z","shell.execute_reply.started":"2023-06-28T04:54:29.796143Z"},"trusted":true},"outputs":[],"source":["torch.save(model.state_dict(), f\"/kaggle/working/pairwise_v2.bin\")\n"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"}},"nbformat":4,"nbformat_minor":4}