{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/noobhocai/train-pairwise-stage2"]},
{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:34:51.417747Z","iopub.status.busy":"2023-06-29T01:34:51.417032Z","iopub.status.idle":"2023-06-29T01:35:11.748112Z","shell.execute_reply":"2023-06-29T01:35:11.746819Z","shell.execute_reply.started":"2023-06-29T01:34:51.417706Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"]}],"source":["# Install the extra dependencies of this notebook.\n","# %pip (rather than !pip) installs into the environment of the running kernel.\n","%pip install -q sentence_transformers pyvi\n","# !pip install --upgrade tensorflow-io==0.32.0"]},
{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:11.750592Z","iopub.status.busy":"2023-06-29T01:35:11.750272Z","iopub.status.idle":"2023-06-29T01:35:11.755063Z","shell.execute_reply":"2023-06-29T01:35:11.754177Z","shell.execute_reply.started":"2023-06-29T01:35:11.750563Z"},"trusted":true},"outputs":[],"source":["# !pip install gsutil -q\n","# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py\n","# !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 
libopenblas-dev"]},
{"cell_type":"code","execution_count":3,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2023-06-29T01:35:11.760109Z","iopub.status.busy":"2023-06-29T01:35:11.759784Z","iopub.status.idle":"2023-06-29T01:35:46.341332Z","shell.execute_reply":"2023-06-29T01:35:46.339954Z","shell.execute_reply.started":"2023-06-29T01:35:11.760071Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n"," from .autonotebook import tqdm as notebook_tqdm\n"]}],"source":["import os\n","import pandas as pd\n","from transformers import AutoModel, AutoTokenizer\n","import torch\n","from torch.utils.data import DataLoader\n","from sklearn.metrics.pairwise import cosine_similarity\n","import numpy as np\n","from pyvi.ViTokenizer import tokenize\n","from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup\n","from transformers import DataCollatorWithPadding\n","from scipy.stats import pearsonr, spearmanr\n","import math\n","from sklearn.metrics import *\n","from sklearn.model_selection import GroupKFold, KFold"]},
{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:46.343395Z","iopub.status.busy":"2023-06-29T01:35:46.342831Z","iopub.status.idle":"2023-06-29T01:35:46.348021Z","shell.execute_reply":"2023-06-29T01:35:46.347052Z","shell.execute_reply.started":"2023-06-29T01:35:46.343363Z"},"trusted":true},"outputs":[],"source":["# Hugging Face access token, read from the environment so that no secret is stored in the notebook.\n","# SECURITY: the token that used to be hardcoded in this cell has been exposed and should be revoked.\n","# When HF_TOKEN is unset this is None, i.e. no explicit token is passed to from_pretrained below.\n","AUTH_TOKEN = os.environ.get(\"HF_TOKEN\")"]},
{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:46.349601Z","iopub.status.busy":"2023-06-29T01:35:46.349307Z","iopub.status.idle":"2023-06-29T01:35:47.904138Z","shell.execute_reply":"2023-06-29T01:35:47.903274Z","shell.execute_reply.started":"2023-06-29T01:35:46.349575Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["Downloading (…)okenizer_config.json: 100%|██████████| 398/398 [00:00<00:00, 47.5kB/s]\n","Downloading (…)/main/tokenizer.json: 100%|██████████| 9.08M/9.08M [00:00<00:00, 70.8MB/s]\n","Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 133kB/s]\n"]},{"name":"stdout","output_type":"stream","text":[" sinh viên đại học khoa học tự nhiên\n"]}],"source":["tokenizer = AutoTokenizer.from_pretrained('nguyenvulebinh/vi-mrc-base', use_auth_token=AUTH_TOKEN)\n","print(tokenizer.decode(tokenizer.encode(\"sinh viên đại học khoa học tự nhiên \")))"]},
{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:47.905567Z","iopub.status.busy":"2023-06-29T01:35:47.905232Z","iopub.status.idle":"2023-06-29T01:35:47.909874Z","shell.execute_reply":"2023-06-29T01:35:47.909125Z","shell.execute_reply.started":"2023-06-29T01:35:47.905538Z"},"trusted":true},"outputs":[],"source":["import os\n","os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""]},
{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:47.911132Z","iopub.status.busy":"2023-06-29T01:35:47.910848Z","iopub.status.idle":"2023-06-29T01:35:58.020338Z","shell.execute_reply":"2023-06-29T01:35:58.019355Z","shell.execute_reply.started":"2023-06-29T01:35:47.911107Z"},"trusted":true},"outputs":[],"source":["df = pd.read_csv(\"/kaggle/input/e2eqa-wiki-zalo-ai/processed/train_stage2_ranking.csv\")"]},
{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-29T01:35:58.021880Z","iopub.status.busy":"2023-06-29T01:35:58.021545Z","iopub.status.idle":"2023-06-29T01:35:58.046072Z","shell.execute_reply":"2023-06-29T01:35:58.045164Z","shell.execute_reply.started":"2023-06-29T01:35:58.021853Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n"," | question | \n","answer | \n","title | \n","candidate | \n","label | \n","group | \n","
---|---|---|---|---|---|---|
0 | \n","Đất nước nào không có quân đội | \n","Costa Rica, Iceland, Panama, Micronesia, Quần ... | \n","Costa Rica | \n","Costa Rica Costa Rica (Phiên âm: Cô-xta Ri-ca)... | \n","0 | \n","0 | \n","
1 | \n","Đất nước nào không có quân đội | \n","Costa Rica, Iceland, Panama, Micronesia, Quần ... | \n","Quần đảo Marshall | \n","Quần đảo Marshall Quần đảo Marshall, tên chính... | \n","0 | \n","0 | \n","
2 | \n","Đất nước nào không có quân đội | \n","Costa Rica, Iceland, Panama, Micronesia, Quần ... | \n","Montezuma, Costa Rica | \n","Montezuma, Costa Rica Montezuma là một thị xã ... | \n","0 | \n","0 | \n","
3 | \n","Đất nước nào không có quân đội | \n","Costa Rica, Iceland, Panama, Micronesia, Quần ... | \n","Tamarindo, Costa Rica | \n","Tamarindo, Costa Rica Tamarindo là một thị xã ... | \n","0 | \n","0 | \n","
4 | \n","Đất nước nào không có quân đội | \n","Costa Rica, Iceland, Panama, Micronesia, Quần ... | \n","Micronesia | \n","Micronesia Micronesia (, ), còn gọi là Tiểu Đả... | \n","0 | \n","0 | \n","
... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","
46075 | \n","trong thần thoại hy lạp vị thần tình yêu có tê... | \n","Eros | \n","Eros phaleratus | \n","Eros phaleratus Eros phaleratus là một loài bọ... | \n","0 | \n","4607 | \n","
46076 | \n","trong thần thoại hy lạp vị thần tình yêu có tê... | \n","Eros | \n","Eros melanurus | \n","Eros melanurus Eros melanurus là một loài bọ c... | \n","0 | \n","4607 | \n","
46077 | \n","trong thần thoại hy lạp vị thần tình yêu có tê... | \n","Eros | \n","Eros melanopterus | \n","Eros melanopterus Eros melanopterus là một loà... | \n","0 | \n","4607 | \n","
46078 | \n","trong thần thoại hy lạp vị thần tình yêu có tê... | \n","Eros | \n","Eros humeralis | \n","Eros humeralis Eros humeralis là một loài bọ c... | \n","0 | \n","4607 | \n","
46079 | \n","trong thần thoại hy lạp vị thần tình yêu có tê... | \n","Eros | \n","Eros patruelis | \n","Eros patruelis Eros patruelis là một loài bọ c... | \n","0 | \n","4607 | \n","
46080 rows × 6 columns
\n","