{"cells":[{"attachments":{},"cell_type":"markdown","id":"770bfcdb","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/code/noobhocai/0-3-create-stage2-ranking"]},{"cell_type":"code","execution_count":2,"id":"646a64db","metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:45:51.815046Z","iopub.status.busy":"2023-06-19T04:45:51.814713Z","iopub.status.idle":"2023-06-19T04:47:17.791641Z","shell.execute_reply":"2023-06-19T04:47:17.790706Z","shell.execute_reply.started":"2023-06-19T04:45:51.815020Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Collecting pyspark\n"," Downloading pyspark-3.4.0.tar.gz (310.8 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.8/310.8 MB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n","\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n","\u001b[?25hRequirement already satisfied: py4j==0.10.9.7 in /opt/conda/lib/python3.10/site-packages (from pyspark) (0.10.9.7)\n","Building wheels for collected packages: pyspark\n"," Building wheel for pyspark (setup.py) ... \u001b[?25ldone\n","\u001b[?25h Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317146 sha256=932384efffaa19a196f9bb66e53de64eb278df10f2ddf3186116516b1fff67b5\n"," Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327\n","Successfully built pyspark\n","Installing collected packages: pyspark\n","Successfully installed pyspark-3.4.0\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mRequirement already satisfied: pyarrow in /opt/conda/lib/python3.10/site-packages (9.0.0)\n","Requirement already satisfied: numpy>=1.16.6 in /opt/conda/lib/python3.10/site-packages (from pyarrow) (1.23.5)\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mRequirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.2.4)\n","Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from nltk) (1.16.0)\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mCollecting tdqm\n"," Downloading tdqm-0.0.1.tar.gz (1.4 kB)\n"," Preparing metadata (setup.py) ... \u001b[?25ldone\n","\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from tdqm) (4.64.1)\n","Building wheels for collected packages: tdqm\n"," Building wheel for tdqm (setup.py) ... \u001b[?25ldone\n","\u001b[?25h Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1322 sha256=98dae54af0bf0a85639c88412f77b6a49d485bbc930a83a31edfb2dbc709a9ff\n"," Stored in directory: /root/.cache/pip/wheels/37/31/b8/7b711038035720ba0df14376af06e5e76b9bd61759c861ad92\n","Successfully built tdqm\n","Installing collected packages: tdqm\n","Successfully installed tdqm-0.0.1\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mCollecting rank_bm25\n"," Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)\n","Requirement already satisfied: numpy in /opt/conda/lib/python3.10/site-packages (from rank_bm25) (1.23.5)\n","Installing collected packages: rank_bm25\n","Successfully installed rank_bm25-0.2.2\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mCollecting pyvi\n"," Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.5/8.5 MB\u001b[0m \u001b[31m57.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n","\u001b[?25hRequirement already satisfied: scikit-learn in /opt/conda/lib/python3.10/site-packages (from pyvi) (1.2.2)\n","Collecting sklearn-crfsuite (from pyvi)\n"," Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)\n","Requirement already satisfied: numpy>=1.17.3 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (1.23.5)\n","Requirement already satisfied: scipy>=1.3.2 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (1.10.1)\n","Requirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (1.2.0)\n","Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (3.1.0)\n","Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite->pyvi)\n"," Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m993.5/993.5 kB\u001b[0m \u001b[31m55.0 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from sklearn-crfsuite->pyvi) (1.16.0)\n","Requirement already satisfied: tabulate in /opt/conda/lib/python3.10/site-packages (from sklearn-crfsuite->pyvi) (0.9.0)\n","Requirement already satisfied: tqdm>=2.0 in /opt/conda/lib/python3.10/site-packages (from sklearn-crfsuite->pyvi) (4.64.1)\n","Installing collected packages: python-crfsuite, sklearn-crfsuite, pyvi\n","Successfully installed python-crfsuite-0.9.9 pyvi-0.1.1 sklearn-crfsuite-0.3.6\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install pyspark\n","!pip install pyarrow\n","!pip install nltk\n","!pip install tqdm\n","!pip install rank_bm25\n","# !pip install pandarallel\n","!pip install pyvi"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:17.794768Z","iopub.status.busy":"2023-06-19T04:47:17.794437Z","iopub.status.idle":"2023-06-19T04:47:17.869550Z","shell.execute_reply":"2023-06-19T04:47:17.868392Z","shell.execute_reply.started":"2023-06-19T04:47:17.794729Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["3.4.0\n"]}],"source":["import pyspark\n","print(pyspark.__version__)"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:17.873676Z","iopub.status.busy":"2023-06-19T04:47:17.873324Z","iopub.status.idle":"2023-06-19T04:47:25.790613Z","shell.execute_reply":"2023-06-19T04:47:25.789712Z","shell.execute_reply.started":"2023-06-19T04:47:17.873647Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 
'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n"," warnings.warn(\n"]},{"name":"stdout","output_type":"stream","text":["[nltk_data] Downloading package punkt to /usr/share/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n"]},{"name":"stderr","output_type":"stream","text":["Setting default log level to \"WARN\".\n","To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n","23/06/19 04:47:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"]},{"data":{"text/html":["\n","
SparkSession - in-memory
\n"," \n","SparkContext
\n","\n"," \n","\n","v3.4.0
local[*]
Pyspark
\n"," | question | \n","answer | \n","title | \n","candidate | \n","label | \n","group | \n","
---|---|---|---|---|---|---|
0 | \n","Đất nước nào không có quân đội | \n","Costa Rica, Iceland, Panama, Micronesia, Quần ... | \n","Costa Rica | \n","Costa Rica Costa Rica (Phiên âm: Cô-xta Ri-ca)... | \n","0 | \n","0 | \n","
1 | \n","Đất nước nào không có quân đội | \n","Costa Rica, Iceland, Panama, Micronesia, Quần ... | \n","Quần đảo Marshall | \n","Quần đảo Marshall Quần đảo Marshall, tên chính... | \n","0 | \n","0 | \n","
2 | \n","Đất nước nào không có quân đội | \n","Costa Rica, Iceland, Panama, Micronesia, Quần ... | \n","Montezuma, Costa Rica | \n","Montezuma, Costa Rica Montezuma là một thị xã ... | \n","0 | \n","0 | \n","
3 | \n","Đất nước nào không có quân đội | \n","Costa Rica, Iceland, Panama, Micronesia, Quần ... | \n","Tamarindo, Costa Rica | \n","Tamarindo, Costa Rica Tamarindo là một thị xã ... | \n","0 | \n","0 | \n","
4 | \n","Đất nước nào không có quân đội | \n","Costa Rica, Iceland, Panama, Micronesia, Quần ... | \n","Micronesia | \n","Micronesia Micronesia (, ), còn gọi là Tiểu Đả... | \n","0 | \n","0 | \n","