Spaces:

foxxy-hm
/

e2eqa-wiki

Build error

File size: 43,870 Bytes

c5d2283

{"cells":[{"attachments":{},"cell_type":"markdown","id":"770bfcdb","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/code/noobhocai/0-3-create-stage2-ranking"]},{"cell_type":"code","execution_count":2,"id":"646a64db","metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:45:51.815046Z","iopub.status.busy":"2023-06-19T04:45:51.814713Z","iopub.status.idle":"2023-06-19T04:47:17.791641Z","shell.execute_reply":"2023-06-19T04:47:17.790706Z","shell.execute_reply.started":"2023-06-19T04:45:51.815020Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Collecting pyspark\n","  Downloading pyspark-3.4.0.tar.gz (310.8 MB)\n","\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.8/310.8 MB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n","\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n","\u001b[?25hRequirement already satisfied: py4j==0.10.9.7 in /opt/conda/lib/python3.10/site-packages (from pyspark) (0.10.9.7)\n","Building wheels for collected packages: pyspark\n","  Building wheel for pyspark (setup.py) ... \u001b[?25ldone\n","\u001b[?25h  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317146 sha256=932384efffaa19a196f9bb66e53de64eb278df10f2ddf3186116516b1fff67b5\n","  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327\n","Successfully built pyspark\n","Installing collected packages: pyspark\n","Successfully installed pyspark-3.4.0\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mRequirement already satisfied: pyarrow in /opt/conda/lib/python3.10/site-packages (9.0.0)\n","Requirement already satisfied: numpy>=1.16.6 in /opt/conda/lib/python3.10/site-packages (from pyarrow) (1.23.5)\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mRequirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.2.4)\n","Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from nltk) (1.16.0)\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mCollecting tdqm\n","  Downloading tdqm-0.0.1.tar.gz (1.4 kB)\n","  Preparing metadata (setup.py) ... \u001b[?25ldone\n","\u001b[?25hRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from tdqm) (4.64.1)\n","Building wheels for collected packages: tdqm\n","  Building wheel for tdqm (setup.py) ... \u001b[?25ldone\n","\u001b[?25h  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1322 sha256=98dae54af0bf0a85639c88412f77b6a49d485bbc930a83a31edfb2dbc709a9ff\n","  Stored in directory: /root/.cache/pip/wheels/37/31/b8/7b711038035720ba0df14376af06e5e76b9bd61759c861ad92\n","Successfully built tdqm\n","Installing collected packages: tdqm\n","Successfully installed tdqm-0.0.1\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mCollecting rank_bm25\n","  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)\n","Requirement already satisfied: numpy in /opt/conda/lib/python3.10/site-packages (from rank_bm25) (1.23.5)\n","Installing collected packages: rank_bm25\n","Successfully installed rank_bm25-0.2.2\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0mCollecting pyvi\n","  Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)\n","\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.5/8.5 MB\u001b[0m \u001b[31m57.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n","\u001b[?25hRequirement already satisfied: scikit-learn in /opt/conda/lib/python3.10/site-packages (from pyvi) (1.2.2)\n","Collecting sklearn-crfsuite (from pyvi)\n","  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)\n","Requirement already satisfied: numpy>=1.17.3 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (1.23.5)\n","Requirement already satisfied: scipy>=1.3.2 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (1.10.1)\n","Requirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (1.2.0)\n","Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->pyvi) (3.1.0)\n","Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite->pyvi)\n","  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)\n","\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m993.5/993.5 kB\u001b[0m \u001b[31m55.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from sklearn-crfsuite->pyvi) (1.16.0)\n","Requirement already satisfied: tabulate in /opt/conda/lib/python3.10/site-packages (from sklearn-crfsuite->pyvi) (0.9.0)\n","Requirement already satisfied: tqdm>=2.0 in /opt/conda/lib/python3.10/site-packages (from sklearn-crfsuite->pyvi) (4.64.1)\n","Installing collected packages: python-crfsuite, sklearn-crfsuite, pyvi\n","Successfully installed python-crfsuite-0.9.9 pyvi-0.1.1 sklearn-crfsuite-0.3.6\n","\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install pyspark\n","!pip install pyarrow\n","!pip install nltk\n","!pip install tdqm\n","!pip install rank_bm25\n","# !pip install pandarallel\n","!pip install pyvi"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:17.794768Z","iopub.status.busy":"2023-06-19T04:47:17.794437Z","iopub.status.idle":"2023-06-19T04:47:17.869550Z","shell.execute_reply":"2023-06-19T04:47:17.868392Z","shell.execute_reply.started":"2023-06-19T04:47:17.794729Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["3.4.0\n"]}],"source":["import pyspark\n","print(pyspark.__version__)"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:17.873676Z","iopub.status.busy":"2023-06-19T04:47:17.873324Z","iopub.status.idle":"2023-06-19T04:47:25.790613Z","shell.execute_reply":"2023-06-19T04:47:25.789712Z","shell.execute_reply.started":"2023-06-19T04:47:17.873647Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n","  warnings.warn(\n"]},{"name":"stdout","output_type":"stream","text":["[nltk_data] Downloading package punkt to /usr/share/nltk_data...\n","[nltk_data]   Package punkt is already up-to-date!\n"]},{"name":"stderr","output_type":"stream","text":["Setting default log level to \"WARN\".\n","To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n","23/06/19 04:47:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"]},{"data":{"text/html":["\n","            <div>\n","                <p><b>SparkSession - in-memory</b></p>\n","                \n","        <div>\n","            <p><b>SparkContext</b></p>\n","\n","            <p><a href=\"http://e65557ad95ad:4040\">Spark UI</a></p>\n","\n","            <dl>\n","              <dt>Version</dt>\n","                <dd><code>v3.4.0</code></dd>\n","              <dt>Master</dt>\n","                <dd><code>local[*]</code></dd>\n","              <dt>AppName</dt>\n","                <dd><code>Pyspark</code></dd>\n","            </dl>\n","        </div>\n","        \n","            </div>\n","        "],"text/plain":["<pyspark.sql.session.SparkSession at 0x7e77e1486620>"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["import json\n","import numpy as np\n","import pandas as pd\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","import pyspark.pandas as ps\n","import re\n","import nltk\n","nltk.download('punkt')\n","from nltk import word_tokenize as lib_tokenizer \n","from pyvi.ViTokenizer import tokenize\n","import string\n","from rank_bm25 import BM25Okapi\n","# from pandarallel import pandarallel\n","\n","import os\n","os.environ[\"PYARROW_IGNORE_TIMEZONE\"] = \"1\"\n","import pyspark as pyspark\n","from pyspark.sql import SparkSession\n","from pyspark.sql.functions import udf, col, lit, lower\n","from pyspark.sql.types import ArrayType, StringType, FloatType, IntegerType\n","MAX_MEMORY = '15G'\n","# Initialize a spark session.\n","# spark.stop()\n","conf = pyspark.SparkConf().setMaster(\"local[*]\") \\\n","        .set('spark.executor.heartbeatInterval', 10000) \\\n","        .set('spark.network.timeout', 10000) \\\n","        .set(\"spark.core.connection.ack.wait.timeout\", \"3600\") \\\n","        .set(\"spark.executor.memory\", MAX_MEMORY) \\\n","        .set(\"spark.driver.memory\", MAX_MEMORY) \\\n","        .set(\"spark.driver.maxResultSize\", \"4g\") \\\n","        .set(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\n","spark = SparkSession.builder.appName(\"Pyspark\").config(conf=conf).getOrCreate()\n","spark"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:25.792027Z","iopub.status.busy":"2023-06-19T04:47:25.791731Z","iopub.status.idle":"2023-06-19T04:47:25.799813Z","shell.execute_reply":"2023-06-19T04:47:25.796610Z","shell.execute_reply.started":"2023-06-19T04:47:25.791998Z"},"trusted":true},"outputs":[],"source":["# pandarallel.initialize(progress_bar=True, use_memory_fs=False)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:25.803740Z","iopub.status.busy":"2023-06-19T04:47:25.802887Z","iopub.status.idle":"2023-06-19T04:47:26.433056Z","shell.execute_reply":"2023-06-19T04:47:26.431337Z","shell.execute_reply.started":"2023-06-19T04:47:25.803711Z"},"trusted":true},"outputs":[],"source":["data = json.load(open(\"/kaggle/input/e2eqa-wiki-zalo-ai/e2eqa-trainpublic_test-v1/e2eqa-train+public_test-v1/zac2022_train_merged_final.json\"))\n","data = [x for x in data['data'] if x[\"category\"] == \"FULL_ANNOTATION\" and \"wiki/\" in x[\"answer\"]]"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:26.435414Z","iopub.status.busy":"2023-06-19T04:47:26.435040Z","iopub.status.idle":"2023-06-19T04:47:39.021537Z","shell.execute_reply":"2023-06-19T04:47:39.020732Z","shell.execute_reply.started":"2023-06-19T04:47:26.435385Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["                                                                                \r"]},{"data":{"text/plain":["pyspark.sql.dataframe.DataFrame"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["df_wiki = spark.read.json(path = \"/kaggle/input/e2eqa-wiki-zalo-ai/wikipedia_20220620_cleaned/wikipedia_20220620_cleaned.jsonl\")\n","# df_wiki = df_wiki.cache().pandas_api()\n","type(df_wiki)"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:39.022723Z","iopub.status.busy":"2023-06-19T04:47:39.022452Z","iopub.status.idle":"2023-06-19T04:47:39.454049Z","shell.execute_reply":"2023-06-19T04:47:39.453277Z","shell.execute_reply.started":"2023-06-19T04:47:39.022697Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["+---+--------+--------------------+--------------------+----------------+--------------------+\n","| id|   revid|                text|           timestamp|           title|                 url|\n","+---+--------+--------------------+--------------------+----------------+--------------------+\n","|  2|68591979|Trang Chính\\n\\n<t...|2022-05-12T12:46:53Z|     Trang Chính|https://vi.wikipe...|\n","|  4|67988747|Internet Society\\...|2022-01-20T07:59:10Z|Internet Society|https://vi.wikipe...|\n","| 13|68660631|Tiếng Việt\\n\\nTiế...|2022-05-29T03:42:42Z|      Tiếng Việt|https://vi.wikipe...|\n","| 24|68482118|Ohio\\n\\nOhio (viế...|2022-04-17T08:15:22Z|            Ohio|https://vi.wikipe...|\n","| 26|68738039|California\\n\\nCal...|2022-06-16T15:27:07Z|      California|https://vi.wikipe...|\n","+---+--------+--------------------+--------------------+----------------+--------------------+\n","only showing top 5 rows\n","\n"]}],"source":["df_wiki.show(5)"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:39.455216Z","iopub.status.busy":"2023-06-19T04:47:39.454950Z","iopub.status.idle":"2023-06-19T04:47:40.096642Z","shell.execute_reply":"2023-06-19T04:47:40.095084Z","shell.execute_reply.started":"2023-06-19T04:47:39.455192Z"},"trusted":true},"outputs":[{"data":{"text/plain":["[Table(name='df_wiki_temp', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["df_wiki.createOrReplaceTempView('df_wiki_temp')\n","spark.catalog.listTables()"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:40.097915Z","iopub.status.busy":"2023-06-19T04:47:40.097624Z","iopub.status.idle":"2023-06-19T04:47:40.130812Z","shell.execute_reply":"2023-06-19T04:47:40.129860Z","shell.execute_reply.started":"2023-06-19T04:47:40.097890Z"},"trusted":true},"outputs":[],"source":["def get_topk(query, topk=100):\n","    tokenized_query = query.split()\n","    tfidf_query = dictionary.doc2bow(tokenized_query)\n","    scores = bm25_index[tfidf_query]\n","    top_n = np.argsort(scores)[::-1][:topk]\n","    titles = [df_wiki.filter(col(\"id\") == i).select(\"title\").collect()[0][0] for i in top_n]\n","    texts = [df_wiki.filter(col(\"id\") == i).select(\"text\").collect()[0][0] for i in top_n]\n","    scores = [scores[i] for i in top_n]\n","    result = \"\\n\".join([f\"{title}\\t{text}\\t{score}\" for title, text, score in zip(titles, texts, scores)])\n","    return result\n","\n","get_topk_udf = udf(get_topk, StringType())\n","\n","dict_map = {}\n","\n","def word_tokenize(text): \n","    global dict_map\n","    words = text.split() \n","    words_norm = [] \n","    for w in words: \n","        if w not in dict_map: \n","            dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"') \n","        words_norm.append(dict_map[w]) \n","    return words_norm \n","\n","word_tokenize_udf = udf(word_tokenize, ArrayType(StringType()))\n","\n","def strip_answer_string(text):\n","    text = text.strip() \n","    while text[-1] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n","        if text[0] != '(' and text[-1] == ')' and '(' in text: \n","            break \n","        if text[-1] == '\"' and text[0] != '\"' and text.count('\"') > 1: \n","            break \n","        text = text[:-1].strip() \n","    while text[0] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n","        if text[0] == '\"' and text[-1] != '\"' and text.count('\"') > 1: \n","            break \n","        text = text[1:].strip() \n","    text = text.strip() \n","    return text \n","\n","strip_answer_string_udf = udf(strip_answer_string, StringType())\n","\n","def strip_context(text):\n","    text = text.replace('\\n', ' ') # thay kí tự kết thúc bằng khoảng trắng\n","    text = re.sub(r'\\s+', ' ', text) # thay khoảng trắng lớn hơn 1 kí tự thành khoảng trắng 1 kí tự\n","    text = text.strip() # xoá khoảng trắng ở đầu và cuối \n","    return text \n","\n","strip_context_udf = udf(strip_context, StringType())\n","\n","def post_process(x):\n","    x = x.lower()\n","    x = \" \".join(word_tokenize(strip_context(x))).strip()\n","    x = x.replace(\"\\n\",\" \")\n","    x = \"\".join([i for i in x if i not in string.punctuation])\n","    x = \" \".join(x.split()[:128])\n","    return x \n","\n","post_process_udf = udf(post_process, StringType())"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:40.137361Z","iopub.status.busy":"2023-06-19T04:47:40.134954Z","iopub.status.idle":"2023-06-19T04:47:42.638078Z","shell.execute_reply":"2023-06-19T04:47:42.637255Z","shell.execute_reply.started":"2023-06-19T04:47:40.137321Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["[Stage 4:>                                                          (0 + 1) / 1]\r"]},{"name":"stdout","output_type":"stream","text":["+---+--------+--------------------+--------------------+----------------+--------------------+----------------+--------------------+\n","| id|   revid|                text|           timestamp|           title|                 url|     title_lower|          text_lower|\n","+---+--------+--------------------+--------------------+----------------+--------------------+----------------+--------------------+\n","|  2|68591979|Trang Chính\\n\\n<t...|2022-05-12T12:46:53Z|     Trang Chính|https://vi.wikipe...|     trang chính|trang chính templ...|\n","|  4|67988747|Internet Society\\...|2022-01-20T07:59:10Z|Internet Society|https://vi.wikipe...|internet society|internet society ...|\n","| 13|68660631|Tiếng Việt\\n\\nTiế...|2022-05-29T03:42:42Z|      Tiếng Việt|https://vi.wikipe...|      tiếng việt|tiếng việt tiếng ...|\n","| 24|68482118|Ohio\\n\\nOhio (viế...|2022-04-17T08:15:22Z|            Ohio|https://vi.wikipe...|            ohio|ohio ohio viết tắ...|\n","| 26|68738039|California\\n\\nCal...|2022-06-16T15:27:07Z|      California|https://vi.wikipe...|      california|california califo...|\n","+---+--------+--------------------+--------------------+----------------+--------------------+----------------+--------------------+\n","only showing top 5 rows\n","\n"]},{"name":"stderr","output_type":"stream","text":["                                                                                \r"]}],"source":["df_wiki = df_wiki.withColumn(\"title_lower\", post_process_udf(col(\"title\")))\n","df_wiki = df_wiki.withColumn(\"text_lower\", post_process_udf(col(\"text\")))\n","df_wiki.show(5)"]},{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:42.639237Z","iopub.status.busy":"2023-06-19T04:47:42.638940Z","iopub.status.idle":"2023-06-19T04:47:42.674851Z","shell.execute_reply":"2023-06-19T04:47:42.674065Z","shell.execute_reply.started":"2023-06-19T04:47:42.639213Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["== Physical Plan ==\n","*(1) Project [id#8, revid#9, text#10, timestamp#11, title#12, url#13, pythonUDF0#144 AS title_lower#85, pythonUDF1#145 AS text_lower#94]\n","+- BatchEvalPython [post_process(title#12)#84, post_process(text#10)#93], [pythonUDF0#144, pythonUDF1#145]\n","   +- FileScan json [id#8,revid#9,text#10,timestamp#11,title#12,url#13] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex(1 paths)[file:/kaggle/input/e2eqa-wiki-zalo-ai/wikipedia_20220620_cleaned/wikip..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:string,revid:string,text:string,timestamp:string,title:string,url:string>\n","\n","\n"]}],"source":["df_wiki.explain()"]},{"cell_type":"markdown","metadata":{},"source":["### Đọc file __stopwords__"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T04:47:42.675987Z","iopub.status.busy":"2023-06-19T04:47:42.675714Z","iopub.status.idle":"2023-06-19T04:47:42.710651Z","shell.execute_reply":"2023-06-19T04:47:42.709880Z","shell.execute_reply.started":"2023-06-19T04:47:42.675963Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0         mà_thôi\n","1          tuy_có\n","2         đâu_như\n","3       quay_bước\n","4         hỏi_lại\n","          ...    \n","1937       cái_họ\n","1938          hết\n","1939          lại\n","1940      thế_thế\n","1941      dần_dần\n","Name: stopwords, Length: 1942, dtype: object"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["filename = '/kaggle/input/e2eqa-wiki-zalo-ai/external/stopwords.csv'\n","stop = pd.read_csv(filename, sep=\"\\t\", encoding='utf-8')\n","list_stopwords = stop['stopwords']\n","list_stopwords"]},{"cell_type":"markdown","metadata":{},"source":["---\n","---"]},{"cell_type":"code","execution_count":14,"metadata":{"_kg_hide-output":true,"execution":{"iopub.execute_input":"2023-06-19T04:47:42.712418Z","iopub.status.busy":"2023-06-19T04:47:42.711650Z","iopub.status.idle":"2023-06-19T04:47:44.373278Z","shell.execute_reply":"2023-06-19T04:47:44.372435Z","shell.execute_reply.started":"2023-06-19T04:47:42.712390Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["[Stage 5:>                                                          (0 + 1) / 1]\r"]},{"name":"stdout","output_type":"stream","text":["+--------------------+\n","|               words|\n","+--------------------+\n","|[trang, chính, te...|\n","|[internet, societ...|\n","|[tiếng, việt, tiế...|\n","|[ohio, ohio, viết...|\n","|[california, cali...|\n","+--------------------+\n","only showing top 5 rows\n","\n"]},{"name":"stderr","output_type":"stream","text":["                                                                                \r"]}],"source":["# Tokenize text into words\n","def tokenizer(text):\n","    tokens = [word for word in tokenize(text).split() if word not in list_stopwords]\n","    return tokens\n","\n","tokenize_udf = udf(tokenizer, ArrayType(StringType()))\n","df_wiki = df_wiki.withColumn(\"words\", tokenize_udf(col(\"text_lower\")))\n","df_wiki.select(\"words\").show(5)"]},{"cell_type":"markdown","metadata":{},"source":["### Tần suất các từ trong texts"]},{"cell_type":"code","execution_count":15,"metadata":{"_kg_hide-input":false,"_kg_hide-output":false,"execution":{"iopub.execute_input":"2023-06-19T04:47:44.379705Z","iopub.status.busy":"2023-06-19T04:47:44.378439Z","iopub.status.idle":"2023-06-19T05:06:28.712339Z","shell.execute_reply":"2023-06-19T05:06:28.710514Z","shell.execute_reply.started":"2023-06-19T04:47:44.379637Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["[Stage 8:>                                                          (0 + 4) / 4]\r"]},{"name":"stdout","output_type":"stream","text":["+--------+-------+\n","|    word|  count|\n","+--------+-------+\n","|      là|1858454|\n","|    loài|1385128|\n","|     một|1365013|\n","|   trong|1046730|\n","|     năm|1034325|\n","|    được| 923562|\n","|      họ| 846094|\n","|     này| 726077|\n","|  bullet| 671970|\n","|     của| 656976|\n","|      có| 602435|\n","|      và| 578809|\n","|khoa_học| 484960|\n","|đầu_tiên| 381971|\n","|       ở| 373949|\n","|     hoa| 352879|\n","|   thuộc| 339118|\n","|   người| 321510|\n","|     các| 308467|\n","|thực_vật| 287653|\n","+--------+-------+\n","only showing top 20 rows\n","\n"]},{"name":"stderr","output_type":"stream","text":["                                                                                \r"]}],"source":["from pyspark.sql import functions as F\n","# Increase the number of partitions\n","# df_wiki = df_wiki.repartition(100)\n","\n","# Cache the DataFrame in memory\n","df_wiki.cache()\n","\n","# Perform the computation\n","result = (\n","    df_wiki\n","    .select(F.explode(\"words\").alias(\"word\"))\n","    .groupBy(\"word\")\n","    .count()\n","    .orderBy(F.desc(\"count\"))\n",")\n","\n","# Show the results\n","result.show()"]},{"cell_type":"markdown","metadata":{},"source":["---\n","---"]},{"cell_type":"markdown","metadata":{},"source":["#### Map title with answer"]},{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:06:28.718426Z","iopub.status.busy":"2023-06-19T05:06:28.717581Z","iopub.status.idle":"2023-06-19T05:07:05.169160Z","shell.execute_reply":"2023-06-19T05:07:05.168170Z","shell.execute_reply.started":"2023-06-19T05:06:28.718393Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["                                                                                \r"]}],"source":["from pyspark.sql import Window\n","# add index column\n","df_wiki = df_wiki.rdd.zipWithIndex().toDF([\"data\", \"index\"]).select(\"data.*\", \"index\")"]},{"cell_type":"code","execution_count":17,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:07:05.170729Z","iopub.status.busy":"2023-06-19T05:07:05.170405Z","iopub.status.idle":"2023-06-19T05:08:02.748141Z","shell.execute_reply":"2023-06-19T05:08:02.746532Z","shell.execute_reply.started":"2023-06-19T05:07:05.170695Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["                                                                                \r"]},{"name":"stdout","output_type":"stream","text":["Cù Lao Chàm\n","nhà Hán\n","bán đảo Ả Rập\n"]}],"source":["from pyspark.sql.functions import trim, lower, regexp_replace\n","\n","# Create a dictionary mapping titles to indices\n","title2idx = (\n","  df_wiki\n","  .select(trim(\"title\").alias(\"title\"), \"index\")\n","  .groupBy(\"title\")\n","  .agg({\"index\": \"first\"})\n","  .rdd\n","  .collectAsMap()\n",")\n","\n","# Loop over data and check if answer is in the dictionary\n","for x in data:\n","    answer = x['answer'].replace(\"wiki/\", \"\").replace(\"_\", \" \")\n","    # Check if the cleaned up title is in the dictionary\n","    if answer not in title2idx.keys():\n","        print(answer)\n","        for key, val in title2idx.items():\n","            if answer.lower() == key.lower():\n","                x['answer'] = \"wiki/\" + key.replace(\" \", \"_\")\n","    "]},{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:08:02.753058Z","iopub.status.busy":"2023-06-19T05:08:02.752193Z","iopub.status.idle":"2023-06-19T05:08:02.764393Z","shell.execute_reply":"2023-06-19T05:08:02.763347Z","shell.execute_reply.started":"2023-06-19T05:08:02.753016Z"},"trusted":true},"outputs":[{"data":{"text/plain":["(1849, 4608)"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["l = []\n","for x in data:\n","    l.append(x['answer'])\n","len(set(l)), len(l)"]},{"cell_type":"markdown","metadata":{},"source":["#### Lọc các stopwords ra khỏi answer"]},{"cell_type":"code","execution_count":19,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:08:02.767039Z","iopub.status.busy":"2023-06-19T05:08:02.766644Z","iopub.status.idle":"2023-06-19T05:08:02.780001Z","shell.execute_reply":"2023-06-19T05:08:02.778564Z","shell.execute_reply.started":"2023-06-19T05:08:02.767009Z"},"trusted":true},"outputs":[],"source":["# from pyspark.ml.feature import StopWordsRemover\n","# # Tokenize and remove stop words\n","# remover = StopWordsRemover(inputCol=\"title_lower\", outputCol=\"tokens\")\n","# tokenized_df = remover.transform(df_wiki.select(\"title_lower\"))\n","\n","# # Convert tokens to a list of strings\n","# to_list = udf(lambda x: x, ArrayType(StringType()))\n","# tokenized_df = tokenized_df.withColumn(\"token_list\", to_list(\"tokens\"))\n","\n","# # Compute TF-IDF vectors\n","# cv = CountVectorizer(inputCol=\"token_list\", outputCol=\"raw_features\")\n","# cv_model = cv.fit(tokenized_df)\n","# featurized_df = cv_model.transform(tokenized_df)\n","# idf = IDF(inputCol=\"raw_features\", outputCol=\"features\")\n","# idf_model = idf.fit(featurized_df)\n","# rescaled_df = idf_model.transform(featurized_df)\n","\n","# # Convert features column to VectorUDT\n","# to_vector = udf(lambda x: x, VectorUDT())\n","# rescaled_df = rescaled_df.withColumn(\"features\", to_vector(\"features\"))\n","\n","# # Compute BM25Okapi score\n","# bm25 = BM25Okapi(rescaled_df.select(\"features\").rdd.flatMap(lambda x: x))\n","# bm25"]},{"cell_type":"code","execution_count":20,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:08:02.781992Z","iopub.status.busy":"2023-06-19T05:08:02.781655Z","iopub.status.idle":"2023-06-19T05:09:00.711662Z","shell.execute_reply":"2023-06-19T05:09:00.710057Z","shell.execute_reply.started":"2023-06-19T05:08:02.781961Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["                                                                                \r"]}],"source":["docs = df_wiki.select('title_lower').toPandas()\n","tokenized_corpus = [[word for word in doc.lower().split() if word not in list_stopwords]\n","                    for doc in docs['title_lower']\n","                   ] #simple tokenier\n","bm25_title = BM25Okapi(tokenized_corpus)"]},{"cell_type":"code","execution_count":21,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:09:00.713615Z","iopub.status.busy":"2023-06-19T05:09:00.713197Z","iopub.status.idle":"2023-06-19T05:09:01.135969Z","shell.execute_reply":"2023-06-19T05:09:01.135282Z","shell.execute_reply.started":"2023-06-19T05:09:00.713580Z"},"scrolled":true,"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["+--------------------+\n","|               query|\n","+--------------------+\n","|[costa, rica, ice...|\n","|       [núi, elbrus]|\n","|        [alexandria]|\n","|          [lê, chân]|\n","|                 [ý]|\n","+--------------------+\n","only showing top 5 rows\n","\n"]}],"source":["df = pd.DataFrame({\n","    \"query\": [post_process(x[\"short_candidate\"]).lower().split() for x in data]\n","})\n","df = spark.createDataFrame(df)\n","df.show(5)"]},{"cell_type":"code","execution_count":22,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:09:01.136929Z","iopub.status.busy":"2023-06-19T05:09:01.136703Z","iopub.status.idle":"2023-06-19T05:09:01.198467Z","shell.execute_reply":"2023-06-19T05:09:01.197404Z","shell.execute_reply.started":"2023-06-19T05:09:01.136901Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["+--------------------+\n","|               query|\n","+--------------------+\n","|[costa, rica, ice...|\n","|       [núi, elbrus]|\n","|        [alexandria]|\n","|          [lê, chân]|\n","|                 [ý]|\n","|       [phan, thiết]|\n","|      [xã, nhơn, lý]|\n","|            [google]|\n","|    [tỉnh, gia, lai]|\n","|  [tỉnh, quảng, nam]|\n","|   [vua, khải, định]|\n","|[theo, thể, lục, ...|\n","|   [trần, duy, hưng]|\n","|       [suối, tranh]|\n","|[chùa, làng, vũ, ...|\n","|[nguyễn, phú, trọng]|\n","|              [pháp]|\n","|   [tỉnh, an, giang]|\n","|    [tỉnh, bắc, kạn]|\n","|   [nguyễn, văn, tý]|\n","+--------------------+\n","only showing top 20 rows\n","\n"]}],"source":["df.select('query').show()"]},{"cell_type":"code","execution_count":23,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:09:01.199811Z","iopub.status.busy":"2023-06-19T05:09:01.199500Z","iopub.status.idle":"2023-06-19T05:09:08.924690Z","shell.execute_reply":"2023-06-19T05:09:08.923664Z","shell.execute_reply.started":"2023-06-19T05:09:01.199782Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["[Stage 15:>                                                         (0 + 1) / 1]\r"]},{"name":"stdout","output_type":"stream","text":["+--------------------+--------------------+\n","|               query|               top_n|\n","+--------------------+--------------------+\n","|[costa, rica, ice...|[6771, 25055, 585...|\n","|       [núi, elbrus]|[11092, 1268687, ...|\n","|        [alexandria]|[54875, 223778, 5...|\n","|          [lê, chân]|[25316, 17081, 39...|\n","|                 [ý]|[4452, 46524, 114...|\n","+--------------------+--------------------+\n","only showing top 5 rows\n","\n"]},{"name":"stderr","output_type":"stream","text":["                                                                                \r"]}],"source":["# Define UDF to compute topk titles for a query\n","def get_topk(query, topk=10):\n","    scores = bm25_title.get_scores(query)\n","    top_titles = list(map(int, np.argsort(scores)[::-1][:topk]))\n","    return top_titles\n","\n","get_topk_udf = udf(get_topk, ArrayType(IntegerType()))\n","\n","# Add topk titles column to DataFrame\n","df = df.withColumn(\"top_n\", get_topk_udf(df[\"query\"]))\n","df.show(5)"]},{"cell_type":"code","execution_count":24,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:09:08.926215Z","iopub.status.busy":"2023-06-19T05:09:08.925833Z","iopub.status.idle":"2023-06-19T05:30:40.685470Z","shell.execute_reply":"2023-06-19T05:30:40.683455Z","shell.execute_reply.started":"2023-06-19T05:09:08.926182Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["                                                                                \r"]}],"source":["candidate_ids = []\n","true_ids = []\n","top_n_df = df.select(\"top_n\").toPandas()\n","for i, x in enumerate(data):\n","    true_title = x[\"answer\"].replace(\"_\",\" \").replace(\"wiki/\",\"\").strip()\n","    top_n = top_n_df.loc[i, \"top_n\"].tolist() \n","    true_idx = title2idx[true_title]\n","    true_ids.append(true_idx)\n","    candidate_ids.append(top_n)"]},{"cell_type":"code","execution_count":25,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:30:40.688025Z","iopub.status.busy":"2023-06-19T05:30:40.687599Z","iopub.status.idle":"2023-06-19T05:30:40.696966Z","shell.execute_reply":"2023-06-19T05:30:40.695478Z","shell.execute_reply.started":"2023-06-19T05:30:40.687990Z"},"trusted":true},"outputs":[],"source":["for x, y in zip(candidate_ids, true_ids):\n","    if y not in x:\n","        x[-1] = y"]},{"cell_type":"code","execution_count":26,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:30:40.700316Z","iopub.status.busy":"2023-06-19T05:30:40.699499Z","iopub.status.idle":"2023-06-19T05:32:48.695056Z","shell.execute_reply":"2023-06-19T05:32:48.693738Z","shell.execute_reply.started":"2023-06-19T05:30:40.700278Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["                                                                                \r"]}],"source":["# Change to list of row\n","rows = df_wiki.collect()"]},{"cell_type":"code","execution_count":27,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:32:48.697279Z","iopub.status.busy":"2023-06-19T05:32:48.696520Z","iopub.status.idle":"2023-06-19T05:32:55.538212Z","shell.execute_reply":"2023-06-19T05:32:55.537131Z","shell.execute_reply.started":"2023-06-19T05:32:48.697248Z"},"trusted":true},"outputs":[],"source":["df = pd.DataFrame()\n","questions = []\n","answers = []\n","titles = []\n","candidates = []\n","labels = []\n","groups = []\n","for idx, (sample, true_idx, candidate_idxs) in enumerate(zip(data, true_ids, candidate_ids)):\n","    assert true_idx in candidate_idxs\n","    question = sample['question']\n","    answer = sample['short_candidate']\n","    title = [rows[i]['title'] for i in candidate_idxs]\n","    candidate = [rows[i]['text'] for i in candidate_idxs]\n","    label = [1 if x == true_idx else 0 for x in candidate_idxs]\n","    \n","    questions.extend([question,]*len(candidate_idxs))\n","    answers.extend([answer,]*len(candidate_idxs))\n","    groups.extend([idx,]*len(candidate_idxs))\n","    titles.extend(title)\n","    candidates.extend(candidate)\n","    labels.extend(label)\n","    \n","df[\"question\"] = questions \n","df[\"answer\"] = answers \n","df[\"title\"] = titles \n","df[\"candidate\"] = candidates \n","df[\"label\"] = labels \n","df[\"group\"] = groups \n","df.candidate = df.candidate.apply(lambda x: \" \".join(x.strip().split()))"]},{"cell_type":"code","execution_count":28,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:32:55.539819Z","iopub.status.busy":"2023-06-19T05:32:55.539519Z","iopub.status.idle":"2023-06-19T05:32:55.577719Z","shell.execute_reply":"2023-06-19T05:32:55.576585Z","shell.execute_reply.started":"2023-06-19T05:32:55.539794Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>question</th>\n","      <th>answer</th>\n","      <th>title</th>\n","      <th>candidate</th>\n","      <th>label</th>\n","      <th>group</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>Đất nước nào không có quân đội</td>\n","      <td>Costa Rica, Iceland, Panama, Micronesia, Quần ...</td>\n","      <td>Costa Rica</td>\n","      <td>Costa Rica Costa Rica (Phiên âm: Cô-xta Ri-ca)...</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>Đất nước nào không có quân đội</td>\n","      <td>Costa Rica, Iceland, Panama, Micronesia, Quần ...</td>\n","      <td>Quần đảo Marshall</td>\n","      <td>Quần đảo Marshall Quần đảo Marshall, tên chính...</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>Đất nước nào không có quân đội</td>\n","      <td>Costa Rica, Iceland, Panama, Micronesia, Quần ...</td>\n","      <td>Montezuma, Costa Rica</td>\n","      <td>Montezuma, Costa Rica Montezuma là một thị xã ...</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>Đất nước nào không có quân đội</td>\n","      <td>Costa Rica, Iceland, Panama, Micronesia, Quần ...</td>\n","      <td>Tamarindo, Costa Rica</td>\n","      <td>Tamarindo, Costa Rica Tamarindo là một thị xã ...</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>Đất nước nào không có quân đội</td>\n","      <td>Costa Rica, Iceland, Panama, Micronesia, Quần ...</td>\n","      <td>Micronesia</td>\n","      <td>Micronesia Micronesia (, ), còn gọi là Tiểu Đả...</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["                         question  \\\n","0  Đất nước nào không có quân đội   \n","1  Đất nước nào không có quân đội   \n","2  Đất nước nào không có quân đội   \n","3  Đất nước nào không có quân đội   \n","4  Đất nước nào không có quân đội   \n","\n","                                              answer                  title  \\\n","0  Costa Rica, Iceland, Panama, Micronesia, Quần ...             Costa Rica   \n","1  Costa Rica, Iceland, Panama, Micronesia, Quần ...      Quần đảo Marshall   \n","2  Costa Rica, Iceland, Panama, Micronesia, Quần ...  Montezuma, Costa Rica   \n","3  Costa Rica, Iceland, Panama, Micronesia, Quần ...  Tamarindo, Costa Rica   \n","4  Costa Rica, Iceland, Panama, Micronesia, Quần ...             Micronesia   \n","\n","                                           candidate  label  group  \n","0  Costa Rica Costa Rica (Phiên âm: Cô-xta Ri-ca)...      0      0  \n","1  Quần đảo Marshall Quần đảo Marshall, tên chính...      0      0  \n","2  Montezuma, Costa Rica Montezuma là một thị xã ...      0      0  \n","3  Tamarindo, Costa Rica Tamarindo là một thị xã ...      0      0  \n","4  Micronesia Micronesia (, ), còn gọi là Tiểu Đả...      0      0  "]},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":["df.head()"]},{"cell_type":"code","execution_count":30,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:33:45.138800Z","iopub.status.busy":"2023-06-19T05:33:45.138413Z","iopub.status.idle":"2023-06-19T05:33:51.569429Z","shell.execute_reply":"2023-06-19T05:33:51.568261Z","shell.execute_reply.started":"2023-06-19T05:33:45.138771Z"},"trusted":true},"outputs":[],"source":["df.to_csv(\"/kaggle/working/train_stage2_ranking.csv\",index=False)"]},{"cell_type":"markdown","metadata":{},"source":["<a href=\"/kaggle/working/train_stage2_ranking.csv\"> Download File </a>"]},{"cell_type":"code","execution_count":33,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:38:13.126543Z","iopub.status.busy":"2023-06-19T05:38:13.126055Z","iopub.status.idle":"2023-06-19T05:38:47.848000Z","shell.execute_reply":"2023-06-19T05:38:47.845517Z","shell.execute_reply.started":"2023-06-19T05:38:13.126496Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["  adding: kaggle/working/train_stage2_ranking.csv (deflated 70%)\n"]}],"source":["!zip -r file.zip /kaggle/working/train_stage2_ranking.csv"]},{"cell_type":"code","execution_count":36,"metadata":{"execution":{"iopub.execute_input":"2023-06-19T05:41:56.345417Z","iopub.status.busy":"2023-06-19T05:41:56.344395Z","iopub.status.idle":"2023-06-19T05:41:56.821886Z","shell.execute_reply":"2023-06-19T05:41:56.820592Z","shell.execute_reply.started":"2023-06-19T05:41:56.345344Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["total 507M\n","drwxr-xr-x 4 root root 4.0K Jun 19 05:38 .\n","-rw-r--r-- 1 root root 118M Jun 19 05:38 file.zip\n","-rw-r--r-- 1 root root 390M Jun 19 05:33 train_stage2_ranking.csv\n","drwxr-xr-x 2 root root 4.0K Jun 19 04:47 spark-warehouse\n","drwxr-xr-x 2 root root 4.0K Jun 19 04:45 .virtual_documents\n","drwxr-xr-x 5 root root 4.0K Jun 19 04:45 ..\n","---------- 1 root root  263 Jun 19 04:45 __notebook_source__.ipynb\n"]}],"source":["!cd /kaggle && ls -halt working"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"}},"nbformat":4,"nbformat_minor":5}