Spaces:

foxxy-hm
/

e2eqa-wiki

Build error

File size: 15,243 Bytes

c5d2283

{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/code/noobhocai/train-stage-1"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:46:04.681825Z","iopub.status.busy":"2023-06-26T15:46:04.681521Z","iopub.status.idle":"2023-06-26T15:46:20.131828Z","shell.execute_reply":"2023-06-26T15:46:20.130853Z","shell.execute_reply.started":"2023-06-26T15:46:04.681800Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install pyvi rank_bm25 pandarallel gensim --q"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:46:20.133869Z","iopub.status.busy":"2023-06-26T15:46:20.133505Z","iopub.status.idle":"2023-06-26T15:46:21.705024Z","shell.execute_reply":"2023-06-26T15:46:21.703777Z","shell.execute_reply.started":"2023-06-26T15:46:20.133832Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["INFO: Pandarallel will run on 10 workers.\n","INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n"]}],"source":["import os\n","import re\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","import math\n","import pandas as pd\n","import string\n","from pyvi.ViTokenizer import tokenize\n","import numpy as np\n","import json, pickle\n","from rank_bm25 import BM25Okapi\n","import argparse\n","import gc\n","\n","from glob import glob \n","from nltk import word_tokenize as lib_tokenizer \n","\n","from pandarallel import pandarallel\n","from gensim.corpora import Dictionary\n","from gensim.corpora import MmCorpus\n","from gensim.models import TfidfModel, OkapiBM25Model\n","from gensim.similarities import SparseMatrixSimilarity\n","pandarallel.initialize(progress_bar=True, nb_workers=10)"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.194411Z","iopub.status.busy":"2023-06-26T15:12:52.193733Z","iopub.status.idle":"2023-06-26T15:12:52.208121Z","shell.execute_reply":"2023-06-26T15:12:52.206564Z","shell.execute_reply.started":"2023-06-26T15:12:52.194376Z"},"trusted":true},"outputs":[],"source":["def get_topk(query, topk = 100):\n","    tokenized_query = query.split()\n","    tfidf_query = tfidf_model[dictionary.doc2bow(tokenized_query)]\n","    scores = bm25_index[tfidf_query]\n","    top_n = np.argsort(scores)[::-1][:topk]\n","    titles = [df_wiki.title.values[i] for i in top_n]\n","    texts = [df_wiki.text.values[i] for i in top_n]\n","    # print(titles)\n","    # print(tfidf_query, scores)\n","    return titles, texts, scores[top_n]\n","\n","def post_process(x):\n","    x = \" \".join(word_tokenize(strip_context(x))).strip()\n","    x = x.replace(\"\\n\",\" \")\n","    x = \"\".join([i for i in x if i not in string.punctuation])\n","    return x\n","\n","dict_map = dict({})  \n","def word_tokenize(text): \n","    global dict_map \n","    words = text.split() \n","    words_norm = [] \n","    for w in words: \n","        if dict_map.get(w, None) is None: \n","            dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"') \n","        words_norm.append(dict_map[w]) \n","    return words_norm \n"," \n","def strip_context(text): \n","    text = text.replace('\\n', ' ') \n","    text = re.sub(r'\\s+', ' ', text) \n","    text = text.strip() \n","    return text"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.210996Z","iopub.status.busy":"2023-06-26T15:12:52.210623Z","iopub.status.idle":"2023-06-26T15:12:52.227767Z","shell.execute_reply":"2023-06-26T15:12:52.226604Z","shell.execute_reply.started":"2023-06-26T15:12:52.210948Z"},"trusted":true},"outputs":[],"source":["wiki_cleaned_path = \"/kaggle/input/e2eqa-wiki-zalo-ai/processed/wikipedia_20220620_cleaned_v2.csv\"\n","test_data_path =  \"/kaggle/input/e2eqa-wiki-zalo-ai/e2eqa-trainpublic_test-v1/e2eqa-train+public_test-v1/zac2022_testa_sample_submission.json\"\n","topk = 300"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.230234Z","iopub.status.busy":"2023-06-26T15:12:52.229690Z","iopub.status.idle":"2023-06-26T15:14:32.740956Z","shell.execute_reply":"2023-06-26T15:14:32.739621Z","shell.execute_reply.started":"2023-06-26T15:12:52.230185Z"},"trusted":true},"outputs":[],"source":["df_wiki = pd.read_csv(wiki_cleaned_path)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:14:32.743587Z","iopub.status.busy":"2023-06-26T15:14:32.743104Z","iopub.status.idle":"2023-06-26T15:14:35.140539Z","shell.execute_reply":"2023-06-26T15:14:35.139511Z","shell.execute_reply.started":"2023-06-26T15:14:32.743544Z"},"trusted":true},"outputs":[],"source":["df_wiki = df_wiki.fillna(\"NaN\")\n","if \"title\" not in df_wiki.columns:\n","    df_wiki[\"title\"] = df_wiki[\"titles=\"].fillna(\"\")"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:14:35.143448Z","iopub.status.busy":"2023-06-26T15:14:35.142192Z","iopub.status.idle":"2023-06-26T15:14:35.178049Z","shell.execute_reply":"2023-06-26T15:14:35.176844Z","shell.execute_reply.started":"2023-06-26T15:14:35.143403Z"},"trusted":true},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>title</th>\n","      <th>text</th>\n","      <th>bm25_text</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>Trang Chính</td>\n","      <td>Trang Chính\\n\\n&lt;templatestyles src=\"Wiki2021/s...</td>\n","      <td>trang chính &lt;templatestyles src= wiki2021 styl...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>Internet Society</td>\n","      <td>Internet Society  hay ISOC là một tổ chức quốc...</td>\n","      <td>internet society hay isoc là một tổ chức quốc ...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>Tiếng Việt</td>\n","      <td>Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...</td>\n","      <td>tiếng việt cũng gọi là tiếng việt nam hay việt...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>Tiếng Việt</td>\n","      <td>hệ thống thanh điệu phát triển cao hơn, hệ thố...</td>\n","      <td>hệ thống thanh điệu phát triển cao hơn hệ thốn...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>Tiếng Việt</td>\n","      <td>tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\...</td>\n","      <td>tiếp xúc hán – việt thành 2 giai đoạn chính bu...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["              title                                               text  \\\n","0       Trang Chính  Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s...   \n","1  Internet Society  Internet Society  hay ISOC là một tổ chức quốc...   \n","2        Tiếng Việt  Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...   \n","3        Tiếng Việt  hệ thống thanh điệu phát triển cao hơn, hệ thố...   \n","4        Tiếng Việt  tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\...   \n","\n","                                           bm25_text  \n","0  trang chính <templatestyles src= wiki2021 styl...  \n","1  internet society hay isoc là một tổ chức quốc ...  \n","2  tiếng việt cũng gọi là tiếng việt nam hay việt...  \n","3  hệ thống thanh điệu phát triển cao hơn hệ thốn...  \n","4  tiếp xúc hán – việt thành 2 giai đoạn chính bu...  "]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["df_wiki.head()"]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:14:35.180010Z","iopub.status.busy":"2023-06-26T15:14:35.179628Z","iopub.status.idle":"2023-06-26T15:21:28.016391Z","shell.execute_reply":"2023-06-26T15:21:28.014113Z","shell.execute_reply.started":"2023-06-26T15:14:35.179950Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"8492baeb724e46bd80570a9450bdc18c","version_major":2,"version_minor":0},"text/plain":["VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=194441), Label(value='0 / 194441')…"]},"metadata":{},"output_type":"display_data"}],"source":["df_wiki['bm25_text'] = df_wiki['bm25_text'].parallel_apply(post_process)\n","# corpus = [x.split() for x in df_wiki['bm25_text'].values]"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:22:23.179076Z","iopub.status.busy":"2023-06-26T15:22:23.178506Z","iopub.status.idle":"2023-06-26T15:31:22.030632Z","shell.execute_reply":"2023-06-26T15:31:22.028624Z","shell.execute_reply.started":"2023-06-26T15:22:23.179027Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"01c5f715c22d4ec682dc0c7b4812ec30","version_major":2,"version_minor":0},"text/plain":["  0%|          | 0/1944406 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"}],"source":["# Convert the column to a numpy array \n","texts = df_wiki['bm25_text'].to_numpy()\n","# Create an empty dictionary \n","dictionary = Dictionary()\n","# Update the dictionary with each batch of texts \n","for text in tqdm(texts): \n","    dictionary.add_documents([text.split()])"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T14:30:00.288679Z","iopub.status.busy":"2023-06-26T14:30:00.288263Z"},"trusted":true},"outputs":[],"source":["# dictionary = Dictionary(corpus)"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:35:34.130599Z","iopub.status.busy":"2023-06-26T15:35:34.130050Z","iopub.status.idle":"2023-06-26T15:36:52.503096Z","shell.execute_reply":"2023-06-26T15:36:52.501050Z","shell.execute_reply.started":"2023-06-26T15:35:34.130560Z"},"trusted":true},"outputs":[],"source":["try:\n","    corpus = [text.split() for text in texts]\n","except:\n","    print(\"nope\")"]},{"cell_type":"code","execution_count":17,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:37:34.690339Z","iopub.status.busy":"2023-06-26T15:37:34.689872Z","iopub.status.idle":"2023-06-26T15:37:36.745760Z","shell.execute_reply":"2023-06-26T15:37:36.743959Z","shell.execute_reply.started":"2023-06-26T15:37:34.690307Z"},"trusted":true},"outputs":[],"source":["!mkdir /kaggle/working/bm25_stage1"]},{"cell_type":"markdown","metadata":{},"source":["Save dictionary và corpus vào bộ nhớ"]},{"cell_type":"code","execution_count":19,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:39:40.089532Z","iopub.status.busy":"2023-06-26T15:39:40.089024Z","iopub.status.idle":"2023-06-26T15:39:55.080856Z","shell.execute_reply":"2023-06-26T15:39:55.079008Z","shell.execute_reply.started":"2023-06-26T15:39:40.089498Z"},"trusted":true},"outputs":[],"source":["# open a file for writing\n","with open('/kaggle/working/bm25_stage1/corpus.txt', 'w') as f:\n","    # use a loop to write each inner list to a line in the file\n","    for text in texts:\n","        line = text + '\\n'\n","        f.write(line)"]},{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:37:38.871933Z","iopub.status.busy":"2023-06-26T15:37:38.870377Z","iopub.status.idle":"2023-06-26T15:37:48.653312Z","shell.execute_reply":"2023-06-26T15:37:48.651908Z","shell.execute_reply.started":"2023-06-26T15:37:38.871878Z"},"trusted":true},"outputs":[],"source":["tfidf_model = TfidfModel(dictionary=dictionary, smartirs='bnn')  # Enforce binary weighting of queries\n","dictionary.save(\"/kaggle/working/bm25_stage1/dict\")\n","tfidf_model.save(\"/kaggle/working/bm25_stage1/tfidf\")"]},{"cell_type":"markdown","metadata":{},"source":["---"]},{"cell_type":"markdown","metadata":{},"source":["Dừng session và tiếp tục chạy"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:51:00.031211Z","iopub.status.busy":"2023-06-26T15:51:00.030816Z","iopub.status.idle":"2023-06-26T15:51:00.036153Z","shell.execute_reply":"2023-06-26T15:51:00.034981Z","shell.execute_reply.started":"2023-06-26T15:51:00.031173Z"},"trusted":true},"outputs":[],"source":["from gensim.models.word2vec import LineSentence"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:51:04.581262Z","iopub.status.busy":"2023-06-26T15:51:04.580512Z","iopub.status.idle":"2023-06-26T15:51:07.005096Z","shell.execute_reply":"2023-06-26T15:51:07.004157Z","shell.execute_reply.started":"2023-06-26T15:51:04.581221Z"},"trusted":true},"outputs":[],"source":["# Create a dictionary and a LineSentence object\n","dictionary = Dictionary.load(\"/kaggle/working/bm25_stage1/dict\")\n","corpus = LineSentence(\"/kaggle/working/bm25_stage1/corpus.txt\")"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:59:04.442336Z","iopub.status.busy":"2023-06-26T15:59:04.441363Z","iopub.status.idle":"2023-06-26T16:06:21.971493Z","shell.execute_reply":"2023-06-26T16:06:21.969936Z","shell.execute_reply.started":"2023-06-26T15:59:04.442296Z"},"trusted":true},"outputs":[],"source":["# # Get an iterator over the corpus\n","# corpus_iterator = corpus.getstream()\n","# Create a BM25 model\n","bm25_model = OkapiBM25Model(dictionary=dictionary)\n","# Create a BM25 corpus by applying doc2bow to each document in the iterator\n","bm25_corpus = bm25_model[list(map(dictionary.doc2bow, corpus))]\n","# Get the number of documents in the corpus from the dictionary or by counting them manually \n","num_docs = dictionary.num_docs\n","# Create a BM25 index\n","bm25_index = SparseMatrixSimilarity(bm25_corpus, num_docs=num_docs,\n","                num_terms=len(dictionary),normalize_queries=False, normalize_documents=False)\n","# Save the BM25 index to a file\n","bm25_index.save(\"/kaggle/working/bm25_stage1/bm25_index\")"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"}},"nbformat":4,"nbformat_minor":4}