### Kaggle link: https://www.kaggle.com/code/noobhocai/train-stage-1

In [1]:
!pip install pyvi rank_bm25 pandarallel gensim --q

[0m

In [2]:
import os
import re
from tqdm.auto import tqdm
tqdm.pandas()
import math
import pandas as pd
import string
from pyvi.ViTokenizer import tokenize
import numpy as np
import json, pickle
from rank_bm25 import BM25Okapi
import argparse
import gc

from glob import glob 
from nltk import word_tokenize as lib_tokenizer 

from pandarallel import pandarallel
from gensim.corpora import Dictionary
from gensim.corpora import MmCorpus
from gensim.models import TfidfModel, OkapiBM25Model
from gensim.similarities import SparseMatrixSimilarity
# Enable pandas `.parallel_apply` through pandarallel: 10 workers, with a progress bar.
pandarallel.initialize(progress_bar=True, nb_workers=10)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
def get_topk(query, topk = 100):
    """Return the `topk` best-scoring wiki passages for `query`.

    Relies on the notebook-level objects `dictionary`, `tfidf_model`,
    `bm25_index` and `df_wiki`; position ``i`` in the index is looked up as
    row ``i`` of `df_wiki`.

    Args:
        query: Query string; it is split on whitespace to obtain tokens.
        topk: Maximum number of passages to return.

    Returns:
        A tuple ``(titles, texts, scores)``: two lists taken from the `title`
        and `text` columns of `df_wiki`, and the matching slice of the score
        array, all ordered from highest to lowest score.
    """
    # Bag-of-words for the query, re-weighted by the query-side tf-idf model.
    query_bow = dictionary.doc2bow(query.split())
    scores = bm25_index[tfidf_model[query_bow]]

    # Indices sorted by descending score, truncated to the requested depth.
    best = np.argsort(scores)[::-1][:topk]

    all_titles = df_wiki.title.values
    all_texts = df_wiki.text.values
    titles = [all_titles[idx] for idx in best]
    texts = [all_texts[idx] for idx in best]
    return titles, texts, scores[best]

def post_process(x):
    """Normalise a passage for BM25 indexing.

    Whitespace is collapsed with `strip_context`, each word is tokenised with
    `word_tokenize`, the pieces are re-joined with single spaces, and every
    character found in `string.punctuation` (ASCII punctuation) is removed.

    Args:
        x: Raw text.

    Returns:
        The cleaned text as a single string.
    """
    pieces = word_tokenize(strip_context(x))
    joined = " ".join(pieces).strip().replace("\n", " ")
    return "".join(ch for ch in joined if ch not in string.punctuation)

# Cache: raw whitespace-delimited word -> its tokenised, quote-normalised form.
dict_map = {}
def word_tokenize(text):
    """Tokenise `text` word by word, memoising the result for each distinct word.

    Each whitespace-separated word is passed through `lib_tokenizer`; the
    resulting tokens are joined with spaces and the `` and '' quote markers
    are replaced by a plain double quote. Results are stored in the
    module-level `dict_map` so a repeated word is tokenised only once.

    Args:
        text: Input string.

    Returns:
        A list with one normalised string per whitespace-separated word.
    """
    normalized = []
    for token in text.split():
        cached = dict_map.get(token)
        if cached is None:
            cached = ' '.join(lib_tokenizer(token)).replace('``', '"').replace("''", '"')
            dict_map[token] = cached
        normalized.append(cached)
    return normalized
 
def strip_context(text):
    """Collapse all whitespace in `text` to single spaces and trim both ends.

    Args:
        text: Input string, possibly containing newlines or repeated whitespace.

    Returns:
        The text on a single line with no leading or trailing whitespace.
    """
    single_line = text.replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', single_line)
    return collapsed.strip()

In [4]:
# CSV of cleaned Wikipedia passages; loaded into `df_wiki` below.
wiki_cleaned_path = "/kaggle/input/e2eqa-wiki-zalo-ai/processed/wikipedia_20220620_cleaned_v2.csv"
# Sample-submission JSON for the test set; not read by any of the visible cells.
test_data_path =  "/kaggle/input/e2eqa-wiki-zalo-ai/e2eqa-trainpublic_test-v1/e2eqa-train+public_test-v1/zac2022_testa_sample_submission.json"
# Retrieval depth. Not used in the visible cells (get_topk defaults to 100) —
# presumably passed to get_topk later; TODO confirm.
topk = 300

In [5]:
# Load the cleaned wiki passages from the CSV configured in `wiki_cleaned_path`.
df_wiki = pd.read_csv(wiki_cleaned_path)

In [6]:
# Replace every missing value with the literal string "NaN" — presumably so the
# string processing below never receives a float NaN.
df_wiki = df_wiki.fillna("NaN")
# Fallback for a CSV without a `title` column: copy it from a column named "titles=".
# NOTE(review): "titles=" looks like a typo — confirm the real column name. The
# trailing .fillna("") is a no-op here because missing values were already filled above.
if "title" not in df_wiki.columns:
    df_wiki["title"] = df_wiki["titles="].fillna("")

In [7]:
# Preview the first rows to check the loaded columns.
df_wiki.head()

Unnamed: 0,title,text,bm25_text
0,Trang Chính,"Trang Chính\n\n<templatestyles src=""Wiki2021/s...",trang chính <templatestyles src= wiki2021 styl...
1,Internet Society,Internet Society hay ISOC là một tổ chức quốc...,internet society hay isoc là một tổ chức quốc ...
2,Tiếng Việt,"Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...",tiếng việt cũng gọi là tiếng việt nam hay việt...
3,Tiếng Việt,"hệ thống thanh điệu phát triển cao hơn, hệ thố...",hệ thống thanh điệu phát triển cao hơn hệ thốn...
4,Tiếng Việt,tiếp xúc Hán – Việt thành 2 giai đoạn chính: \...,tiếp xúc hán – việt thành 2 giai đoạn chính bu...


In [8]:
# Clean the BM25 text of every passage in parallel with `post_process`
# (whitespace collapse, tokenisation, ASCII punctuation removal).
# The column is overwritten in place, so re-running this cell feeds already-cleaned
# text through `post_process` again.
df_wiki['bm25_text'] = df_wiki['bm25_text'].parallel_apply(post_process)
# corpus = [x.split() for x in df_wiki['bm25_text'].values]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=194441), Label(value='0 / 194441')…

In [10]:
# Pull the cleaned passages out of the frame as a NumPy array.
texts = df_wiki['bm25_text'].to_numpy()
# Start from an empty gensim Dictionary.
dictionary = Dictionary()
# Grow the vocabulary one passage at a time; tokens are the whitespace-separated words.
# Each call passes a single-document list to add_documents.
for text in tqdm(texts): 
    dictionary.add_documents([text.split()])

  0%|          | 0/1944406 [00:00<?, ?it/s]

In [None]:
# dictionary = Dictionary(corpus)

In [14]:
# Tokenise every passage into a list of words, held fully in memory.
# NOTE(review): `corpus` is not read again in the visible cells before it is rebound
# to a LineSentence, so this cell may be unnecessary — confirm before removing.
try:
    corpus = [text.split() for text in texts]
except Exception as exc:
    # Best-effort: report the failure and carry on. Catching Exception rather than
    # using a bare `except:` lets KeyboardInterrupt / SystemExit propagate, so the
    # cell can still be interrupted, and the message says what actually went wrong.
    print(f"nope: could not build in-memory corpus ({type(exc).__name__}: {exc})")

In [17]:
# Create the output directory for the BM25 artefacts; exist_ok=True makes the cell
# safe to re-run (a plain `mkdir` errors once the directory exists).
os.makedirs("/kaggle/working/bm25_stage1", exist_ok=True)

Lưu dictionary và corpus xuống đĩa (thư mục `/kaggle/working/bm25_stage1`)

In [19]:
# Write the corpus to disk, one passage per line.
# The encoding is pinned to UTF-8 so the Vietnamese text does not depend on the
# platform's default locale encoding.
with open('/kaggle/working/bm25_stage1/corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines(text + '\n' for text in texts)

In [18]:
# Query-side weighting model built from the dictionary; `get_topk` applies it to the
# query bag-of-words before scoring against the BM25 index.
tfidf_model = TfidfModel(dictionary=dictionary, smartirs='bnn')  # Enforce binary weighting of queries
# Persist the vocabulary and the query model in the output directory.
dictionary.save("/kaggle/working/bm25_stage1/dict")
tfidf_model.save("/kaggle/working/bm25_stage1/tfidf")

---

Dừng session, sau đó khởi động lại và chạy tiếp từ đây (cần chạy lại cell import ở đầu notebook trước)

In [9]:
from gensim.models.word2vec import LineSentence

In [10]:
# Reload the saved vocabulary and wrap the corpus file in a LineSentence object.
# NOTE(review): `tfidf_model` and `df_wiki`, which get_topk relies on, are not
# reloaded in the visible cells of this resumed session — confirm they are restored elsewhere.
dictionary = Dictionary.load("/kaggle/working/bm25_stage1/dict")
corpus = LineSentence("/kaggle/working/bm25_stage1/corpus.txt")

In [13]:
# # Get an iterator over the corpus
# corpus_iterator = corpus.getstream()
# Create an Okapi BM25 weighting model from the loaded dictionary
bm25_model = OkapiBM25Model(dictionary=dictionary)
# Convert every corpus document to bag-of-words (materialised as a list in memory)
# and apply the BM25 weights.
# NOTE(review): gensim's LineSentence appears to skip blank lines and to split lines
# longer than its max_sentence_length; either would shift row numbers relative to
# df_wiki, which get_topk indexes by position — confirm corpus.txt has no such lines.
bm25_corpus = bm25_model[list(map(dictionary.doc2bow, corpus))]
# Document count as recorded by the dictionary
num_docs = dictionary.num_docs
# Build the sparse similarity index over the BM25 vectors; normalisation is switched
# off for both queries and documents.
bm25_index = SparseMatrixSimilarity(bm25_corpus, num_docs=num_docs,
                num_terms=len(dictionary),normalize_queries=False, normalize_documents=False)
# Save the BM25 index to a file
bm25_index.save("/kaggle/working/bm25_stage1/bm25_index")