Spaces:

foxxy-hm
/

e2eqa-wiki

Build error

File size: 12,775 Bytes

c5d2283

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install pandas \n",
    "# !pip install tdqm\n",
    "# !pip install pandarallel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import string\n",
    "import math\n",
    "import json\n",
    "import pandas as pd\n",
    "from tqdm.auto import tqdm\n",
    "tqdm.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "_WORD_SPLIT = re.compile(\"([.,!?\\\"/':;)(])\")\n",
    "_DIGIT_RE = re.compile(br\"\\d\")\n",
    "STOP_WORDS = \"\\\" \\' [ ] . , ! : ; ?\".split(\" \")\n",
    "\n",
    "def basic_tokenizer(sentence):\n",
    "    words = []\n",
    "    for space_separated_fragment in sentence.strip().split():\n",
    "        words.extend(_WORD_SPLIT.split(space_separated_fragment))\n",
    "    return [w.lower() for w in words if w != '' and w != ' ' and w not in string.punctuation]\n",
    "\n",
    "def remove_appending_title(text,title):\n",
    "    return text.replace(f\"{title}\\n\\n{title}\",f\"{title} \")\n",
    "\n",
    "def create_sliding_window(text, size=256, overlap=32):\n",
    "    actual_size = size - overlap\n",
    "    windows = []\n",
    "    n_windows = math.ceil(len(text)/actual_size)\n",
    "    for i in range(n_windows):\n",
    "        windows.append(\" \".join(text[i*actual_size:i*actual_size + size]))\n",
    "    return windows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "16dcb3eb371e4084be66ee507040fe18",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "all_titles = []\n",
    "all_texts = []\n",
    "all_bm25_texts = []\n",
    "with open(\"../data/raw/wikipedia_20220620_cleaned/wikipedia_20220620_cleaned.jsonl\", encoding=\"utf-8\") as f:\n",
    "    for i,line in tqdm(enumerate(f)):\n",
    "        x = json.loads(line)\n",
    "        text = remove_appending_title(x[\"text\"],x[\"title\"])\n",
    "        text = text.split(\" \")\n",
    "        sliding_windows = create_sliding_window(text)\n",
    "        all_texts.extend(sliding_windows)\n",
    "        # all_bm25_texts.extend(bm25_windows)\n",
    "        all_titles.extend([x['title'],]*len(sliding_windows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Trang Chính</td>\n",
       "      <td>Trang Chính\\n\\n&lt;templatestyles src=\"Wiki2021/s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Internet Society</td>\n",
       "      <td>Internet Society  hay ISOC là một tổ chức quốc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Tiếng Việt</td>\n",
       "      <td>Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Tiếng Việt</td>\n",
       "      <td>hệ thống thanh điệu phát triển cao hơn, hệ thố...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Tiếng Việt</td>\n",
       "      <td>tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1944401</th>\n",
       "      <td>VNOI</td>\n",
       "      <td>chuyên viên của các tập đoàn công nghệ lớn như...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1944402</th>\n",
       "      <td>Vòng hoa</td>\n",
       "      <td>Vòng hoa  (; IPA: ) là một loại hoa, lá, quả, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1944403</th>\n",
       "      <td>Vòng hoa</td>\n",
       "      <td>vật trong thần thoại La Mã và Hy Lạp được miêu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1944404</th>\n",
       "      <td>Vòng hoa</td>\n",
       "      <td>một vai trò nổi bật, tương tự như ở Anh.\\n\\n==...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1944405</th>\n",
       "      <td>Vòng hoa</td>\n",
       "      <td>trên giá đỡ vòng hoa thắp sáng những ngọn nến ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1944406 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    title                                               text\n",
       "0             Trang Chính  Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s...\n",
       "1        Internet Society  Internet Society  hay ISOC là một tổ chức quốc...\n",
       "2              Tiếng Việt  Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...\n",
       "3              Tiếng Việt  hệ thống thanh điệu phát triển cao hơn, hệ thố...\n",
       "4              Tiếng Việt  tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\...\n",
       "...                   ...                                                ...\n",
       "1944401              VNOI  chuyên viên của các tập đoàn công nghệ lớn như...\n",
       "1944402          Vòng hoa  Vòng hoa  (; IPA: ) là một loại hoa, lá, quả, ...\n",
       "1944403          Vòng hoa  vật trong thần thoại La Mã và Hy Lạp được miêu...\n",
       "1944404          Vòng hoa  một vai trò nổi bật, tương tự như ở Anh.\\n\\n==...\n",
       "1944405          Vòng hoa  trên giá đỡ vòng hoa thắp sáng những ngọn nến ...\n",
       "\n",
       "[1944406 rows x 2 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame()\n",
    "df[\"title\"] = all_titles\n",
    "df[\"text\"] = all_texts\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandarallel import pandarallel\n",
    "\n",
    "pandarallel.initialize(progress_bar=True, use_memory_fs=False, nb_workers=6) \n",
    "\n",
    "def apply_tokenizer(df, num_chunks):\n",
    "    chunk_sr = pd.Series()\n",
    "    i = 0\n",
    "    end = df.shape[0]\n",
    "    chunk_size = math.floor(end/num_chunks)\n",
    "    while i != end:\n",
    "        if end - i < chunk_size:\n",
    "            chunk_size = end - i\n",
    "#         print((i,i+chunk_size))\n",
    "        temp_df = df[i:i+chunk_size].copy()\n",
    "        chunk_sr = pd.concat([chunk_sr, temp_df[\"text\"].parallel_apply(lambda x: \" \".join(basic_tokenizer(x)))], ignore_index=True)\n",
    "        i+=chunk_size\n",
    "    return chunk_sr\n",
    "\n",
    "df[\"bm25_text\"] = apply_tokenizer(df, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>text</th>\n",
       "      <th>bm25_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Trang Chính</td>\n",
       "      <td>Trang Chính\\n\\n&lt;templatestyles src=\"Wiki2021/s...</td>\n",
       "      <td>trang chính &lt;templatestyles src= wiki2021 styl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Internet Society</td>\n",
       "      <td>Internet Society  hay ISOC là một tổ chức quốc...</td>\n",
       "      <td>internet society hay isoc là một tổ chức quốc ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Tiếng Việt</td>\n",
       "      <td>Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...</td>\n",
       "      <td>tiếng việt cũng gọi là tiếng việt nam hay việt...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Tiếng Việt</td>\n",
       "      <td>hệ thống thanh điệu phát triển cao hơn, hệ thố...</td>\n",
       "      <td>hệ thống thanh điệu phát triển cao hơn hệ thốn...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Tiếng Việt</td>\n",
       "      <td>tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\...</td>\n",
       "      <td>tiếp xúc hán – việt thành 2 giai đoạn chính bu...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              title                                               text  \\\n",
       "0       Trang Chính  Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s...   \n",
       "1  Internet Society  Internet Society  hay ISOC là một tổ chức quốc...   \n",
       "2        Tiếng Việt  Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...   \n",
       "3        Tiếng Việt  hệ thống thanh điệu phát triển cao hơn, hệ thố...   \n",
       "4        Tiếng Việt  tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\...   \n",
       "\n",
       "                                           bm25_text  \n",
       "0  trang chính <templatestyles src= wiki2021 styl...  \n",
       "1  internet society hay isoc là một tổ chức quốc ...  \n",
       "2  tiếng việt cũng gọi là tiếng việt nam hay việt...  \n",
       "3  hệ thống thanh điệu phát triển cao hơn hệ thốn...  \n",
       "4  tiếp xúc hán – việt thành 2 giai đoạn chính bu...  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1944406, 3)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(\"../data/processed/wikipedia_20220620_cleaned_v2.csv\",index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}