{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **Twitter Sentiment Analysis**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "import pickle\n",
    "from sklearn.metrics import accuracy_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import pandas as pd "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>clean_text</th>\n",
       "      <th>category</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>when modi promised “minimum government maximum...</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>talk all the nonsense and continue all the dra...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>what did just say vote for modi  welcome bjp t...</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>asking his supporters prefix chowkidar their n...</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>answer who among these the most powerful world...</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          clean_text  category\n",
       "0  when modi promised “minimum government maximum...      -1.0\n",
       "1  talk all the nonsense and continue all the dra...       0.0\n",
       "2  what did just say vote for modi  welcome bjp t...       1.0\n",
       "3  asking his supporters prefix chowkidar their n...       1.0\n",
       "4  answer who among these the most powerful world...       1.0"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('./Twitter_Data.csv' )\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(75682, 3)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**drop unnecessary columns**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[[2,3]].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands and i will murder yo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Positive</td>\n",
       "      <td>I am coming to the borders and I will kill you...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands and i will kill you ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im coming on borderlands and i will murder you...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands 2 and i will murder ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          2                                                  3\n",
       "0  Positive  im getting on borderlands and i will murder yo...\n",
       "1  Positive  I am coming to the borders and I will kill you...\n",
       "2  Positive  im getting on borderlands and i will kill you ...\n",
       "3  Positive  im coming on borderlands and i will murder you...\n",
       "4  Positive  im getting on borderlands 2 and i will murder ..."
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sentiments</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands and i will murder yo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Positive</td>\n",
       "      <td>I am coming to the borders and I will kill you...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands and i will kill you ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im coming on borderlands and i will murder you...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands 2 and i will murder ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  sentiments                                               text\n",
       "0   Positive  im getting on borderlands and i will murder yo...\n",
       "1   Positive  I am coming to the borders and I will kill you...\n",
       "2   Positive  im getting on borderlands and i will kill you ...\n",
       "3   Positive  im coming on borderlands and i will murder you...\n",
       "4   Positive  im getting on borderlands 2 and i will murder ..."
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# df.columns = ['sentiments','text']\n",
    "df.rename(columns={2 : \"sentiments\" , 3 : \"text\"} , inplace= True)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 75682 entries, 0 to 75681\n",
      "Data columns (total 2 columns):\n",
      " #   Column      Non-Null Count  Dtype \n",
      "---  ------      --------------  ----- \n",
      " 0   sentiments  75682 non-null  object\n",
      " 1   text        74996 non-null  object\n",
      "dtypes: object(2)\n",
      "memory usage: 1.2+ MB\n"
     ]
    }
   ],
   "source": [
    "df.info() # to see data types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.isna().sum()\n",
    "df.dropna(inplace= True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sentiments</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands and i will murder yo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Positive</td>\n",
       "      <td>I am coming to the borders and I will kill you...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands and i will kill you ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im coming on borderlands and i will murder you...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands 2 and i will murder ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  sentiments                                               text\n",
       "0   Positive  im getting on borderlands and i will murder yo...\n",
       "1   Positive  I am coming to the borders and I will kill you...\n",
       "2   Positive  im getting on borderlands and i will kill you ...\n",
       "3   Positive  im coming on borderlands and i will murder you...\n",
       "4   Positive  im getting on borderlands 2 and i will murder ..."
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_text(text):\n",
    "    text = text.lower()\n",
    "    text = re.sub(f'http\\S+','',text)\n",
    "    text = re.sub(r'@[a-zA-Z0-9_]+','',text)\n",
    "    text = re.sub(r'#','',text)\n",
    "    text = re.sub(r'[^a-zA-Z\\S]','',text)\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['clean_text'] = df['text'].apply(process_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sentiments</th>\n",
       "      <th>text</th>\n",
       "      <th>clean_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands and i will murder yo...</td>\n",
       "      <td>imgettingonborderlandsandiwillmurderyouall,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Positive</td>\n",
       "      <td>I am coming to the borders and I will kill you...</td>\n",
       "      <td>iamcomingtothebordersandiwillkillyouall,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands and i will kill you ...</td>\n",
       "      <td>imgettingonborderlandsandiwillkillyouall,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im coming on borderlands and i will murder you...</td>\n",
       "      <td>imcomingonborderlandsandiwillmurderyouall,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Positive</td>\n",
       "      <td>im getting on borderlands 2 and i will murder ...</td>\n",
       "      <td>imgettingonborderlands2andiwillmurderyoumeall,</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  sentiments                                               text  \\\n",
       "0   Positive  im getting on borderlands and i will murder yo...   \n",
       "1   Positive  I am coming to the borders and I will kill you...   \n",
       "2   Positive  im getting on borderlands and i will kill you ...   \n",
       "3   Positive  im coming on borderlands and i will murder you...   \n",
       "4   Positive  im getting on borderlands 2 and i will murder ...   \n",
       "\n",
       "                                       clean_text  \n",
       "0     imgettingonborderlandsandiwillmurderyouall,  \n",
       "1        iamcomingtothebordersandiwillkillyouall,  \n",
       "2       imgettingonborderlandsandiwillkillyouall,  \n",
       "3      imcomingonborderlandsandiwillmurderyouall,  \n",
       "4  imgettingonborderlands2andiwillmurderyoumeall,  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sentiments\n",
       "Negative      22624\n",
       "Positive      20932\n",
       "Neutral       18393\n",
       "Irrelevant    13047\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['sentiments'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "count_vectorizer = CountVectorizer(max_features=5000)\n",
    "count_matrix = count_vectorizer.fit_transform(df['clean_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train , X_test , y_train , y_test = train_test_split(count_matrix, df['clean_text'],test_size=0.2 , random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# nb_classifier = MultinomialNB()\n",
    "# nb_classifier.fit(X_train , y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# y_pred = nb_classifier.predict(X_test)\n",
    "# accuracy = accuracy_score(y_test , y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}