{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the ai4bharat/indic-bert checkpoint and its tokenizer.\n",
    "from transformers import AutoModel,AutoTokenizer\n",
    "\n",
    "# Single source of truth so the model and tokenizer always come from the same checkpoint.\n",
    "MODEL_NAME = \"ai4bharat/indic-bert\"\n",
    "model = AutoModel.from_pretrained(MODEL_NAME)\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "param ={\n",
    "    'maxLen' :256,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# keras.backend.set_image_data_format('channels_last')\n",
    "# from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0):\n",
    "    padded_sequences = []\n",
    "    for seq in sequences:\n",
    "        if padding == 'pre':\n",
    "            padded_seq = np.pad(seq, (maxlen - len(seq), 0), 'constant', constant_values=value)\n",
    "        elif padding == 'post':\n",
    "            padded_seq = np.pad(seq, (0, maxlen - len(seq)), 'constant', constant_values=value)\n",
    "        else:\n",
    "            raise ValueError(\"Padding should be 'pre' or 'post'.\")\n",
    "\n",
    "        if truncating == 'pre':\n",
    "            padded_seq = padded_seq[-maxlen:]\n",
    "        elif truncating == 'post':\n",
    "            padded_seq = padded_seq[:maxlen]\n",
    "        else:\n",
    "            raise ValueError(\"Truncating should be 'pre' or 'post'.\")\n",
    "\n",
    "        padded_sequences.append(padded_seq)\n",
    "\n",
    "    return np.array(padded_sequences, dtype=dtype)\n",
    "\n",
    "\n",
    "def create_attention_masks(input_ids):\n",
    "    attention_masks = []\n",
    "    for seq in tqdm(input_ids):\n",
    "        seq_mask = [float(i>0) for i in seq]\n",
    "        attention_masks.append(seq_mask)\n",
    "    return np.array(attention_masks)\n",
    "\n",
    "def getFeaturesandLabel(single_string, label):\n",
    "    # Wrap the single string in a list\n",
    "    sentences = [\"[CLS] \" + single_string + \" [SEP]\"]\n",
    "\n",
    "    # Tokenize and preprocess\n",
    "    tokenizer_texts = list(map(lambda t: tokenizer.tokenize(t)[:512], tqdm(sentences)))\n",
    "    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tqdm(tokenizer_texts)]\n",
    "\n",
    "    # Pad sequences and create attention masks\n",
    "    input_ids = pad_sequences(sequences=input_ids, maxlen=param['maxLen'], dtype='long', padding='post', truncating='post')\n",
    "    attention_masks_data = create_attention_masks(input_ids)\n",
    "\n",
    "    # Convert to torch tensors\n",
    "    X_data = torch.tensor(input_ids)\n",
    "    attention_masks_data = torch.tensor(attention_masks_data)\n",
    "    y_data = torch.tensor(label)\n",
    "\n",
    "    return X_data, attention_masks_data, y_data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open('classifier.pkl','rb') as file:\n",
    "  clf=pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1/1 [00:00<00:00, 260.61it/s]\n",
      "100%|██████████| 1/1 [00:00<00:00, 6765.01it/s]\n",
      "100%|██████████| 1/1 [00:00<00:00, 1562.12it/s]\n"
     ]
    }
   ],
   "source": [
    "text_input=\"థమ్స్ అప్ ఓకే బాయ్\"\n",
    "# from app import predict\n",
    "# text_input=predict.input_string\n",
    "label_input = [1]\n",
    "X_data, attention_masks_data, y_data = getFeaturesandLabel(text_input, label_input)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "torch.save(X_data, 'X_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tensor_variable():\n",
    "    # Define a PyTorch tensor variable\n",
    "    return X_data\n",
    "\n",
    "# Save the variable to a file (optional, but can be useful)\n",
    "torch.save(get_tensor_variable(), 'X_data.pth')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[     2,  29913,   3780,   4491,  24484,    884,   1399, 129604,      8,\n",
      "              3,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0,      0,      0,      0,      0,      0,\n",
      "              0,      0,      0,      0]])\n"
     ]
    }
   ],
   "source": [
    "print(X_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1]\n"
     ]
    }
   ],
   "source": [
    "prediction = clf.predict(X_data)\n",
    "print(prediction)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}