{ "cells": [ { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "import torch" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# Load model directly\n", "from transformers import AutoModel,AutoTokenizer\n", "model = AutoModel.from_pretrained(\"ai4bharat/indic-bert\")\n", "tokenizer = AutoTokenizer.from_pretrained(\"ai4bharat/indic-bert\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "param ={\n", " 'maxLen' :256,\n", "}" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# keras.backend.set_image_data_format('channels_last')\n", "# from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "\n", "import numpy as np\n", "\n", "def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0):\n", " padded_sequences = []\n", " for seq in sequences:\n", " if padding == 'pre':\n", " padded_seq = np.pad(seq, (maxlen - len(seq), 0), 'constant', constant_values=value)\n", " elif padding == 'post':\n", " padded_seq = np.pad(seq, (0, maxlen - len(seq)), 'constant', constant_values=value)\n", " else:\n", " raise ValueError(\"Padding should be 'pre' or 'post'.\")\n", "\n", " if truncating == 'pre':\n", " padded_seq = padded_seq[-maxlen:]\n", " elif truncating == 'post':\n", " padded_seq = padded_seq[:maxlen]\n", " else:\n", " raise ValueError(\"Truncating should be 'pre' or 'post'.\")\n", "\n", " padded_sequences.append(padded_seq)\n", "\n", " return np.array(padded_sequences, dtype=dtype)\n", "\n", "\n", "def create_attention_masks(input_ids):\n", " attention_masks = []\n", " for seq in tqdm(input_ids):\n", " seq_mask = [float(i>0) for i in seq]\n", " attention_masks.append(seq_mask)\n", " return np.array(attention_masks)\n", "\n", "def getFeaturesandLabel(single_string, label):\n", " # Wrap the single string in a list\n", " sentences = [\"[CLS] \" + single_string + \" [SEP]\"]\n", "\n", " # Tokenize and preprocess\n", " tokenizer_texts = list(map(lambda t: tokenizer.tokenize(t)[:512], tqdm(sentences)))\n", " input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tqdm(tokenizer_texts)]\n", "\n", " # Pad sequences and create attention masks\n", " input_ids = pad_sequences(sequences=input_ids, maxlen=param['maxLen'], dtype='long', padding='post', truncating='post')\n", " attention_masks_data = create_attention_masks(input_ids)\n", "\n", " # Convert to torch tensors\n", " X_data = torch.tensor(input_ids)\n", " attention_masks_data = torch.tensor(attention_masks_data)\n", " y_data = torch.tensor(label)\n", "\n", " return X_data, attention_masks_data, y_data\n" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "with open('classifier.pkl','rb') as file:\n", " clf=pickle.load(file)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1/1 [00:00<00:00, 260.61it/s]\n", "100%|██████████| 1/1 [00:00<00:00, 6765.01it/s]\n", "100%|██████████| 1/1 [00:00<00:00, 1562.12it/s]\n" ] } ], "source": [ "text_input=\"థమ్స్ అప్ ఓకే బాయ్\"\n", "# from app import predict\n", "# text_input=predict.input_string\n", "label_input = [1]\n", "X_data, attention_masks_data, y_data = getFeaturesandLabel(text_input, label_input)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "torch.save(X_data, 'X_data')" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "def get_tensor_variable():\n", " # Define a PyTorch tensor variable\n", " return X_data\n", "\n", "# Save the variable to a file (optional, but can be useful)\n", "torch.save(get_tensor_variable(), 'X_data.pth')\n" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[ 2, 29913, 3780, 4491, 24484, 884, 1399, 129604, 8,\n", " 3, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0]])\n" ] } ], "source": [ "print(X_data)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1]\n" ] } ], "source": [ "prediction = clf.predict(X_data)\n", "print(prediction)" ] } ], "metadata": { "kernelspec": { "display_name": "env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 2 }