File size: 35,137 Bytes

6af88c1

{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "633fetsKg5cv",
        "outputId": "379a3769-9478-4749-cc71-bbf46e6478f9"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Collecting transformers\n",
            "  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)\n",
            "\u001b[K     |████████████████████████████████| 2.9 MB 5.2 MB/s \n",
            "\u001b[?25hCollecting pyyaml>=5.1\n",
            "  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n",
            "\u001b[K     |████████████████████████████████| 596 kB 37.4 MB/s \n",
            "\u001b[?25hCollecting huggingface-hub>=0.0.17\n",
            "  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)\n",
            "\u001b[K     |████████████████████████████████| 56 kB 4.7 MB/s \n",
            "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.0)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n",
            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.3)\n",
            "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.8.1)\n",
            "Collecting sacremoses\n",
            "  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n",
            "\u001b[K     |████████████████████████████████| 895 kB 41.5 MB/s \n",
            "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.3.0)\n",
            "Collecting tokenizers<0.11,>=0.10.1\n",
            "  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n",
            "\u001b[K     |████████████████████████████████| 3.3 MB 26.2 MB/s \n",
            "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.17->transformers) (3.7.4.3)\n",
            "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (2.4.7)\n",
            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.6.0)\n",
            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n",
            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n",
            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n",
            "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n",
            "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n",
            "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n",
            "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n",
            "  Attempting uninstall: pyyaml\n",
            "    Found existing installation: PyYAML 3.13\n",
            "    Uninstalling PyYAML-3.13:\n",
            "      Successfully uninstalled PyYAML-3.13\n",
            "Successfully installed huggingface-hub-0.0.19 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.11.3\n"
          ]
        }
      ],
      "source": [
        "# Pin the release this notebook was run against (see the recorded install log) and use\n",
        "# %pip so the package is installed into the environment of the running kernel.\n",
        "%pip install -q transformers==4.11.3"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 37,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9pi31_2cndZU",
        "outputId": "f04cc4a8-7baf-404c-d059-66675a6dda63"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']\n",
            "- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
            "- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
            "Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']\n",
            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
          ]
        }
      ],
      "source": [
        "import tensorflow as tf\n",
        "import json\n",
        "from transformers import AutoConfig, AutoTokenizer, TFAutoModelForSequenceClassification\n",
        "\n",
        "# Checkpoint to fine-tune; change this single constant to switch models.\n",
        "# Alternative previously kept here as commented-out code: 'malay-huggingface/bert-base-bahasa-cased'\n",
        "MODEL_NAME = 'malay-huggingface/bert-tiny-bahasa-cased'\n",
        "\n",
        "# Attach readable class names so predictions are reported as 'negative'/'positive'.\n",
        "config = AutoConfig.from_pretrained(\n",
        "    MODEL_NAME,\n",
        "    id2label={\"0\": \"negative\", \"1\": \"positive\"},\n",
        "    label2id={\"negative\": 0, \"positive\": 1},\n",
        ")\n",
        "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
        "# from_pt=True: the checkpoint provides PyTorch weights, which are converted to TensorFlow on load.\n",
        "model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, from_pt=True, config=config)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "id": "6mkizKwiJFeZ"
      },
      "outputs": [],
      "source": [
        "import pandas as pd"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 422
        },
        "id": "kgMs04IDJx2z",
        "outputId": "6ba3687d-4ac9-48f6-a275-1a652a073dcc"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>label</th>\n",
              "      <th>text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Negative</td>\n",
              "      <td>Lebih-lebih lagi dengan  kemudahan internet da...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Positive</td>\n",
              "      <td>boleh memberi teguran kepada parti tetapi perl...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Negative</td>\n",
              "      <td>Adalah membingungkan mengapa masyarakat Cina b...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Positive</td>\n",
              "      <td>Kami menurunkan defisit daripada 6.7 peratus p...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Negative</td>\n",
              "      <td>Ini masalahnya. Bukan rakyat, tetapi sistem</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3680</th>\n",
              "      <td>Positive</td>\n",
              "      <td>Jelas pembangkang buat tuduhan untuk mengeliru...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3681</th>\n",
              "      <td>Positive</td>\n",
              "      <td>demokrasi adalah kuasa rakyat di mana pegawai ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3682</th>\n",
              "      <td>Positive</td>\n",
              "      <td>Selain dapat menyelesaikan isu beg berat, peng...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3683</th>\n",
              "      <td>Positive</td>\n",
              "      <td>Hospital Langkawi buat masa ini hanya dapat me...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3684</th>\n",
              "      <td>Positive</td>\n",
              "      <td>Jika sebelum ini kita selesa bergerak dalam ‘g...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>3685 rows × 2 columns</p>\n",
              "</div>"
            ],
            "text/plain": [
              "         label                                               text\n",
              "0     Negative  Lebih-lebih lagi dengan  kemudahan internet da...\n",
              "1     Positive  boleh memberi teguran kepada parti tetapi perl...\n",
              "2     Negative  Adalah membingungkan mengapa masyarakat Cina b...\n",
              "3     Positive  Kami menurunkan defisit daripada 6.7 peratus p...\n",
              "4     Negative        Ini masalahnya. Bukan rakyat, tetapi sistem\n",
              "...        ...                                                ...\n",
              "3680  Positive  Jelas pembangkang buat tuduhan untuk mengeliru...\n",
              "3681  Positive  demokrasi adalah kuasa rakyat di mana pegawai ...\n",
              "3682  Positive  Selain dapat menyelesaikan isu beg berat, peng...\n",
              "3683  Positive  Hospital Langkawi buat masa ini hanya dapat me...\n",
              "3684  Positive  Jika sebelum ini kita selesa bergerak dalam ‘g...\n",
              "\n",
              "[3685 rows x 2 columns]"
            ]
          },
          "execution_count": 5,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "sentiment_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malaya/master/finetune/sentiment-data-v2.csv\")\n",
        "sentiment_df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "id": "hEfJHRjEo1uk"
      },
      "outputs": [],
      "source": [
        "# Encode the string labels as integers (1 = positive, 0 = negative).\n",
        "# The identity entries (1: 1, 0: 0) keep this cell idempotent: re-running it on labels that\n",
        "# are already encoded leaves them unchanged instead of mapping every value to NaN.\n",
        "sentiment_df[\"label\"] = sentiment_df[\"label\"].map({'Positive': 1, 'Negative': 0, 1: 1, 0: 0})\n",
        "\n",
        "positive_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/polarity/polarity-positive-translated.txt\", names=[\"text\"])\n",
        "positive_df[\"label\"] = 1\n",
        "\n",
        "negative_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/polarity/polarity-negative-translated.txt\", names=[\"text\"])\n",
        "negative_df[\"label\"] = 0"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "id": "iciAB9tss4tW"
      },
      "outputs": [],
      "source": [
        "amazon_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-amazon.json\", orient='index').T\n",
        "yelp_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-yelp.json\", orient='index').T\n",
        "imdb_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-imdb.json\", orient='index').T\n",
        "\n",
        "def process_json_df(df):\n",
        "  \"\"\"Flatten a frame with 'positive' and 'negative' text columns into labelled rows.\n",
        "\n",
        "  Returns a frame with a 'text' column and an integer 'label' column\n",
        "  (1 for rows taken from 'positive', 0 for rows taken from 'negative'),\n",
        "  positives first. Missing entries are dropped per column and the\n",
        "  original index values are kept.\n",
        "  \"\"\"\n",
        "  labelled_parts = []\n",
        "  for column_name, label_value in ((\"positive\", 1), (\"negative\", 0)):\n",
        "    part = (\n",
        "        df[[column_name]]\n",
        "        .dropna()\n",
        "        .rename(columns={column_name: \"text\"})\n",
        "        .assign(label=label_value)\n",
        "    )\n",
        "    labelled_parts.append(part)\n",
        "\n",
        "  return pd.concat(labelled_parts)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 422
        },
        "id": "GRX3doXvvqjw",
        "outputId": "6c202e02-04d9-4560-8c16-d44163d92ce6"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>label</th>\n",
              "      <th>text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0</td>\n",
              "      <td>Lebih-lebih lagi dengan  kemudahan internet da...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1</td>\n",
              "      <td>boleh memberi teguran kepada parti tetapi perl...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>0</td>\n",
              "      <td>Adalah membingungkan mengapa masyarakat Cina b...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>1</td>\n",
              "      <td>Kami menurunkan defisit daripada 6.7 peratus p...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>0</td>\n",
              "      <td>Ini masalahnya. Bukan rakyat, tetapi sistem</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>16720</th>\n",
              "      <td>0</td>\n",
              "      <td>dalam satu perkataan, ia memalukan.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>16721</th>\n",
              "      <td>0</td>\n",
              "      <td>Saya tidak pernah keluar dari filem dengan pan...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>16722</th>\n",
              "      <td>0</td>\n",
              "      <td>saya hanya bosan menonton jessica lange mengam...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>16723</th>\n",
              "      <td>0</td>\n",
              "      <td>semua dalam satu penghinaan terhadap kecerdasa...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>16724</th>\n",
              "      <td>0</td>\n",
              "      <td>yang ingin melayari gelombang kecil filem angk...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>16725 rows × 2 columns</p>\n",
              "</div>"
            ],
            "text/plain": [
              "       label                                               text\n",
              "0          0  Lebih-lebih lagi dengan  kemudahan internet da...\n",
              "1          1  boleh memberi teguran kepada parti tetapi perl...\n",
              "2          0  Adalah membingungkan mengapa masyarakat Cina b...\n",
              "3          1  Kami menurunkan defisit daripada 6.7 peratus p...\n",
              "4          0        Ini masalahnya. Bukan rakyat, tetapi sistem\n",
              "...      ...                                                ...\n",
              "16720      0                dalam satu perkataan, ia memalukan.\n",
              "16721      0  Saya tidak pernah keluar dari filem dengan pan...\n",
              "16722      0  saya hanya bosan menonton jessica lange mengam...\n",
              "16723      0  semua dalam satu penghinaan terhadap kecerdasa...\n",
              "16724      0  yang ingin melayari gelombang kecil filem angk...\n",
              "\n",
              "[16725 rows x 2 columns]"
            ]
          },
          "execution_count": 8,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# df = pd.concat([sentiment_df, positive_df, negative_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n",
        "# df = pd.concat([sentiment_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n",
        "df = pd.concat([sentiment_df, positive_df, negative_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n",
        "\n",
        "df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "FeWmvyotp9RP",
        "outputId": "c3b34cb1-28d6-4c60-a4f0-778bd398ba02"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "13380\n",
            "3345\n"
          ]
        }
      ],
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# sentences = sarcasm_df[\"headline\"].tolist()\n",
        "# labels = sarcasm_df[\"is_sarcastic\"].tolist()\n",
        "\n",
        "\n",
        "sentences = df[\"text\"].tolist()\n",
        "labels = df[\"label\"].tolist()\n",
        "\n",
        "training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(sentences, labels, train_size=0.8, random_state=1)\n",
        "\n",
        "print(len(training_sentences))\n",
        "print(len(validation_sentences))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "KCxtcxObndZk",
        "outputId": "0c3de610-02d1-4a8f-f7bf-993e1f644d63",
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
          ]
        }
      ],
      "source": [
        "# The tokenizer has no predefined maximum length, so `truncation=True` on its own makes it\n",
        "# warn and skip truncation. Cap inputs at the number of positions the model supports.\n",
        "MAX_LENGTH = config.max_position_embeddings\n",
        "\n",
        "train_encodings = tokenizer(training_sentences, truncation=True, padding=True, max_length=MAX_LENGTH)\n",
        "val_encodings = tokenizer(validation_sentences, truncation=True, padding=True, max_length=MAX_LENGTH)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "id": "Tg7zcOpVndZm",
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [],
      "source": [
        "train_dataset = tf.data.Dataset.from_tensor_slices((\n",
        "    dict(train_encodings),\n",
        "    training_labels\n",
        "))\n",
        "\n",
        "val_dataset = tf.data.Dataset.from_tensor_slices((\n",
        "    dict(val_encodings),\n",
        "    validation_labels\n",
        "))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "id": "vfwrq3eMXDi1"
      },
      "outputs": [],
      "source": [
        "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
        "\n",
        "# restore_best_weights=True: when training is stopped early, roll the model back to the\n",
        "# epoch with the lowest val_loss instead of keeping the weights of the last epoch run.\n",
        "es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3, restore_best_weights=True)\n",
        "# mc = ModelCheckpoint('best_model', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "8_gjepLSndZq",
        "outputId": "3091b5d2-40c6-4cfd-82fd-fcbc094cbc3b",
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Epoch 1/10\n",
            "837/837 [==============================] - 91s 95ms/step - loss: 0.5531 - accuracy: 0.7115 - val_loss: 0.5028 - val_accuracy: 0.7474\n",
            "Epoch 2/10\n",
            "837/837 [==============================] - 78s 93ms/step - loss: 0.4301 - accuracy: 0.8006 - val_loss: 0.4745 - val_accuracy: 0.7731\n",
            "Epoch 3/10\n",
            "837/837 [==============================] - 78s 93ms/step - loss: 0.3201 - accuracy: 0.8635 - val_loss: 0.5232 - val_accuracy: 0.7773\n",
            "Epoch 4/10\n",
            "837/837 [==============================] - 78s 93ms/step - loss: 0.2226 - accuracy: 0.9113 - val_loss: 0.5835 - val_accuracy: 0.7611\n",
            "Epoch 5/10\n",
            "837/837 [==============================] - 78s 93ms/step - loss: 0.1604 - accuracy: 0.9389 - val_loss: 0.6551 - val_accuracy: 0.7638\n",
            "Epoch 00005: early stopping\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7efdb1594e10>"
            ]
          },
          "execution_count": 13,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "BATCH_SIZE = 16\n",
        "\n",
        "optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)\n",
        "model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])\n",
        "\n",
        "# The datasets are batched here, so `batch_size` is not passed to fit() as well.\n",
        "# The shuffle buffer spans the whole training set; validation data needs no shuffling.\n",
        "history = model.fit(train_dataset.shuffle(len(training_labels)).batch(BATCH_SIZE),\n",
        "                    epochs=10,\n",
        "                    callbacks=[es],\n",
        "                    validation_data=val_dataset.batch(BATCH_SIZE))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {
        "id": "dmfeNn8hndZs",
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [],
      "source": [
        "model.save_pretrained(\"model\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 38,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "D_nYwVTY8W1M",
        "outputId": "913383cd-983d-41f4-efa7-d727275fab09"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "('tokenize/tokenizer_config.json',\n",
              " 'tokenize/special_tokens_map.json',\n",
              " 'tokenize/vocab.txt',\n",
              " 'tokenize/added_tokens.json',\n",
              " 'tokenize/tokenizer.json')"
            ]
          },
          "execution_count": 38,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "tokenizer.save_pretrained(\"tokenize\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {
        "id": "_jwvD6AUndZu",
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [],
      "source": [
        "#### Load saved model and run predict function"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "s71ZiN0bndZw",
        "outputId": "42b7412d-7fe3-439c-8c89-1f5b4e688ee0",
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Some layers from the model checkpoint at model were not used when initializing TFBertForSequenceClassification: ['dropout_13']\n",
            "- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
            "- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
            "All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at model.\n",
            "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.\n"
          ]
        }
      ],
      "source": [
        "loaded_model = TFAutoModelForSequenceClassification.from_pretrained(\"model\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "id": "3QCgtNI8nlmX"
      },
      "outputs": [],
      "source": [
        "from transformers import pipeline\n",
        "\n",
        "pipe = pipeline('text-classification', model=loaded_model, tokenizer=tokenizer)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 30,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4QWLGTRpPDeZ",
        "outputId": "29837e60-6d35-43cd-d6e5-14ecfc3c2c33"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "[{'label': 'positive', 'score': 0.9960972666740417},\n",
              " {'label': 'positive', 'score': 0.9960286617279053},\n",
              " {'label': 'positive', 'score': 0.9795612692832947}]"
            ]
          },
          "execution_count": 30,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "pipe([\"Saya gembira kerana saya boleh meluangkan masa bersama keluarga.\", \"Cikgu Azam adalah yang terbaik!\", \"Terima kasih, pertolongan anda adalah amat dihargai\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 29,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Y9RvdOZcnU3p",
        "outputId": "088ed08d-4402-4889-f047-b3a20ae1f473"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "[{'label': 'positive', 'score': 0.9666869640350342},\n",
              " {'label': 'positive', 'score': 0.9939473867416382},\n",
              " {'label': 'negative', 'score': 0.949023425579071},\n",
              " {'label': 'positive', 'score': 0.7437461018562317}]"
            ]
          },
          "execution_count": 29,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "pipe([\"I'm happy to spend time with my family\", \"Mr Azam is the best!\", \"Thank you, your help is much appreciated\", \"Thank you, I appreciate your help\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 32,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "cRp2vmxeRSam",
        "outputId": "c983365b-57b8-4b16-ec3b-30722b120235"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "[{'label': 'negative', 'score': 0.9914922118186951},\n",
              " {'label': 'negative', 'score': 0.9830396771430969},\n",
              " {'label': 'negative', 'score': 0.9941385984420776}]"
            ]
          },
          "execution_count": 32,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "pipe([\"Sikap tidak peduli dia menyebabkan ibu bapa dia geram\", \"Saya sangat benci warna merah\", \"Cis! Dompet aku hilang!\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 34,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "czWBDOvlo20m",
        "outputId": "25705b2d-32e8-42d9-866c-84cf499fd22e"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "[{'label': 'negative', 'score': 0.9114706516265869},\n",
              " {'label': 'positive', 'score': 0.9896261692047119},\n",
              " {'label': 'negative', 'score': 0.9341222047805786}]"
            ]
          },
          "execution_count": 34,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "pipe([\"His don't care attitude causes much strife to his parents\", \"I hate red color\", \"Gah! My Wallet is missing!\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 21,
      "metadata": {
        "id": "akGTf-l_ndZy",
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [],
      "source": [
        "def predict_sentiment(sentence):\n",
        "  \"\"\"Classify a single sentence with `loaded_model`.\n",
        "\n",
        "  Prints the softmax class probabilities (index 0 = negative, index 1 = positive)\n",
        "  and returns 0 for negative or 1 for positive.\n",
        "  \"\"\"\n",
        "  # Without an explicit max_length the tokenizer skips truncation, so an over-long\n",
        "  # input could exceed the number of positions the model supports.\n",
        "  predict_input = tokenizer.encode(sentence,\n",
        "                                  truncation=True,\n",
        "                                  max_length=loaded_model.config.max_position_embeddings,\n",
        "                                  padding=True,\n",
        "                                  return_tensors=\"tf\")\n",
        "\n",
        "  # The first element of the model output holds the logits, shaped (1, num_labels) here.\n",
        "  tf_output = loaded_model.predict(predict_input)[0]\n",
        "  tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]\n",
        "\n",
        "  sentiment = 0 if tf_prediction[0] > tf_prediction[1] else 1\n",
        "  print(tf_prediction)\n",
        "  return sentiment"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 22,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "SG7PCrB3nlH0",
        "outputId": "dc07eecc-13b0-4c02-94e6-c6c8e8036fa1"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "[0.0143008  0.98569924]\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "1"
            ]
          },
          "execution_count": 22,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "predict_sentiment(\"gembira\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 23,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "lWiz1MO1nlbO",
        "outputId": "1ebca034-79cc-4774-e79b-88925c58b34d"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "[0.57475716 0.4252428 ]\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "0"
            ]
          },
          "execution_count": 23,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "predict_sentiment(\"marah\")"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "name": "Hugging Face Bert Malay Sentiment.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 2
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython2",
      "version": "2.7.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}