{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "633fetsKg5cv", "outputId": "379a3769-9478-4749-cc71-bbf46e6478f9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting transformers\n", " Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)\n", "\u001b[K |████████████████████████████████| 2.9 MB 5.2 MB/s \n", "\u001b[?25hCollecting pyyaml>=5.1\n", " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", "\u001b[K |████████████████████████████████| 596 kB 37.4 MB/s \n", "\u001b[?25hCollecting huggingface-hub>=0.0.17\n", " Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)\n", "\u001b[K |████████████████████████████████| 56 kB 4.7 MB/s \n", "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.3)\n", "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.8.1)\n", "Collecting sacremoses\n", " Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n", "\u001b[K |████████████████████████████████| 895 kB 41.5 MB/s \n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.3.0)\n", "Collecting tokenizers<0.11,>=0.10.1\n", " Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n", "\u001b[K |████████████████████████████████| 3.3 MB 26.2 MB/s \n", "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.17->transformers) (3.7.4.3)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (2.4.7)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.6.0)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n", "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", "Installing collected packages: pyyaml, tokenizers, sacremoses, 
huggingface-hub, transformers\n", " Attempting uninstall: pyyaml\n", " Found existing installation: PyYAML 3.13\n", " Uninstalling PyYAML-3.13:\n", " Successfully uninstalled PyYAML-3.13\n", "Successfully installed huggingface-hub-0.0.19 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.11.3\n" ] } ], "source": [ "!pip install transformers" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9pi31_2cndZU", "outputId": "f04cc4a8-7baf-404c-d059-66675a6dda63" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']\n", "- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "import tensorflow as tf\n", "import json\n", "from transformers import AutoConfig, AutoTokenizer, TFAutoModelForSequenceClassification\n", "\n", "config = AutoConfig.from_pretrained('malay-huggingface/bert-tiny-bahasa-cased', id2label={\"0\": \"negative\",\"1\": \"positive\"}, \n", " label2id={\"negative\": 0,\"positive\": 1})\n", "tokenizer = AutoTokenizer.from_pretrained('malay-huggingface/bert-tiny-bahasa-cased')\n", "model = TFAutoModelForSequenceClassification.from_pretrained(\"malay-huggingface/bert-tiny-bahasa-cased\", from_pt=True, config=config)\n", "\n", "# config = AutoConfig.from_pretrained('malay-huggingface/bert-base-bahasa-cased', id2label={\"0\": \"negative\",\"1\": \"positive\"}, \n", "# label2id={\"negative\": 0,\"positive\": 1})\n", "\n", "# tokenizer = AutoTokenizer.from_pretrained(\"malay-huggingface/bert-base-bahasa-cased\")\n", "# model = TFAutoModelForSequenceClassification.from_pretrained(\"malay-huggingface/bert-base-bahasa-cased\", from_pt=True, config=config)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "6mkizKwiJFeZ" }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 422 }, "id": "kgMs04IDJx2z", "outputId": "6ba3687d-4ac9-48f6-a275-1a652a073dcc" }, "outputs": [ { "data": { "text/html": [ "
<div>[pandas DataFrame HTML render trimmed: 3685 rows × 2 columns (label, text); full table shown in the text/plain output below]</div>
" ], "text/plain": [ " label text\n", "0 Negative Lebih-lebih lagi dengan  kemudahan internet da...\n", "1 Positive boleh memberi teguran kepada parti tetapi perl...\n", "2 Negative Adalah membingungkan mengapa masyarakat Cina b...\n", "3 Positive Kami menurunkan defisit daripada 6.7 peratus p...\n", "4 Negative Ini masalahnya. Bukan rakyat, tetapi sistem\n", "... ... ...\n", "3680 Positive Jelas pembangkang buat tuduhan untuk mengeliru...\n", "3681 Positive demokrasi adalah kuasa rakyat di mana pegawai ...\n", "3682 Positive Selain dapat menyelesaikan isu beg berat, peng...\n", "3683 Positive Hospital Langkawi buat masa ini hanya dapat me...\n", "3684 Positive Jika sebelum ini kita selesa bergerak dalam ‘g...\n", "\n", "[3685 rows x 2 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sentiment_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malaya/master/finetune/sentiment-data-v2.csv\")\n", "sentiment_df" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "hEfJHRjEo1uk" }, "outputs": [], "source": [ "sentiment_df[\"label\"] = sentiment_df[\"label\"].map({'Positive': 1, 'Negative': 0})\n", "\n", "positive_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/polarity/polarity-positive-translated.txt\", names=[\"text\"])\n", "positive_df[\"label\"] = 1\n", "\n", "negative_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/polarity/polarity-negative-translated.txt\", names=[\"text\"])\n", "negative_df[\"label\"] = 0" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "iciAB9tss4tW" }, "outputs": [], "source": [ "amazon_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-amazon.json\", orient='index').T\n", "yelp_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-yelp.json\", orient='index').T\n", "imdb_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-imdb.json\", orient='index').T\n", "\n", "def process_json_df(df):\n", " positive_df = df[[\"positive\"]].dropna()\n", " positive_df.columns = [\"text\"]\n", " positive_df[\"label\"] = 1\n", "\n", " negative_df = df[[\"negative\"]].dropna()\n", " negative_df.columns = [\"text\"]\n", " negative_df[\"label\"] = 0\n", "\n", " return pd.concat([positive_df, negative_df])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 422 }, "id": "GRX3doXvvqjw", "outputId": "6c202e02-04d9-4560-8c16-d44163d92ce6" }, "outputs": [ { "data": { "text/html": [ "
<div>[pandas DataFrame HTML render trimmed: 16725 rows × 2 columns (label, text); full table shown in the text/plain output below]</div>
" ], "text/plain": [ " label text\n", "0 0 Lebih-lebih lagi dengan  kemudahan internet da...\n", "1 1 boleh memberi teguran kepada parti tetapi perl...\n", "2 0 Adalah membingungkan mengapa masyarakat Cina b...\n", "3 1 Kami menurunkan defisit daripada 6.7 peratus p...\n", "4 0 Ini masalahnya. Bukan rakyat, tetapi sistem\n", "... ... ...\n", "16720 0 dalam satu perkataan, ia memalukan.\n", "16721 0 Saya tidak pernah keluar dari filem dengan pan...\n", "16722 0 saya hanya bosan menonton jessica lange mengam...\n", "16723 0 semua dalam satu penghinaan terhadap kecerdasa...\n", "16724 0 yang ingin melayari gelombang kecil filem angk...\n", "\n", "[16725 rows x 2 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# df = pd.concat([sentiment_df, positive_df, negative_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n", "# df = pd.concat([sentiment_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n", "df = pd.concat([sentiment_df, positive_df, negative_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FeWmvyotp9RP", "outputId": "c3b34cb1-28d6-4c60-a4f0-778bd398ba02" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "13380\n", "3345\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# sentences = sarcasm_df[\"headline\"].tolist()\n", "# labels = sarcasm_df[\"is_sarcastic\"].tolist()\n", "\n", "\n", "sentences = df[\"text\"].tolist()\n", "labels = df[\"label\"].tolist()\n", "\n", "training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(sentences, labels, train_size=0.8, random_state=1)\n", "\n", "print(len(training_sentences))\n", "print(len(validation_sentences))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KCxtcxObndZk", "outputId": "0c3de610-02d1-4a8f-f7bf-993e1f644d63", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. 
Default to no truncation.\n" ] } ], "source": [ "train_encodings = tokenizer(training_sentences, truncation=True, padding=True)\n", "val_encodings = tokenizer(validation_sentences, truncation=True, padding=True)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "Tg7zcOpVndZm", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "train_dataset = tf.data.Dataset.from_tensor_slices((\n", " dict(train_encodings),\n", " training_labels\n", "))\n", "\n", "val_dataset = tf.data.Dataset.from_tensor_slices((\n", " dict(val_encodings),\n", " validation_labels\n", "))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "vfwrq3eMXDi1" }, "outputs": [], "source": [ "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", "\n", "es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)\n", "# mc = ModelCheckpoint('best_model', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8_gjepLSndZq", "outputId": "3091b5d2-40c6-4cfd-82fd-fcbc094cbc3b", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "837/837 [==============================] - 91s 95ms/step - loss: 0.5531 - accuracy: 0.7115 - val_loss: 0.5028 - val_accuracy: 0.7474\n", "Epoch 2/10\n", "837/837 [==============================] - 78s 93ms/step - loss: 0.4301 - accuracy: 0.8006 - val_loss: 0.4745 - val_accuracy: 0.7731\n", "Epoch 3/10\n", "837/837 [==============================] - 78s 93ms/step - loss: 0.3201 - accuracy: 0.8635 - val_loss: 0.5232 - val_accuracy: 0.7773\n", "Epoch 4/10\n", "837/837 [==============================] - 78s 93ms/step - loss: 0.2226 - accuracy: 0.9113 - val_loss: 0.5835 - val_accuracy: 0.7611\n", "Epoch 5/10\n", "837/837 [==============================] - 78s 93ms/step - loss: 0.1604 - accuracy: 0.9389 - val_loss: 0.6551 - val_accuracy: 0.7638\n", "Epoch 00005: early stopping\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)\n", "model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])\n", "model.fit(train_dataset.shuffle(100).batch(16),\n", " epochs=10,\n", " batch_size=16,\n", " callbacks=[es],\n", " validation_data=val_dataset.shuffle(100).batch(16))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "dmfeNn8hndZs", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "model.save_pretrained(\"model\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "D_nYwVTY8W1M", "outputId": "913383cd-983d-41f4-efa7-d727275fab09" }, "outputs": [ { "data": { "text/plain": [ "('tokenize/tokenizer_config.json',\n", " 'tokenize/special_tokens_map.json',\n", " 'tokenize/vocab.txt',\n", " 'tokenize/added_tokens.json',\n", " 'tokenize/tokenizer.json')" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.save_pretrained(\"tokenize\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "_jwvD6AUndZu", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "#### Load saved model and run predict function" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, 
"id": "s71ZiN0bndZw", "outputId": "42b7412d-7fe3-439c-8c89-1f5b4e688ee0", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some layers from the model checkpoint at model were not used when initializing TFBertForSequenceClassification: ['dropout_13']\n", "- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at model.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.\n" ] } ], "source": [ "loaded_model = TFAutoModelForSequenceClassification.from_pretrained(\"model\")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "3QCgtNI8nlmX" }, "outputs": [], "source": [ "from transformers import pipeline\n", "\n", "pipe = pipeline('text-classification', model=loaded_model, tokenizer=tokenizer)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4QWLGTRpPDeZ", "outputId": "29837e60-6d35-43cd-d6e5-14ecfc3c2c33" }, "outputs": [ { "data": { "text/plain": [ "[{'label': 'positive', 'score': 0.9960972666740417},\n", " {'label': 'positive', 'score': 0.9960286617279053},\n", " {'label': 'positive', 'score': 0.9795612692832947}]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe([\"Saya gembira kerana saya boleh meluangkan masa bersama keluarga.\", \"Cikgu Azam adalah yang terbaik!\", \"Terima kasih, pertolongan anda adalah amat dihargai\"])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Y9RvdOZcnU3p", "outputId": "088ed08d-4402-4889-f047-b3a20ae1f473" }, "outputs": [ { "data": { "text/plain": [ "[{'label': 'positive', 'score': 0.9666869640350342},\n", " {'label': 'positive', 'score': 0.9939473867416382},\n", " {'label': 'negative', 'score': 0.949023425579071},\n", " {'label': 'positive', 'score': 0.7437461018562317}]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe([\"I'm happy to spend time with my family\", \"Mr Azam is the best!\", \"Thank you, your help is much appreciated\", \"Thank you, I appreciate your help\"])" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "cRp2vmxeRSam", "outputId": "c983365b-57b8-4b16-ec3b-30722b120235" }, "outputs": [ { "data": { "text/plain": [ "[{'label': 'negative', 'score': 0.9914922118186951},\n", " {'label': 'negative', 'score': 0.9830396771430969},\n", " {'label': 'negative', 'score': 0.9941385984420776}]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe([\"Sikap tidak peduli dia menyebabkan ibu bapa dia geram\", \"Saya sangat benci warna merah\", \"Cis! 
Dompet aku hilang!\"])" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "czWBDOvlo20m", "outputId": "25705b2d-32e8-42d9-866c-84cf499fd22e" }, "outputs": [ { "data": { "text/plain": [ "[{'label': 'negative', 'score': 0.9114706516265869},\n", " {'label': 'positive', 'score': 0.9896261692047119},\n", " {'label': 'negative', 'score': 0.9341222047805786}]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe([\"His don't care attitude causes much strife to his parents\", \"I hate red color\", \"Gah! My Wallet is missing!\"])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "id": "akGTf-l_ndZy", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "def predict_sentiment(sentence):\n", " predict_input = tokenizer.encode(sentence,\n", " truncation=True,\n", " padding=True,\n", " return_tensors=\"tf\")\n", "\n", " tf_output = loaded_model.predict(predict_input)[0]\n", " tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]\n", "\n", " sentiment = 0 if tf_prediction[0] > tf_prediction[1] else 1\n", " print(tf_prediction)\n", " return sentiment" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SG7PCrB3nlH0", "outputId": "dc07eecc-13b0-4c02-94e6-c6c8e8036fa1" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0.0143008 0.98569924]\n" ] }, { "data": { "text/plain": [ "1" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict_sentiment(\"gembira\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lWiz1MO1nlbO", "outputId": "1ebca034-79cc-4774-e79b-88925c58b34d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0.57475716 0.4252428 ]\n" ] }, { "data": { "text/plain": [ "0" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict_sentiment(\"marah\")" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "Hugging Face Bert Malay Sentiment.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }