patrickxchong committed
Commit
6af88c1
1 Parent(s): f3871f4

initial commit

Hugging_Face_Bert_Malay_Sentiment.ipynb ADDED
@@ -0,0 +1,892 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "colab": {
8
+ "base_uri": "https://localhost:8080/"
9
+ },
10
+ "id": "633fetsKg5cv",
11
+ "outputId": "379a3769-9478-4749-cc71-bbf46e6478f9"
12
+ },
13
+ "outputs": [
14
+ {
15
+ "name": "stdout",
16
+ "output_type": "stream",
17
+ "text": [
18
+ "Collecting transformers\n",
19
+ " Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)\n",
20
+ "\u001b[K |████████████████████████████████| 2.9 MB 5.2 MB/s \n",
21
+ "\u001b[?25hCollecting pyyaml>=5.1\n",
22
+ " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n",
23
+ "\u001b[K |████████████████████████████████| 596 kB 37.4 MB/s \n",
24
+ "\u001b[?25hCollecting huggingface-hub>=0.0.17\n",
25
+ " Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)\n",
26
+ "\u001b[K |████████████████████████████████| 56 kB 4.7 MB/s \n",
27
+ "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n",
28
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n",
29
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.0)\n",
30
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n",
31
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.3)\n",
32
+ "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.8.1)\n",
33
+ "Collecting sacremoses\n",
34
+ " Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n",
35
+ "\u001b[K |████████████████████████████████| 895 kB 41.5 MB/s \n",
36
+ "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.3.0)\n",
37
+ "Collecting tokenizers<0.11,>=0.10.1\n",
38
+ " Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n",
39
+ "\u001b[K |████████████████████████████████| 3.3 MB 26.2 MB/s \n",
40
+ "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.17->transformers) (3.7.4.3)\n",
41
+ "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (2.4.7)\n",
42
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.6.0)\n",
43
+ "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n",
44
+ "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n",
45
+ "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n",
46
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n",
47
+ "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n",
48
+ "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n",
49
+ "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n",
50
+ "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n",
51
+ " Attempting uninstall: pyyaml\n",
52
+ " Found existing installation: PyYAML 3.13\n",
53
+ " Uninstalling PyYAML-3.13:\n",
54
+ " Successfully uninstalled PyYAML-3.13\n",
55
+ "Successfully installed huggingface-hub-0.0.19 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.11.3\n"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "!pip install transformers"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": 37,
66
+ "metadata": {
67
+ "colab": {
68
+ "base_uri": "https://localhost:8080/"
69
+ },
70
+ "id": "9pi31_2cndZU",
71
+ "outputId": "f04cc4a8-7baf-404c-d059-66675a6dda63"
72
+ },
73
+ "outputs": [
74
+ {
75
+ "name": "stderr",
76
+ "output_type": "stream",
77
+ "text": [
78
+ "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']\n",
79
+ "- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
80
+ "- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
81
+ "Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']\n",
82
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
83
+ ]
84
+ }
85
+ ],
86
+ "source": [
87
+ "import tensorflow as tf\n",
88
+ "import json\n",
89
+ "from transformers import AutoConfig, AutoTokenizer, TFAutoModelForSequenceClassification\n",
90
+ "\n",
91
+ "config = AutoConfig.from_pretrained('malay-huggingface/bert-tiny-bahasa-cased', id2label={\"0\": \"negative\",\"1\": \"positive\"}, \n",
92
+ " label2id={\"negative\": 0,\"positive\": 1})\n",
93
+ "tokenizer = AutoTokenizer.from_pretrained('malay-huggingface/bert-tiny-bahasa-cased')\n",
94
+ "model = TFAutoModelForSequenceClassification.from_pretrained(\"malay-huggingface/bert-tiny-bahasa-cased\", from_pt=True, config=config)\n",
95
+ "\n",
96
+ "# config = AutoConfig.from_pretrained('malay-huggingface/bert-base-bahasa-cased', id2label={\"0\": \"negative\",\"1\": \"positive\"}, \n",
97
+ "# label2id={\"negative\": 0,\"positive\": 1})\n",
98
+ "\n",
99
+ "# tokenizer = AutoTokenizer.from_pretrained(\"malay-huggingface/bert-base-bahasa-cased\")\n",
100
+ "# model = TFAutoModelForSequenceClassification.from_pretrained(\"malay-huggingface/bert-base-bahasa-cased\", from_pt=True, config=config)"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 4,
106
+ "metadata": {
107
+ "id": "6mkizKwiJFeZ"
108
+ },
109
+ "outputs": [],
110
+ "source": [
111
+ "import pandas as pd"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 5,
117
+ "metadata": {
118
+ "colab": {
119
+ "base_uri": "https://localhost:8080/",
120
+ "height": 422
121
+ },
122
+ "id": "kgMs04IDJx2z",
123
+ "outputId": "6ba3687d-4ac9-48f6-a275-1a652a073dcc"
124
+ },
125
+ "outputs": [
126
+ {
127
+ "data": {
128
+ "text/html": [
129
+ "<div>\n",
130
+ "<style scoped>\n",
131
+ " .dataframe tbody tr th:only-of-type {\n",
132
+ " vertical-align: middle;\n",
133
+ " }\n",
134
+ "\n",
135
+ " .dataframe tbody tr th {\n",
136
+ " vertical-align: top;\n",
137
+ " }\n",
138
+ "\n",
139
+ " .dataframe thead th {\n",
140
+ " text-align: right;\n",
141
+ " }\n",
142
+ "</style>\n",
143
+ "<table border=\"1\" class=\"dataframe\">\n",
144
+ " <thead>\n",
145
+ " <tr style=\"text-align: right;\">\n",
146
+ " <th></th>\n",
147
+ " <th>label</th>\n",
148
+ " <th>text</th>\n",
149
+ " </tr>\n",
150
+ " </thead>\n",
151
+ " <tbody>\n",
152
+ " <tr>\n",
153
+ " <th>0</th>\n",
154
+ " <td>Negative</td>\n",
155
+ " <td>Lebih-lebih lagi dengan  kemudahan internet da...</td>\n",
156
+ " </tr>\n",
157
+ " <tr>\n",
158
+ " <th>1</th>\n",
159
+ " <td>Positive</td>\n",
160
+ " <td>boleh memberi teguran kepada parti tetapi perl...</td>\n",
161
+ " </tr>\n",
162
+ " <tr>\n",
163
+ " <th>2</th>\n",
164
+ " <td>Negative</td>\n",
165
+ " <td>Adalah membingungkan mengapa masyarakat Cina b...</td>\n",
166
+ " </tr>\n",
167
+ " <tr>\n",
168
+ " <th>3</th>\n",
169
+ " <td>Positive</td>\n",
170
+ " <td>Kami menurunkan defisit daripada 6.7 peratus p...</td>\n",
171
+ " </tr>\n",
172
+ " <tr>\n",
173
+ " <th>4</th>\n",
174
+ " <td>Negative</td>\n",
175
+ " <td>Ini masalahnya. Bukan rakyat, tetapi sistem</td>\n",
176
+ " </tr>\n",
177
+ " <tr>\n",
178
+ " <th>...</th>\n",
179
+ " <td>...</td>\n",
180
+ " <td>...</td>\n",
181
+ " </tr>\n",
182
+ " <tr>\n",
183
+ " <th>3680</th>\n",
184
+ " <td>Positive</td>\n",
185
+ " <td>Jelas pembangkang buat tuduhan untuk mengeliru...</td>\n",
186
+ " </tr>\n",
187
+ " <tr>\n",
188
+ " <th>3681</th>\n",
189
+ " <td>Positive</td>\n",
190
+ " <td>demokrasi adalah kuasa rakyat di mana pegawai ...</td>\n",
191
+ " </tr>\n",
192
+ " <tr>\n",
193
+ " <th>3682</th>\n",
194
+ " <td>Positive</td>\n",
195
+ " <td>Selain dapat menyelesaikan isu beg berat, peng...</td>\n",
196
+ " </tr>\n",
197
+ " <tr>\n",
198
+ " <th>3683</th>\n",
199
+ " <td>Positive</td>\n",
200
+ " <td>Hospital Langkawi buat masa ini hanya dapat me...</td>\n",
201
+ " </tr>\n",
202
+ " <tr>\n",
203
+ " <th>3684</th>\n",
204
+ " <td>Positive</td>\n",
205
+ " <td>Jika sebelum ini kita selesa bergerak dalam ‘g...</td>\n",
206
+ " </tr>\n",
207
+ " </tbody>\n",
208
+ "</table>\n",
209
+ "<p>3685 rows × 2 columns</p>\n",
210
+ "</div>"
211
+ ],
212
+ "text/plain": [
213
+ " label text\n",
214
+ "0 Negative Lebih-lebih lagi dengan  kemudahan internet da...\n",
215
+ "1 Positive boleh memberi teguran kepada parti tetapi perl...\n",
216
+ "2 Negative Adalah membingungkan mengapa masyarakat Cina b...\n",
217
+ "3 Positive Kami menurunkan defisit daripada 6.7 peratus p...\n",
218
+ "4 Negative Ini masalahnya. Bukan rakyat, tetapi sistem\n",
219
+ "... ... ...\n",
220
+ "3680 Positive Jelas pembangkang buat tuduhan untuk mengeliru...\n",
221
+ "3681 Positive demokrasi adalah kuasa rakyat di mana pegawai ...\n",
222
+ "3682 Positive Selain dapat menyelesaikan isu beg berat, peng...\n",
223
+ "3683 Positive Hospital Langkawi buat masa ini hanya dapat me...\n",
224
+ "3684 Positive Jika sebelum ini kita selesa bergerak dalam ‘g...\n",
225
+ "\n",
226
+ "[3685 rows x 2 columns]"
227
+ ]
228
+ },
229
+ "execution_count": 5,
230
+ "metadata": {},
231
+ "output_type": "execute_result"
232
+ }
233
+ ],
234
+ "source": [
235
+ "sentiment_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malaya/master/finetune/sentiment-data-v2.csv\")\n",
236
+ "sentiment_df"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 6,
242
+ "metadata": {
243
+ "id": "hEfJHRjEo1uk"
244
+ },
245
+ "outputs": [],
246
+ "source": [
247
+ "sentiment_df[\"label\"] = sentiment_df[\"label\"].map({'Positive': 1, 'Negative': 0})\n",
248
+ "\n",
249
+ "positive_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/polarity/polarity-positive-translated.txt\", names=[\"text\"])\n",
250
+ "positive_df[\"label\"] = 1\n",
251
+ "\n",
252
+ "negative_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/polarity/polarity-negative-translated.txt\", names=[\"text\"])\n",
253
+ "negative_df[\"label\"] = 0"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": 7,
259
+ "metadata": {
260
+ "id": "iciAB9tss4tW"
261
+ },
262
+ "outputs": [],
263
+ "source": [
264
+ "amazon_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-amazon.json\", orient='index').T\n",
265
+ "yelp_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-yelp.json\", orient='index').T\n",
266
+ "imdb_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-imdb.json\", orient='index').T\n",
267
+ "\n",
268
+ "def process_json_df(df):\n",
269
+ " positive_df = df[[\"positive\"]].dropna()\n",
270
+ " positive_df.columns = [\"text\"]\n",
271
+ " positive_df[\"label\"] = 1\n",
272
+ "\n",
273
+ " negative_df = df[[\"negative\"]].dropna()\n",
274
+ " negative_df.columns = [\"text\"]\n",
275
+ " negative_df[\"label\"] = 0\n",
276
+ "\n",
277
+ " return pd.concat([positive_df, negative_df])"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 8,
283
+ "metadata": {
284
+ "colab": {
285
+ "base_uri": "https://localhost:8080/",
286
+ "height": 422
287
+ },
288
+ "id": "GRX3doXvvqjw",
289
+ "outputId": "6c202e02-04d9-4560-8c16-d44163d92ce6"
290
+ },
291
+ "outputs": [
292
+ {
293
+ "data": {
294
+ "text/html": [
295
+ "<div>\n",
296
+ "<style scoped>\n",
297
+ " .dataframe tbody tr th:only-of-type {\n",
298
+ " vertical-align: middle;\n",
299
+ " }\n",
300
+ "\n",
301
+ " .dataframe tbody tr th {\n",
302
+ " vertical-align: top;\n",
303
+ " }\n",
304
+ "\n",
305
+ " .dataframe thead th {\n",
306
+ " text-align: right;\n",
307
+ " }\n",
308
+ "</style>\n",
309
+ "<table border=\"1\" class=\"dataframe\">\n",
310
+ " <thead>\n",
311
+ " <tr style=\"text-align: right;\">\n",
312
+ " <th></th>\n",
313
+ " <th>label</th>\n",
314
+ " <th>text</th>\n",
315
+ " </tr>\n",
316
+ " </thead>\n",
317
+ " <tbody>\n",
318
+ " <tr>\n",
319
+ " <th>0</th>\n",
320
+ " <td>0</td>\n",
321
+ " <td>Lebih-lebih lagi dengan  kemudahan internet da...</td>\n",
322
+ " </tr>\n",
323
+ " <tr>\n",
324
+ " <th>1</th>\n",
325
+ " <td>1</td>\n",
326
+ " <td>boleh memberi teguran kepada parti tetapi perl...</td>\n",
327
+ " </tr>\n",
328
+ " <tr>\n",
329
+ " <th>2</th>\n",
330
+ " <td>0</td>\n",
331
+ " <td>Adalah membingungkan mengapa masyarakat Cina b...</td>\n",
332
+ " </tr>\n",
333
+ " <tr>\n",
334
+ " <th>3</th>\n",
335
+ " <td>1</td>\n",
336
+ " <td>Kami menurunkan defisit daripada 6.7 peratus p...</td>\n",
337
+ " </tr>\n",
338
+ " <tr>\n",
339
+ " <th>4</th>\n",
340
+ " <td>0</td>\n",
341
+ " <td>Ini masalahnya. Bukan rakyat, tetapi sistem</td>\n",
342
+ " </tr>\n",
343
+ " <tr>\n",
344
+ " <th>...</th>\n",
345
+ " <td>...</td>\n",
346
+ " <td>...</td>\n",
347
+ " </tr>\n",
348
+ " <tr>\n",
349
+ " <th>16720</th>\n",
350
+ " <td>0</td>\n",
351
+ " <td>dalam satu perkataan, ia memalukan.</td>\n",
352
+ " </tr>\n",
353
+ " <tr>\n",
354
+ " <th>16721</th>\n",
355
+ " <td>0</td>\n",
356
+ " <td>Saya tidak pernah keluar dari filem dengan pan...</td>\n",
357
+ " </tr>\n",
358
+ " <tr>\n",
359
+ " <th>16722</th>\n",
360
+ " <td>0</td>\n",
361
+ " <td>saya hanya bosan menonton jessica lange mengam...</td>\n",
362
+ " </tr>\n",
363
+ " <tr>\n",
364
+ " <th>16723</th>\n",
365
+ " <td>0</td>\n",
366
+ " <td>semua dalam satu penghinaan terhadap kecerdasa...</td>\n",
367
+ " </tr>\n",
368
+ " <tr>\n",
369
+ " <th>16724</th>\n",
370
+ " <td>0</td>\n",
371
+ " <td>yang ingin melayari gelombang kecil filem angk...</td>\n",
372
+ " </tr>\n",
373
+ " </tbody>\n",
374
+ "</table>\n",
375
+ "<p>16725 rows × 2 columns</p>\n",
376
+ "</div>"
377
+ ],
378
+ "text/plain": [
379
+ " label text\n",
380
+ "0 0 Lebih-lebih lagi dengan  kemudahan internet da...\n",
381
+ "1 1 boleh memberi teguran kepada parti tetapi perl...\n",
382
+ "2 0 Adalah membingungkan mengapa masyarakat Cina b...\n",
383
+ "3 1 Kami menurunkan defisit daripada 6.7 peratus p...\n",
384
+ "4 0 Ini masalahnya. Bukan rakyat, tetapi sistem\n",
385
+ "... ... ...\n",
386
+ "16720 0 dalam satu perkataan, ia memalukan.\n",
387
+ "16721 0 Saya tidak pernah keluar dari filem dengan pan...\n",
388
+ "16722 0 saya hanya bosan menonton jessica lange mengam...\n",
389
+ "16723 0 semua dalam satu penghinaan terhadap kecerdasa...\n",
390
+ "16724 0 yang ingin melayari gelombang kecil filem angk...\n",
391
+ "\n",
392
+ "[16725 rows x 2 columns]"
393
+ ]
394
+ },
395
+ "execution_count": 8,
396
+ "metadata": {},
397
+ "output_type": "execute_result"
398
+ }
399
+ ],
400
+ "source": [
401
+ "# df = pd.concat([sentiment_df, positive_df, negative_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n",
402
+ "# df = pd.concat([sentiment_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n",
403
+ "df = pd.concat([sentiment_df, positive_df, negative_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n",
404
+ "\n",
405
+ "df"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": 9,
411
+ "metadata": {
412
+ "colab": {
413
+ "base_uri": "https://localhost:8080/"
414
+ },
415
+ "id": "FeWmvyotp9RP",
416
+ "outputId": "c3b34cb1-28d6-4c60-a4f0-778bd398ba02"
417
+ },
418
+ "outputs": [
419
+ {
420
+ "name": "stdout",
421
+ "output_type": "stream",
422
+ "text": [
423
+ "13380\n",
424
+ "3345\n"
425
+ ]
426
+ }
427
+ ],
428
+ "source": [
429
+ "from sklearn.model_selection import train_test_split\n",
430
+ "\n",
431
+ "# sentences = sarcasm_df[\"headline\"].tolist()\n",
432
+ "# labels = sarcasm_df[\"is_sarcastic\"].tolist()\n",
433
+ "\n",
434
+ "\n",
435
+ "sentences = df[\"text\"].tolist()\n",
436
+ "labels = df[\"label\"].tolist()\n",
437
+ "\n",
438
+ "training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(sentences, labels, train_size=0.8, random_state=1)\n",
439
+ "\n",
440
+ "print(len(training_sentences))\n",
441
+ "print(len(validation_sentences))"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": 10,
447
+ "metadata": {
448
+ "colab": {
449
+ "base_uri": "https://localhost:8080/"
450
+ },
451
+ "id": "KCxtcxObndZk",
452
+ "outputId": "0c3de610-02d1-4a8f-f7bf-993e1f644d63",
453
+ "pycharm": {
454
+ "name": "#%%\n"
455
+ }
456
+ },
457
+ "outputs": [
458
+ {
459
+ "name": "stderr",
460
+ "output_type": "stream",
461
+ "text": [
462
+ "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
463
+ ]
464
+ }
465
+ ],
466
+ "source": [
467
+ "train_encodings = tokenizer(training_sentences, truncation=True, padding=True)\n",
468
+ "val_encodings = tokenizer(validation_sentences, truncation=True, padding=True)"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": 11,
474
+ "metadata": {
475
+ "id": "Tg7zcOpVndZm",
476
+ "pycharm": {
477
+ "name": "#%%\n"
478
+ }
479
+ },
480
+ "outputs": [],
481
+ "source": [
482
+ "train_dataset = tf.data.Dataset.from_tensor_slices((\n",
483
+ " dict(train_encodings),\n",
484
+ " training_labels\n",
485
+ "))\n",
486
+ "\n",
487
+ "val_dataset = tf.data.Dataset.from_tensor_slices((\n",
488
+ " dict(val_encodings),\n",
489
+ " validation_labels\n",
490
+ "))"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "code",
495
+ "execution_count": 12,
496
+ "metadata": {
497
+ "id": "vfwrq3eMXDi1"
498
+ },
499
+ "outputs": [],
500
+ "source": [
501
+ "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
502
+ "\n",
503
+ "es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)\n",
504
+ "# mc = ModelCheckpoint('best_model', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)"
505
+ ]
506
+ },
507
+ {
508
+ "cell_type": "code",
509
+ "execution_count": 13,
510
+ "metadata": {
511
+ "colab": {
512
+ "base_uri": "https://localhost:8080/"
513
+ },
514
+ "id": "8_gjepLSndZq",
515
+ "outputId": "3091b5d2-40c6-4cfd-82fd-fcbc094cbc3b",
516
+ "pycharm": {
517
+ "name": "#%%\n"
518
+ }
519
+ },
520
+ "outputs": [
521
+ {
522
+ "name": "stdout",
523
+ "output_type": "stream",
524
+ "text": [
525
+ "Epoch 1/10\n",
526
+ "837/837 [==============================] - 91s 95ms/step - loss: 0.5531 - accuracy: 0.7115 - val_loss: 0.5028 - val_accuracy: 0.7474\n",
527
+ "Epoch 2/10\n",
528
+ "837/837 [==============================] - 78s 93ms/step - loss: 0.4301 - accuracy: 0.8006 - val_loss: 0.4745 - val_accuracy: 0.7731\n",
529
+ "Epoch 3/10\n",
530
+ "837/837 [==============================] - 78s 93ms/step - loss: 0.3201 - accuracy: 0.8635 - val_loss: 0.5232 - val_accuracy: 0.7773\n",
531
+ "Epoch 4/10\n",
532
+ "837/837 [==============================] - 78s 93ms/step - loss: 0.2226 - accuracy: 0.9113 - val_loss: 0.5835 - val_accuracy: 0.7611\n",
533
+ "Epoch 5/10\n",
534
+ "837/837 [==============================] - 78s 93ms/step - loss: 0.1604 - accuracy: 0.9389 - val_loss: 0.6551 - val_accuracy: 0.7638\n",
535
+ "Epoch 00005: early stopping\n"
536
+ ]
537
+ },
538
+ {
539
+ "data": {
540
+ "text/plain": [
541
+ "<keras.callbacks.History at 0x7efdb1594e10>"
542
+ ]
543
+ },
544
+ "execution_count": 13,
545
+ "metadata": {},
546
+ "output_type": "execute_result"
547
+ }
548
+ ],
549
+ "source": [
550
+ "optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)\n",
551
+ "model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])\n",
552
+ "model.fit(train_dataset.shuffle(100).batch(16),\n",
553
+ " epochs=10,\n",
554
+ " batch_size=16,\n",
555
+ " callbacks=[es],\n",
556
+ " validation_data=val_dataset.shuffle(100).batch(16))"
557
+ ]
558
+ },
559
+ {
560
+ "cell_type": "code",
561
+ "execution_count": 14,
562
+ "metadata": {
563
+ "id": "dmfeNn8hndZs",
564
+ "pycharm": {
565
+ "name": "#%%\n"
566
+ }
567
+ },
568
+ "outputs": [],
569
+ "source": [
570
+ "model.save_pretrained(\"model\")"
571
+ ]
572
+ },
573
+ {
574
+ "cell_type": "code",
575
+ "execution_count": 38,
576
+ "metadata": {
577
+ "colab": {
578
+ "base_uri": "https://localhost:8080/"
579
+ },
580
+ "id": "D_nYwVTY8W1M",
581
+ "outputId": "913383cd-983d-41f4-efa7-d727275fab09"
582
+ },
583
+ "outputs": [
584
+ {
585
+ "data": {
586
+ "text/plain": [
587
+ "('tokenize/tokenizer_config.json',\n",
588
+ " 'tokenize/special_tokens_map.json',\n",
589
+ " 'tokenize/vocab.txt',\n",
590
+ " 'tokenize/added_tokens.json',\n",
591
+ " 'tokenize/tokenizer.json')"
592
+ ]
593
+ },
594
+ "execution_count": 38,
595
+ "metadata": {},
596
+ "output_type": "execute_result"
597
+ }
598
+ ],
599
+ "source": [
600
+ "tokenizer.save_pretrained(\"tokenize\")"
601
+ ]
602
+ },
603
+ {
604
+ "cell_type": "code",
605
+ "execution_count": 16,
606
+ "metadata": {
607
+ "id": "_jwvD6AUndZu",
608
+ "pycharm": {
609
+ "name": "#%%\n"
610
+ }
611
+ },
612
+ "outputs": [],
613
+ "source": [
614
+ "#### Load saved model and run predict function"
615
+ ]
616
+ },
617
+ {
618
+ "cell_type": "code",
619
+ "execution_count": 17,
620
+ "metadata": {
621
+ "colab": {
622
+ "base_uri": "https://localhost:8080/"
623
+ },
624
+ "id": "s71ZiN0bndZw",
625
+ "outputId": "42b7412d-7fe3-439c-8c89-1f5b4e688ee0",
626
+ "pycharm": {
627
+ "name": "#%%\n"
628
+ }
629
+ },
630
+ "outputs": [
631
+ {
632
+ "name": "stderr",
633
+ "output_type": "stream",
634
+ "text": [
635
+ "Some layers from the model checkpoint at model were not used when initializing TFBertForSequenceClassification: ['dropout_13']\n",
636
+ "- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
637
+ "- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
638
+ "All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at model.\n",
639
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.\n"
640
+ ]
641
+ }
642
+ ],
643
+ "source": [
644
+ "loaded_model = TFAutoModelForSequenceClassification.from_pretrained(\"model\")"
645
+ ]
646
+ },
647
+ {
648
+ "cell_type": "code",
649
+ "execution_count": 18,
650
+ "metadata": {
651
+ "id": "3QCgtNI8nlmX"
652
+ },
653
+ "outputs": [],
654
+ "source": [
655
+ "from transformers import pipeline\n",
656
+ "\n",
657
+ "pipe = pipeline('text-classification', model=loaded_model, tokenizer=tokenizer)"
658
+ ]
659
+ },
660
+ {
661
+ "cell_type": "code",
662
+ "execution_count": 30,
663
+ "metadata": {
664
+ "colab": {
665
+ "base_uri": "https://localhost:8080/"
666
+ },
667
+ "id": "4QWLGTRpPDeZ",
668
+ "outputId": "29837e60-6d35-43cd-d6e5-14ecfc3c2c33"
669
+ },
670
+ "outputs": [
671
+ {
672
+ "data": {
673
+ "text/plain": [
674
+ "[{'label': 'positive', 'score': 0.9960972666740417},\n",
675
+ " {'label': 'positive', 'score': 0.9960286617279053},\n",
676
+ " {'label': 'positive', 'score': 0.9795612692832947}]"
677
+ ]
678
+ },
679
+ "execution_count": 30,
680
+ "metadata": {},
681
+ "output_type": "execute_result"
682
+ }
683
+ ],
684
+ "source": [
685
+ "pipe([\"Saya gembira kerana saya boleh meluangkan masa bersama keluarga.\", \"Cikgu Azam adalah yang terbaik!\", \"Terima kasih, pertolongan anda adalah amat dihargai\"])"
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "execution_count": 29,
691
+ "metadata": {
692
+ "colab": {
693
+ "base_uri": "https://localhost:8080/"
694
+ },
695
+ "id": "Y9RvdOZcnU3p",
696
+ "outputId": "088ed08d-4402-4889-f047-b3a20ae1f473"
697
+ },
698
+ "outputs": [
699
+ {
700
+ "data": {
701
+ "text/plain": [
702
+ "[{'label': 'positive', 'score': 0.9666869640350342},\n",
703
+ " {'label': 'positive', 'score': 0.9939473867416382},\n",
704
+ " {'label': 'negative', 'score': 0.949023425579071},\n",
705
+ " {'label': 'positive', 'score': 0.7437461018562317}]"
706
+ ]
707
+ },
708
+ "execution_count": 29,
709
+ "metadata": {},
710
+ "output_type": "execute_result"
711
+ }
712
+ ],
713
+ "source": [
714
+ "pipe([\"I'm happy to spend time with my family\", \"Mr Azam is the best!\", \"Thank you, your help is much appreciated\", \"Thank you, I appreciate your help\"])"
715
+ ]
716
+ },
717
+ {
718
+ "cell_type": "code",
719
+ "execution_count": 32,
720
+ "metadata": {
721
+ "colab": {
722
+ "base_uri": "https://localhost:8080/"
723
+ },
724
+ "id": "cRp2vmxeRSam",
725
+ "outputId": "c983365b-57b8-4b16-ec3b-30722b120235"
726
+ },
727
+ "outputs": [
728
+ {
729
+ "data": {
730
+ "text/plain": [
731
+ "[{'label': 'negative', 'score': 0.9914922118186951},\n",
732
+ " {'label': 'negative', 'score': 0.9830396771430969},\n",
733
+ " {'label': 'negative', 'score': 0.9941385984420776}]"
734
+ ]
735
+ },
736
+ "execution_count": 32,
737
+ "metadata": {},
738
+ "output_type": "execute_result"
739
+ }
740
+ ],
741
+ "source": [
742
+ "pipe([\"Sikap tidak peduli dia menyebabkan ibu bapa dia geram\", \"Saya sangat benci warna merah\", \"Cis! Dompet aku hilang!\"])"
743
+ ]
744
+ },
745
+ {
746
+ "cell_type": "code",
747
+ "execution_count": 34,
748
+ "metadata": {
749
+ "colab": {
750
+ "base_uri": "https://localhost:8080/"
751
+ },
752
+ "id": "czWBDOvlo20m",
753
+ "outputId": "25705b2d-32e8-42d9-866c-84cf499fd22e"
754
+ },
755
+ "outputs": [
756
+ {
757
+ "data": {
758
+ "text/plain": [
759
+ "[{'label': 'negative', 'score': 0.9114706516265869},\n",
760
+ " {'label': 'positive', 'score': 0.9896261692047119},\n",
761
+ " {'label': 'negative', 'score': 0.9341222047805786}]"
762
+ ]
763
+ },
764
+ "execution_count": 34,
765
+ "metadata": {},
766
+ "output_type": "execute_result"
767
+ }
768
+ ],
769
+ "source": [
770
+ "pipe([\"His don't care attitude causes much strife to his parents\", \"I hate red color\", \"Gah! My Wallet is missing!\"])"
771
+ ]
772
+ },
773
+ {
774
+ "cell_type": "code",
775
+ "execution_count": 21,
776
+ "metadata": {
777
+ "id": "akGTf-l_ndZy",
778
+ "pycharm": {
779
+ "name": "#%%\n"
780
+ }
781
+ },
782
+ "outputs": [],
783
+ "source": [
784
+ "def predict_sentiment(sentence):\n",
785
+ " predict_input = tokenizer.encode(sentence,\n",
786
+ " truncation=True,\n",
787
+ " padding=True,\n",
788
+ " return_tensors=\"tf\")\n",
789
+ "\n",
790
+ " tf_output = loaded_model.predict(predict_input)[0]\n",
791
+ " tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]\n",
792
+ "\n",
793
+ " sentiment = 0 if tf_prediction[0] > tf_prediction[1] else 1\n",
794
+ " print(tf_prediction)\n",
795
+ " return sentiment"
796
+ ]
797
+ },
798
+ {
799
+ "cell_type": "code",
800
+ "execution_count": 22,
801
+ "metadata": {
802
+ "colab": {
803
+ "base_uri": "https://localhost:8080/"
804
+ },
805
+ "id": "SG7PCrB3nlH0",
806
+ "outputId": "dc07eecc-13b0-4c02-94e6-c6c8e8036fa1"
807
+ },
808
+ "outputs": [
809
+ {
810
+ "name": "stdout",
811
+ "output_type": "stream",
812
+ "text": [
813
+ "[0.0143008 0.98569924]\n"
814
+ ]
815
+ },
816
+ {
817
+ "data": {
818
+ "text/plain": [
819
+ "1"
820
+ ]
821
+ },
822
+ "execution_count": 22,
823
+ "metadata": {},
824
+ "output_type": "execute_result"
825
+ }
826
+ ],
827
+ "source": [
828
+ "predict_sentiment(\"gembira\")"
829
+ ]
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "execution_count": 23,
834
+ "metadata": {
835
+ "colab": {
836
+ "base_uri": "https://localhost:8080/"
837
+ },
838
+ "id": "lWiz1MO1nlbO",
839
+ "outputId": "1ebca034-79cc-4774-e79b-88925c58b34d"
840
+ },
841
+ "outputs": [
842
+ {
843
+ "name": "stdout",
844
+ "output_type": "stream",
845
+ "text": [
846
+ "[0.57475716 0.4252428 ]\n"
847
+ ]
848
+ },
849
+ {
850
+ "data": {
851
+ "text/plain": [
852
+ "0"
853
+ ]
854
+ },
855
+ "execution_count": 23,
856
+ "metadata": {},
857
+ "output_type": "execute_result"
858
+ }
859
+ ],
860
+ "source": [
861
+ "predict_sentiment(\"marah\")"
862
+ ]
863
+ }
864
+ ],
865
+ "metadata": {
866
+ "accelerator": "GPU",
867
+ "colab": {
868
+ "collapsed_sections": [],
869
+ "name": "Hugging Face Bert Malay Sentiment.ipynb",
870
+ "provenance": []
871
+ },
872
+ "kernelspec": {
873
+ "display_name": "Python 3",
874
+ "language": "python",
875
+ "name": "python3"
876
+ },
877
+ "language_info": {
878
+ "codemirror_mode": {
879
+ "name": "ipython",
880
+ "version": 2
881
+ },
882
+ "file_extension": ".py",
883
+ "mimetype": "text/x-python",
884
+ "name": "python",
885
+ "nbconvert_exporter": "python",
886
+ "pygments_lexer": "ipython2",
887
+ "version": "2.7.6"
888
+ }
889
+ },
890
+ "nbformat": 4,
891
+ "nbformat_minor": 0
892
+ }
README.md ADDED
@@ -0,0 +1,19 @@
+ ---
+ language:
+ - ms
+ - en
+ license: apache-2.0
+ tags:
+ - sentiment-analysis
+ widget:
+ - text: "Gembiranya saya hari ini!"
+ ---
+
+ # bert-tiny-bahasa-cased-sentiment
+
+ Proof of concept of creating a sentiment analysis model using
+ https://huggingface.co/malay-huggingface/bert-base-bahasa-cased as the base model.
+
+ Tokenizer is copied directly from https://huggingface.co/malay-huggingface/bert-base-bahasa-cased.
+
+ Sentiment analysis fine-tuning was done with data compiled by [huseinzol05](https://github.com/huseinzol05/) at https://github.com/huseinzol05/malay-dataset/tree/master/sentiment.
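A minimal usage sketch for the resulting model, mirroring the `pipeline('text-classification', ...)` call in the notebook above. The repo id below is an assumption for illustration; substitute the actual Hub path this model is published under, or a local directory containing config.json, tf_model.h5 and the tokenizer files.

```python
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline

# Assumed repo id for illustration; replace with the real Hub path or a local folder.
model_id = "patrickxchong/bert-tiny-bahasa-cased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = TFAutoModelForSequenceClassification.from_pretrained(model_id)

# Same pipeline setup as in the notebook.
sentiment = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(sentiment(["Gembiranya saya hari ini!", "Cis! Dompet aku hilang!"]))
# Expected output shape: [{'label': 'positive', 'score': ...}, {'label': 'negative', 'score': ...}]
```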
archive/model-20211015/config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "_name_or_path": "malay-huggingface/bert-tiny-bahasa-cased",
+ "architectures": [
+ "BertForSequenceClassification"
+ ],
+ "id2label": {
+ "0": "negative",
+ "1": "positive"
+ },
+ "label2id": {
+ "negative": 0,
+ "positive": 1
+ },
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 336,
+ "initializer_range": 0.02,
+ "intermediate_size": 1344,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 4,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "torch_dtype": "float32",
+ "transformers_version": "4.11.3",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 32000
+ }
archive/model-20211015/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a8ba89838943372c63b4d6741b97589b76c7044a29110377e6d379b246cf01e
+ size 66016632
config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "_name_or_path": "malay-huggingface/bert-tiny-bahasa-cased",
+ "architectures": [
+ "BertForSequenceClassification"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 336,
+ "id2label": {
+ "0": "negative",
+ "1": "positive"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 1344,
+ "label2id": {
+ "negative": 0,
+ "positive": 1
+ },
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 4,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "torch_dtype": "float32",
+ "transformers_version": "4.11.3",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 32000
+ }
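The id2label / label2id entries in this config are what let the text-classification pipeline report the "negative"/"positive" strings seen in the notebook output instead of raw class indices. A small sketch of reading them back, assuming this repo's files are available in a local directory (here "." is an assumed path; a Hub repo id works the same way):

```python
from transformers import AutoConfig

# "." is an assumed local path containing this config.json.
config = AutoConfig.from_pretrained(".")
print(config.id2label)                               # e.g. {0: 'negative', 1: 'positive'}
print(config.num_hidden_layers, config.hidden_size)  # 4 336 -> the "tiny" BERT variant
```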
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:807c95d88ec7570eb5d9091a9f1eeea30795ef6f5fbea58c35cc1470a5192154
+ size 66016632
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": "/home/patrick/.cache/huggingface/transformers/8976f17381927c83231ffc41ac983516a57f6d0d6e7addbd5f38fa654e4269e0.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d", "tokenizer_file": null, "name_or_path": "malay-huggingface/bert-tiny-bahasa-cased", "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff