yasirfaizahmed committed
Commit 75f0757
1 Parent(s): afe3748

initial commit

.gitignore ADDED
@@ -0,0 +1,6 @@
+ .venv
+ *.h5
+ models
+ dist
+ *info
+ data
notes/BC_model_base.ipynb ADDED
@@ -0,0 +1,332 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "# reading dataset files\n",
+ "muslim_file = \"../data/right/Haqiqatjou.csv\"\n",
+ "muslim_df = pd.read_csv(muslim_file)\n",
+ "\n",
+ "\n",
+ "sanghi_file = \"../data/wrong/MrSinha_.csv\"\n",
+ "sanghi_df = pd.read_csv(sanghi_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# dataset formatting\n",
+ "muslim_df['category'] = 'MUSLIM'\n",
+ "\n",
+ "sanghi_df['category'] = 'SANGHI'\n",
+ "\n",
+ "\n",
+ "frames = [muslim_df, sanghi_df]\n",
+ "final_dataset = pd.concat(frames)\n",
+ "final_dataset = final_dataset.drop(['index'], axis=1)\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ref : https://stackoverflow.com/questions/43777243/how-to-split-a-dataframe-in-pandas-in-predefined-percentages\n",
+ "\n",
+ "# 70% train dataset\n",
+ "# train_frames = [muslim_df.sample(frac=0.7), sanghi_df.sample(frac=0.7).reset_index(drop=True)]\n",
+ "train_frames = [muslim_df, sanghi_df]\n",
+ "train_dataset = pd.concat(train_frames)\n",
+ "train_dataset['content'] = train_dataset['content'].astype(str)\n",
+ "\n",
+ "\n",
+ "# 30% test dataset\n",
+ "remaining_frames = [muslim_df.drop(muslim_df.sample(frac=0.7).index), sanghi_df.drop(sanghi_df.sample(frac=0.7).index).reset_index(drop=True)]\n",
+ "test_dataset = pd.concat(remaining_frames)\n",
+ "# test_dataset['content'] = train_dataset['content'].astype(str)\n",
+ "# test_dataset\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-03-22 14:25:54.140036: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+ "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2024-03-22 14:25:58.302374: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
+ ]
+ }
+ ],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import os\n",
+ "import re\n",
+ "import shutil\n",
+ "import string\n",
+ "import tensorflow as tf\n",
+ "from keras import layers\n",
+ "from keras import losses\n",
+ "import keras\n",
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tokenization\n",
+ "tokenizer = Tokenizer()\n",
+ "tokenizer.fit_on_texts(train_dataset['content'])\n",
+ "\n",
+ "vocab_size = len(tokenizer.word_index) + 1\n",
+ "max_length = 100 # adjust as needed\n",
+ "embedding_dim = 50 # adjust as needed\n",
+ "\n",
+ "X = tokenizer.texts_to_sequences(train_dataset['content'])\n",
+ "X = pad_sequences(X, maxlen=max_length, padding='post')\n",
+ "\n",
+ "# Encode labels\n",
+ "labels = train_dataset['category'].map({'MUSLIM': 1, 'SANGHI': 0}).astype('float32').values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/xd/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/keras/src/layers/core/embedding.py:81: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
+ " super().__init__(**kwargs)\n",
+ "2024-03-21 20:20:29.968292: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-21 20:20:33.617785: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-21 20:20:33.618176: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-21 20:20:33.620147: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-21 20:20:33.620655: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-21 20:20:33.620962: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-21 20:20:33.824877: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-21 20:20:33.825245: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-21 20:20:33.825539: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-21 20:20:33.841867: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1754 MB memory: -> device: 0, name: NVIDIA GeForce 920MX, pci bus id: 0000:01:00.0, compute capability: 5.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# build the model\n",
+ "model = keras.Sequential([\n",
+ " keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),\n",
+ " keras.layers.GlobalAveragePooling1D(),\n",
+ " keras.layers.Dense(16, activation='relu'),\n",
+ " keras.layers.Dense(1, activation='sigmoid')\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compile the model\n",
+ "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split into train and test datasets\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
+ "I0000 00:00:1711032641.198169 31002 service.cc:145] XLA service 0x7fa54c006910 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n",
+ "I0000 00:00:1711032641.198244 31002 service.cc:153] StreamExecutor device (0): NVIDIA GeForce 920MX, Compute Capability 5.0\n",
+ "2024-03-21 20:20:41.877803: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2024-03-21 20:20:42.791103: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m34/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - accuracy: 0.5261 - loss: 0.6929"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "I0000 00:00:1711032645.847800 31002 device_compiler.h:188] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 71ms/step - accuracy: 0.5337 - loss: 0.6922 - val_accuracy: 0.5514 - val_loss: 0.6814\n",
+ "Epoch 2/10\n",
+ "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.5740 - loss: 0.6781 - val_accuracy: 0.5563 - val_loss: 0.6636\n",
+ "Epoch 3/10\n",
+ "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.6603 - loss: 0.6492 - val_accuracy: 0.7781 - val_loss: 0.6244\n",
+ "Epoch 4/10\n",
+ "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.8210 - loss: 0.5806 - val_accuracy: 0.8328 - val_loss: 0.5476\n",
+ "Epoch 5/10\n",
+ "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - accuracy: 0.8506 - loss: 0.4952 - val_accuracy: 0.7653 - val_loss: 0.4860\n",
+ "Epoch 6/10\n",
+ "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.8238 - loss: 0.4224 - val_accuracy: 0.8842 - val_loss: 0.4122\n",
+ "Epoch 7/10\n",
+ "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.9288 - loss: 0.3257 - val_accuracy: 0.8408 - val_loss: 0.3694\n",
+ "Epoch 8/10\n",
+ "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.9280 - loss: 0.2705 - val_accuracy: 0.9357 - val_loss: 0.3032\n",
+ "Epoch 9/10\n",
+ "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.9178 - loss: 0.2446 - val_accuracy: 0.9357 - val_loss: 0.2772\n",
+ "Epoch 10/10\n",
+ "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.9724 - loss: 0.1830 - val_accuracy: 0.8826 - val_loss: 0.2798\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "<keras.src.callbacks.history.History at 0x7fa5504b6080>"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Train the model\n",
+ "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m20/20\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 4ms/step - accuracy: 0.8722 - loss: 0.2829\n",
+ "Test Accuracy: 0.8826366662979126\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Evaluate the model\n",
+ "loss, accuracy = model.evaluate(X_test, y_test)\n",
+ "print(\"Test Accuracy:\", accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save the model\n",
+ "try:\n",
+ " model.save(\"../models/muslim-sanghi-binary-classification.keras\")\n",
+ "except FileNotFoundError:\n",
+ " os.mkdir(\"../models\")\n",
+ " model.save(\"../models/muslim-sanghi-binary-classification.keras\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 250ms/step\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.3806811"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# use model\n",
+ "def preprocess_text(text):\n",
+ " # Tokenize the text\n",
+ " tokenized_text = tokenizer.texts_to_sequences([text])\n",
+ " # Pad sequences to the same length as training data\n",
+ " padded_text = pad_sequences(tokenized_text, maxlen=max_length, padding='post')\n",
+ " return padded_text\n",
+ "\n",
+ "# load the trained model\n",
+ "model = keras.models.load_model(\"../models/muslim-sanghi-binary-classification.keras\")\n",
+ "\n",
+ "# Preprocess the custom input text\n",
+ "preprocessed_text = preprocess_text(\"'Kanwariyas are slaves, they're doing slavery' A team member of Pappu shamelessly says it & Pappu happily agrees\")\n",
+ "\n",
+ "# Make predictions\n",
+ "predictions = model.predict(preprocessed_text)\n",
+ "\n",
+ "predictions[0][0]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
notes/custom/right_wrong_BC_model.ipynb ADDED
@@ -0,0 +1,476 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from pathlib import Path as pp\n",
+ "from pandas.core.frame import DataFrame as df\n",
+ "import os\n",
+ "\n",
+ "# reading dataset files\n",
+ "right_dir: pp = pp(\"../data/right\")\n",
+ "wrong_dir: pp = pp(\"../data/wrong\")\n",
+ "\n",
+ "# dataframes\n",
+ "right_df: df = df()\n",
+ "wrong_df: df = df()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# merging right datafiles into a dataframe\n",
+ "rdf_list = []\n",
+ "for file in os.scandir(right_dir.absolute()):\n",
+ " if os.path.exists(file):\n",
+ " rdf_list.append(pd.read_csv(file))\n",
+ "right_df = pd.concat(rdf_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>index</th>\n",
+ " <th>content</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>0</td>\n",
+ " <td>Feeling annoyed? Follow this thread</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>1</td>\n",
+ " <td>Thread, why you should vote for BJP. #PhirEKBa...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>2</td>\n",
+ " <td>The biggest festival of democracy is here! EC ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>3</td>\n",
+ " <td>Slogan competition \\nShare your slogan idea ( ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>4</td>\n",
+ " <td>4 जून की करो तैयारी,\\n\\nआ रहे हैं भगवाधारी....</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>92</th>\n",
+ " <td>92</td>\n",
+ " <td>मैं बता रहा हूँ, बेंगलुरू मामले में भी बहुत ते...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>93</th>\n",
+ " <td>93</td>\n",
+ " <td>ईरान में करीब 1200 भारतीय फंसे हैं। 800 छात्र ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>94</th>\n",
+ " <td>94</td>\n",
+ " <td>वैसे तो TikTok ने PMCares फंड में LAC पर चीन स...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>95</th>\n",
+ " <td>95</td>\n",
+ " <td>बंगाल में भीड़ से खचाखच भरी रैलियां हो सकती है...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>96</th>\n",
+ " <td>96</td>\n",
+ " <td>Thank You PM \\n@narendramodi\\n ji for follow b...</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>1675 rows × 2 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " index content\n",
+ "0 0 Feeling annoyed? Follow this thread \n",
+ "1 1 Thread, why you should vote for BJP. #PhirEKBa...\n",
+ "2 2 The biggest festival of democracy is here! EC ...\n",
+ "3 3 Slogan competition \\nShare your slogan idea ( ...\n",
+ "4 4 4 जून की करो तैयारी,\\n\\nआ रहे हैं भगवाधारी....\n",
+ ".. ... ...\n",
+ "92 92 मैं बता रहा हूँ, बेंगलुरू मामले में भी बहुत ते...\n",
+ "93 93 ईरान में करीब 1200 भारतीय फंसे हैं। 800 छात्र ...\n",
+ "94 94 वैसे तो TikTok ने PMCares फंड में LAC पर चीन स...\n",
+ "95 95 बंगाल में भीड़ से खचाखच भरी रैलियां हो सकती है...\n",
+ "96 96 Thank You PM \\n@narendramodi\\n ji for follow b...\n",
+ "\n",
+ "[1675 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# merging wrong datafiles into a dataframe\n",
+ "ldf_list = []\n",
+ "for file in os.scandir(wrong_dir.absolute()):\n",
+ " if os.path.exists(file):\n",
+ " ldf_list.append(pd.read_csv(file))\n",
+ "wrong_df = pd.concat(ldf_list)\n",
+ "wrong_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add category to the dataframes\n",
+ "right_df['category'] = 'RIGHT'\n",
+ "wrong_df['category'] = 'WRONG'\n",
+ "\n",
+ "# cleaning up the columns and adding the binary labels to the dataframes\n",
+ "frame = [right_df, wrong_df]\n",
+ "final_dataset: df = pd.concat(frame)\n",
+ "\n",
+ "final_dataset = final_dataset.drop('index', axis=1)\n",
+ "final_dataset['content'] = final_dataset['content'].astype(str) # all rows in 'content' column must be of type str\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-03-29 19:54:59.568730: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+ "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2024-03-29 19:55:07.003956: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
+ ]
+ }
+ ],
+ "source": [
+ "# imports for model creation\n",
+ "import tensorflow as tf\n",
+ "import keras\n",
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tokenization of dataset\n",
+ "tokenizer = Tokenizer()\n",
+ "tokenizer.fit_on_texts(final_dataset['content'])\n",
+ "\n",
+ "vocab_size = len(tokenizer.word_index) + 1 # our dataset vocab size (space split)\n",
+ "max_length = 200 # max words in a sentence\n",
+ "embedding_dim = 50 # TODO: need to adjust accordingly\n",
+ "\n",
+ "X = tokenizer.texts_to_sequences(final_dataset['content'])\n",
+ "X = pad_sequences(X, maxlen=max_length, padding='post')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Encode the labels\n",
+ "labels = final_dataset['category'].map({'RIGHT': 1, 'WRONG': 0}).astype('float32').values\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/xd/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/keras/src/layers/core/embedding.py:81: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
+ " super().__init__(**kwargs)\n",
+ "2024-03-29 19:55:16.544096: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-29 19:55:21.152411: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-29 19:55:21.152806: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-29 19:55:21.154795: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-29 19:55:21.155560: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-29 19:55:21.156123: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-29 19:55:21.347819: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-29 19:55:21.348262: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-29 19:55:21.348616: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+ "2024-03-29 19:55:21.365839: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1754 MB memory: -> device: 0, name: NVIDIA GeForce 920MX, pci bus id: 0000:01:00.0, compute capability: 5.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Build the model\n",
+ "model = keras.Sequential([\n",
+ " keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),\n",
+ " keras.layers.GlobalAveragePooling1D(),\n",
+ " keras.layers.Dense(16, activation='relu'),\n",
+ " keras.layers.Dense(1, activation='sigmoid')\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compile the model\n",
+ "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(3140, 200)\n",
+ "(1347, 200)\n",
+ "(3140,)\n",
+ "(1347,)\n",
+ "(4487,)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# split the dataset into train and test\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42, shuffle=True)\n",
+ "print(X_train.shape)\n",
+ "print(X_test.shape)\n",
+ "print(y_train.shape)\n",
+ "print(y_test.shape)\n",
+ "\n",
+ "print(labels.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
+ "I0000 00:00:1711722330.932635 76533 service.cc:145] XLA service 0x7bbdd40053b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n",
+ "I0000 00:00:1711722330.932689 76533 service.cc:153] StreamExecutor device (0): NVIDIA GeForce 920MX, Compute Capability 5.0\n",
+ "2024-03-29 19:55:31.370178: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
+ "2024-03-29 19:55:33.092199: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m17/99\u001b[0m \u001b[32m━━━\u001b[0m\u001b[37m━━━━━━━━━━━━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.5972 - loss: 0.6806"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "I0000 00:00:1711722338.226902 76533 device_compiler.h:188] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m15s\u001b[0m 56ms/step - accuracy: 0.6186 - loss: 0.6631 - val_accuracy: 0.6511 - val_loss: 0.6453\n",
+ "Epoch 2/10\n",
+ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.6242 - loss: 0.6554 - val_accuracy: 0.6511 - val_loss: 0.6393\n",
+ "Epoch 3/10\n",
+ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.6134 - loss: 0.6635 - val_accuracy: 0.6511 - val_loss: 0.6375\n",
+ "Epoch 4/10\n",
+ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.6270 - loss: 0.6466 - val_accuracy: 0.6511 - val_loss: 0.6305\n",
+ "Epoch 5/10\n",
+ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.6063 - loss: 0.6464 - val_accuracy: 0.6548 - val_loss: 0.6168\n",
+ "Epoch 6/10\n",
+ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.6640 - loss: 0.6361 - val_accuracy: 0.6600 - val_loss: 0.5972\n",
+ "Epoch 7/10\n",
+ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 8ms/step - accuracy: 0.6912 - loss: 0.5938 - val_accuracy: 0.7053 - val_loss: 0.5723\n",
+ "Epoch 8/10\n",
+ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.7263 - loss: 0.5345 - val_accuracy: 0.5880 - val_loss: 0.6528\n",
+ "Epoch 9/10\n",
+ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 6ms/step - accuracy: 0.7473 - loss: 0.5100 - val_accuracy: 0.7669 - val_loss: 0.5012\n",
+ "Epoch 10/10\n",
+ "\u001b[1m99/99\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 6ms/step - accuracy: 0.8396 - loss: 0.4156 - val_accuracy: 0.7194 - val_loss: 0.5071\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "<keras.src.callbacks.history.History at 0x7bbdc43be710>"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# train the model\n",
+ "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m43/43\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 4ms/step - accuracy: 0.7254 - loss: 0.5047\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.7193763852119446"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Evaluate the model\n",
+ "loss, accuracy = model.evaluate(X_test, y_test)\n",
+ "accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save the model\n",
+ "try:\n",
+ " model.save(\"../models/right-wrong-BC.keras\")\n",
+ "except FileNotFoundError:\n",
+ " os.mkdir(\"../models\")\n",
+ " model.save(\"../models/right-wrong-BC.keras\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 347ms/step\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.71704614"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# use model\n",
+ "def preprocess_text(text):\n",
+ " # Tokenize the text\n",
+ " tokenized_text = tokenizer.texts_to_sequences([text])\n",
+ " # Pad sequences to the same length as training data\n",
+ " padded_text = pad_sequences(tokenized_text, maxlen=max_length, padding='post')\n",
+ " return padded_text\n",
+ "\n",
+ "# load the trained model\n",
+ "model = keras.models.load_model(\"../models/right-wrong-BC.keras\")\n",
+ "\n",
+ "# Preprocess the custom input text\n",
+ "preprocessed_text = preprocess_text(\"Modi ji is Moon Pappu on Bangkok honeymoon\")\n",
+ "\n",
+ "# Make predictions\n",
+ "predictions = model.predict(preprocessed_text)\n",
+ "\n",
+ "predictions[0][0]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
notes/custom/sentimental_analysis_model.ipynb ADDED
@@ -0,0 +1,324 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[1;32m 3\u001b[0m dataset \u001b[38;5;241m=\u001b[39m load_dataset(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimsoumyaneel/sentiment-analysis-llama2\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+ "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/datasets/__init__.py:18\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# ruff: noqa\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[1;32m 16\u001b[0m __version__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m2.18.0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01marrow_dataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01marrow_reader\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ReadInstruction\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbuilder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder\n",
+ "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/datasets/arrow_dataset.py:59\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mfsspec\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m---> 59\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpa\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompute\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpc\u001b[39;00m\n",
+ "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pandas/__init__.py:26\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m _hard_dependencies, _dependency, _missing_dependencies\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# numpy compat\u001b[39;00m\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 27\u001b[0m is_numpy_dev \u001b[38;5;28;01mas\u001b[39;00m _is_numpy_dev, \u001b[38;5;66;03m# pyright: ignore[reportUnusedImport] # noqa: F401\u001b[39;00m\n\u001b[1;32m 28\u001b[0m )\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m _err: \u001b[38;5;66;03m# pragma: no cover\u001b[39;00m\n\u001b[1;32m 30\u001b[0m _module \u001b[38;5;241m=\u001b[39m _err\u001b[38;5;241m.\u001b[39mname\n",
+ "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pandas/compat/__init__.py:27\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompressors\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m is_numpy_dev\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 28\u001b[0m pa_version_under10p1,\n\u001b[1;32m 29\u001b[0m pa_version_under11p0,\n\u001b[1;32m 30\u001b[0m pa_version_under13p0,\n\u001b[1;32m 31\u001b[0m pa_version_under14p0,\n\u001b[1;32m 32\u001b[0m pa_version_under14p1,\n\u001b[1;32m 33\u001b[0m )\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m TYPE_CHECKING:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_typing\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m F\n",
+ "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pandas/compat/pyarrow.py:8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutil\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Version\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpa\u001b[39;00m\n\u001b[1;32m 10\u001b[0m _palv \u001b[38;5;241m=\u001b[39m Version(Version(pa\u001b[38;5;241m.\u001b[39m__version__)\u001b[38;5;241m.\u001b[39mbase_version)\n\u001b[1;32m 11\u001b[0m pa_version_under10p1 \u001b[38;5;241m=\u001b[39m _palv \u001b[38;5;241m<\u001b[39m Version(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m10.0.1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+ "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pyarrow/__init__.py:65\u001b[0m\n\u001b[1;32m 63\u001b[0m _gc_enabled \u001b[38;5;241m=\u001b[39m _gc\u001b[38;5;241m.\u001b[39misenabled()\n\u001b[1;32m 64\u001b[0m _gc\u001b[38;5;241m.\u001b[39mdisable()\n\u001b[0;32m---> 65\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_lib\u001b[39;00m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _gc_enabled:\n\u001b[1;32m 67\u001b[0m _gc\u001b[38;5;241m.\u001b[39menable()\n",
+ "File \u001b[0;32m<frozen importlib._bootstrap>:404\u001b[0m, in \u001b[0;36mparent\u001b[0;34m(self)\u001b[0m\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset\n",
+ "\n",
+ "dataset = load_dataset(\"imsoumyaneel/sentiment-analysis-llama2\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from pandas.core.frame import DataFrame as df\n",
+ "\n",
+ "train_dataset = df(dataset['train'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>sentence</th>\n",
+ " <th>label</th>\n",
+ " <th>text</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>I'll throw out the garbage .</td>\n",
+ " <td>neutral</td>\n",
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>So Dick , how about getting some coffee for to...</td>\n",
+ " <td>joy</td>\n",
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>Come on , you can at least try a little , besi...</td>\n",
+ " <td>neutral</td>\n",
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>What ’ s wrong with that ? Cigarette is the th...</td>\n",
+ " <td>anger</td>\n",
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>Not for me , Dick .</td>\n",
+ " <td>neutral</td>\n",
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>598293</th>\n",
+ " <td>You got banned for participating in a brigade.</td>\n",
+ " <td>sadness</td>\n",
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>598294</th>\n",
+ " <td>A joke is subjective pal, second of all you ne...</td>\n",
+ " <td>joy</td>\n",
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>598295</th>\n",
+ " <td>Well, I'm glad you're out of all that now. How...</td>\n",
+ " <td>joy</td>\n",
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>598296</th>\n",
+ " <td>Everyone likes [NAME].</td>\n",
+ " <td>love</td>\n",
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>598297</th>\n",
+ " <td>The FDA has plenty to criticize. But like here...</td>\n",
+ " <td>anger</td>\n",
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>598298 rows × 3 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " sentence label \\\n",
+ "0 I'll throw out the garbage . neutral \n",
+ "1 So Dick , how about getting some coffee for to... joy \n",
+ "2 Come on , you can at least try a little , besi... neutral \n",
+ "3 What ’ s wrong with that ? Cigarette is the th... anger \n",
+ "4 Not for me , Dick . neutral \n",
+ "... ... ... \n",
+ "598293 You got banned for participating in a brigade. sadness \n",
+ "598294 A joke is subjective pal, second of all you ne... joy \n",
+ "598295 Well, I'm glad you're out of all that now. How... joy \n",
+ "598296 Everyone likes [NAME]. love \n",
+ "598297 The FDA has plenty to criticize. But like here... anger \n",
+ "\n",
+ " text \n",
+ "0 ###Human:\\nyou are a sentiment analist. guess ... \n",
+ "1 ###Human:\\nyou are a sentiment analist. guess ... \n",
+ "2 ###Human:\\nyou are a sentiment analist. guess ... \n",
+ "3 ###Human:\\nyou are a sentiment analist. guess ... \n",
+ "4 ###Human:\\nyou are a sentiment analist. guess ... \n",
+ "... ... \n",
+ "598293 ###Human:\\nyou are a sentiment analist. guess ... \n",
+ "598294 ###Human:\\nyou are a sentiment analist. guess ... \n",
+ "598295 ###Human:\\nyou are a sentiment analist. guess ... \n",
+ "598296 ###Human:\\nyou are a sentiment analist. guess ... \n",
+ "598297 ###Human:\\nyou are a sentiment analist. guess ... \n",
+ "\n",
+ "[598298 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# imports for model creation\n",
+ "import os\n",
+ "import tensorflow as tf\n",
+ "from keras import layers\n",
+ "from keras import losses\n",
+ "import keras\n",
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tokenization of dataset\n",
+ "tokenizer = Tokenizer()\n",
+ "tokenizer.fit_on_texts(train_dataset['sentence'])\n",
+ "\n",
+ "vocab_size = len(tokenizer.word_index) + 1 # our dataset vocab size (space split)\n",
+ "max_length = 200 # max words in a sentence\n",
+ "embedding_dim = 50 # TODO: need to adjust accordingly\n",
+ "\n",
+ "X = tokenizer.texts_to_sequences(train_dataset['sentence'])\n",
+ "X = pad_sequences(X, maxlen=max_length, padding='post')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Encode the labels as integer class ids (7 emotion classes)\n",
+ "labels = train_dataset['label'].map({'neutral': 0, 'joy': 1, 'sadness': 2, 'anger': 3, 'fear': 4, 'love': 5, 'surprise': 6}).astype('int64').values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the model\n",
+ "model = keras.Sequential([\n",
+ " keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),\n",
+ " keras.layers.GlobalAveragePooling1D(),\n",
+ " keras.layers.Dense(16, activation='relu'),\n",
+ " keras.layers.Dense(7, activation='softmax') # one unit per emotion class\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compile the model (sparse categorical cross-entropy for integer multi-class labels)\n",
+ "model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# split the dataset into train and test\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42, shuffle=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# train the model\n",
+ "model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Evaluate the model\n",
+ "loss, accuracy = model.evaluate(X_test, y_test)\n",
+ "accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save the model\n",
+ "try:\n",
+ " model.save(\"../models/sentimental-analysis-llama2.keras\")\n",
+ "except FileNotFoundError:\n",
+ " os.mkdir(\"../models\")\n",
+ " model.save(\"../models/sentimental-analysis-llama2.keras\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
notes/dair_ai-emotions_classification.ipynb ADDED
@@ -0,0 +1,450 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "2b46cd0f-ae6d-4781-8dab-89df1f880ada",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-31T14:00:16.074098Z",
+ "iopub.status.busy": "2024-03-31T14:00:16.073728Z",
+ "iopub.status.idle": "2024-03-31T14:00:16.461502Z",
+ "shell.execute_reply": "2024-03-31T14:00:16.460565Z",
+ "shell.execute_reply.started": "2024-03-31T14:00:16.074074Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "No config specified, defaulting to: emotion/split\n",
+ "Reusing dataset emotion (/root/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c3b3103a90124dfea0bbec49829f8746",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/3 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "DatasetDict({\n",
+ " train: Dataset({\n",
+ " features: ['text', 'label'],\n",
+ " num_rows: 16000\n",
+ " })\n",
+ " validation: Dataset({\n",
+ " features: ['text', 'label'],\n",
+ " num_rows: 2000\n",
+ " })\n",
+ " test: Dataset({\n",
+ " features: ['text', 'label'],\n",
+ " num_rows: 2000\n",
+ " })\n",
+ "})"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset\n",
+ "\n",
+ "dataset = load_dataset(\"dair-ai/emotion\")\n",
+ "dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "cbdbcb02-bcc5-4928-8eff-e238437f004b",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-31T14:00:16.463534Z",
+ "iopub.status.busy": "2024-03-31T14:00:16.463202Z",
+ "iopub.status.idle": "2024-03-31T14:00:16.498438Z",
+ "shell.execute_reply": "2024-03-31T14:00:16.497492Z",
+ "shell.execute_reply.started": "2024-03-31T14:00:16.463508Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "Int64Index: 18000 entries, 0 to 1999\n",
+ "Data columns (total 2 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 text 18000 non-null object\n",
+ " 1 label 18000 non-null int64 \n",
+ "dtypes: int64(1), object(1)\n",
+ "memory usage: 421.9+ KB\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>text</th>\n",
+ " <th>label</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>i didnt feel humiliated</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>i can go from feeling so hopeless to so damned...</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>im grabbing a minute to post i feel greedy wrong</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>i am ever feeling nostalgic about the fireplac...</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>i am feeling grouchy</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " text label\n",
+ "0 i didnt feel humiliated 0\n",
+ "1 i can go from feeling so hopeless to so damned... 0\n",
+ "2 im grabbing a minute to post i feel greedy wrong 3\n",
+ "3 i am ever feeling nostalgic about the fireplac... 2\n",
+ "4 i am feeling grouchy 3"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "dataset_train = dataset['train'].to_pandas()\n",
+ "dataset_test = dataset['test'].to_pandas()\n",
+ "\n",
+ "complete_dataset = pd.concat([dataset_train, dataset_test])\n",
+ "\n",
+ "complete_dataset.info()\n",
+ "complete_dataset.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "0cfc926c-70df-4089-9b2e-f201faa223df",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-03-31T14:00:16.500097Z",
187
+ "iopub.status.busy": "2024-03-31T14:00:16.499802Z",
188
+ "iopub.status.idle": "2024-03-31T14:00:16.505424Z",
189
+ "shell.execute_reply": "2024-03-31T14:00:16.504005Z",
190
+ "shell.execute_reply.started": "2024-03-31T14:00:16.500071Z"
191
+ }
192
+ },
193
+ "outputs": [],
194
+ "source": [
195
+ "# imports for model creation\n",
196
+ "import tensorflow as tf\n",
197
+ "from keras import layers\n",
198
+ "from keras import losses\n",
199
+ "import keras\n",
200
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
201
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": 5,
207
+ "id": "eb4bab6b-ae99-4fae-bf0b-ca91be630db3",
208
+ "metadata": {
209
+ "execution": {
210
+ "iopub.execute_input": "2024-03-31T14:00:16.508818Z",
211
+ "iopub.status.busy": "2024-03-31T14:00:16.508418Z",
212
+ "iopub.status.idle": "2024-03-31T14:00:17.259616Z",
213
+ "shell.execute_reply": "2024-03-31T14:00:17.258634Z",
214
+ "shell.execute_reply.started": "2024-03-31T14:00:16.508782Z"
215
+ }
216
+ },
217
+ "outputs": [],
218
+ "source": [
219
+ "tokenizer = Tokenizer()\n",
220
+ "tokenizer.fit_on_texts(complete_dataset['text'])\n",
221
+ "\n",
222
+ "vocab_size = len(tokenizer.word_index) + 1\n",
223
+ "max_length = 200 # max words in a sentence\n",
224
+ "embedding_dim = 50 # TODO: need to adjust accordingly\n",
225
+ "\n",
226
+ "X_train = tokenizer.texts_to_sequences(dataset_train['text'])\n",
227
+ "X_train = pad_sequences(X_train, maxlen=max_length, padding='post')\n",
228
+ "\n",
229
+ "X_test = tokenizer.texts_to_sequences(dataset_test['text'])\n",
230
+ "X_test = pad_sequences(X_test, maxlen=max_length, padding='post')\n",
231
+ "\n",
232
+ "y_train = dataset_train['label']\n",
233
+ "y_test = dataset_test['label']\n",
234
+ "\n",
235
+ "from keras.utils import to_categorical\n",
236
+ "\n",
237
+ "num_classes = 6 # Assuming you have 3 classes\n",
238
+ "y_train_encoded = to_categorical(y_train, num_classes=num_classes)\n",
239
+ "y_test_encoded = to_categorical(y_test, num_classes=num_classes)"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 6,
245
+ "id": "d7202d74-95c7-4bb2-aea5-54481dfcafd6",
246
+ "metadata": {
247
+ "execution": {
248
+ "iopub.execute_input": "2024-03-31T14:00:17.261577Z",
249
+ "iopub.status.busy": "2024-03-31T14:00:17.261236Z",
250
+ "iopub.status.idle": "2024-03-31T14:00:17.267983Z",
251
+ "shell.execute_reply": "2024-03-31T14:00:17.267053Z",
252
+ "shell.execute_reply.started": "2024-03-31T14:00:17.261539Z"
253
+ }
254
+ },
255
+ "outputs": [
256
+ {
257
+ "name": "stdout",
258
+ "output_type": "stream",
259
+ "text": [
260
+ "(16000, 200)\n",
261
+ "(2000, 200)\n",
262
+ "(16000,)\n",
263
+ "(2000,)\n"
264
+ ]
265
+ }
266
+ ],
267
+ "source": [
268
+ "labels = complete_dataset['label']\n",
269
+ "\n",
270
+ "print(X_train.shape)\n",
271
+ "print(X_test.shape)\n",
272
+ "print(y_train.shape)\n",
273
+ "print(y_test.shape)"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 7,
279
+ "id": "738e3137-7ea4-4e71-9395-773e537083cf",
280
+ "metadata": {
281
+ "execution": {
282
+ "iopub.execute_input": "2024-03-31T14:00:17.269524Z",
283
+ "iopub.status.busy": "2024-03-31T14:00:17.269276Z",
284
+ "iopub.status.idle": "2024-03-31T14:00:17.328258Z",
285
+ "shell.execute_reply": "2024-03-31T14:00:17.327093Z",
286
+ "shell.execute_reply.started": "2024-03-31T14:00:17.269498Z"
287
+ }
288
+ },
289
+ "outputs": [],
290
+ "source": [
291
+ "# Build the model\n",
292
+ "model = keras.Sequential([\n",
293
+ " keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),\n",
294
+ " keras.layers.GlobalAveragePooling1D(),\n",
295
+ " keras.layers.Dense(32, activation='relu'),\n",
296
+ " keras.layers.Dense(6, activation='sigmoid')\n",
297
+ "])"
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": 8,
303
+ "id": "cd072f39-99e6-44f0-8c7f-106a0055c43b",
304
+ "metadata": {
305
+ "execution": {
306
+ "iopub.execute_input": "2024-03-31T14:00:17.330501Z",
307
+ "iopub.status.busy": "2024-03-31T14:00:17.330148Z",
308
+ "iopub.status.idle": "2024-03-31T14:00:17.340754Z",
309
+ "shell.execute_reply": "2024-03-31T14:00:17.339909Z",
310
+ "shell.execute_reply.started": "2024-03-31T14:00:17.330466Z"
311
+ }
312
+ },
313
+ "outputs": [],
314
+ "source": [
315
+ "# Compile the model\n",
316
+ "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 9,
322
+ "id": "72ad6548-5d1c-4221-88c7-014dcbaea0ee",
323
+ "metadata": {
324
+ "execution": {
325
+ "iopub.execute_input": "2024-03-31T14:00:17.342873Z",
326
+ "iopub.status.busy": "2024-03-31T14:00:17.342494Z",
327
+ "iopub.status.idle": "2024-03-31T14:00:17.347797Z",
328
+ "shell.execute_reply": "2024-03-31T14:00:17.346354Z",
329
+ "shell.execute_reply.started": "2024-03-31T14:00:17.342836Z"
330
+ }
331
+ },
332
+ "outputs": [],
333
+ "source": [
334
+ "# split the dataset into train and test\n",
335
+ "# from sklearn.model_selection import train_test_split\n",
336
+ "\n",
337
+ "# X_train, X_test, y_train, y_test = train_test_split(, labels, test_size=0.3, random_state=42, shuffle=True)\n",
338
+ "# X_train"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": 12,
344
+ "id": "9267da90-7a84-49d1-94d0-04a2cd3062e0",
345
+ "metadata": {
346
+ "execution": {
347
+ "iopub.execute_input": "2024-03-31T14:00:43.055635Z",
348
+ "iopub.status.busy": "2024-03-31T14:00:43.055255Z",
349
+ "iopub.status.idle": "2024-03-31T14:01:14.962372Z",
350
+ "shell.execute_reply": "2024-03-31T14:01:14.961429Z",
351
+ "shell.execute_reply.started": "2024-03-31T14:00:43.055606Z"
352
+ }
353
+ },
354
+ "outputs": [
355
+ {
356
+ "name": "stdout",
357
+ "output_type": "stream",
358
+ "text": [
359
+ "Epoch 1/15\n"
360
+ ]
361
+ },
362
+ {
363
+ "name": "stderr",
364
+ "output_type": "stream",
365
+ "text": [
366
+ "2024-03-31 14:12:18.969565: E tensorflow/stream_executor/cuda/cuda_blas.cc:232] failed to create cublas handle: CUBLAS_STATUS_NOT_INITIALIZED\n",
367
+ "2024-03-31 14:12:18.969636: E tensorflow/stream_executor/cuda/cuda_blas.cc:234] Failure to initialize cublas may be due to OOM (cublas needs some free memory when you initialize it, and your deep-learning framework may have preallocated more than its fair share), or may be because this binary was not built with support for the GPU in your machine.\n",
368
+ "2024-03-31 14:12:18.969674: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at matmul_op_impl.h:438 : INTERNAL: Attempting to perform BLAS operation using StreamExecutor without BLAS support\n"
369
+ ]
370
+ },
371
+ {
372
+ "ename": "InternalError",
373
+ "evalue": "Graph execution error:\n\nDetected at node 'sequential/dense/MatMul' defined at (most recent call last):\n File \"/usr/lib/python3.9/runpy.py\", line 197, in _run_module_as_main\n return _run_code(code, main_globals, None,\n File \"/usr/lib/python3.9/runpy.py\", line 87, in _run_code\n exec(code, run_globals)\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel_launcher.py\", line 17, in <module>\n app.launch_new_instance()\n File \"/usr/local/lib/python3.9/dist-packages/traitlets/config/application.py\", line 1041, in launch_instance\n app.start()\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/kernelapp.py\", line 712, in start\n self.io_loop.start()\n File \"/usr/local/lib/python3.9/dist-packages/tornado/platform/asyncio.py\", line 199, in start\n self.asyncio_loop.run_forever()\n File \"/usr/lib/python3.9/asyncio/base_events.py\", line 601, in run_forever\n self._run_once()\n File \"/usr/lib/python3.9/asyncio/base_events.py\", line 1905, in _run_once\n handle._run()\n File \"/usr/lib/python3.9/asyncio/events.py\", line 80, in _run\n self._context.run(self._callback, *self._args)\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py\", line 510, in dispatch_queue\n await self.process_one()\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py\", line 499, in process_one\n await dispatch(*args)\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py\", line 406, in dispatch_shell\n await result\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py\", line 730, in execute_request\n reply_content = await reply_content\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/ipkernel.py\", line 383, in do_execute\n res = shell.run_cell(\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/zmqshell.py\", line 528, in run_cell\n return super().run_cell(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py\", line 2885, in run_cell\n result = self._run_cell(\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py\", line 2940, in _run_cell\n return runner(coro)\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/async_helpers.py\", line 129, in _pseudo_sync_runner\n coro.send(None)\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py\", line 3139, in run_cell_async\n has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py\", line 3318, in run_ast_nodes\n if await self.run_code(code, result, async_=asy):\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py\", line 3378, in run_code\n exec(code_obj, self.user_global_ns, self.user_ns)\n File \"/tmp/ipykernel_517/2691274767.py\", line 2, in <module>\n model.fit(X_train, y_train_encoded, epochs=15, batch_size=32, validation_data=(X_test, y_test_encoded))\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 64, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 1409, in fit\n tmp_logs = self.train_function(iterator)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 1051, in train_function\n return step_function(self, iterator)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 1040, in step_function\n outputs = model.distribute_strategy.run(run_step, 
args=(data,))\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 1030, in run_step\n outputs = model.train_step(data)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 889, in train_step\n y_pred = self(x, training=True)\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 64, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 490, in __call__\n return super().__call__(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 64, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/base_layer.py\", line 1014, in __call__\n outputs = call_fn(inputs, *args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 92, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/sequential.py\", line 374, in call\n return super(Sequential, self).call(inputs, training=training, mask=mask)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/functional.py\", line 458, in call\n return self._run_internal_graph(\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/functional.py\", line 596, in _run_internal_graph\n outputs = node.layer(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 64, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/base_layer.py\", line 1014, in __call__\n outputs = call_fn(inputs, *args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 92, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/layers/core/dense.py\", line 221, in call\n outputs = tf.matmul(a=inputs, b=self.kernel)\nNode: 'sequential/dense/MatMul'\nAttempting to perform BLAS operation using StreamExecutor without BLAS support\n\t [[{{node sequential/dense/MatMul}}]] [Op:__inference_train_function_727]",
374
+ "output_type": "error",
375
+ "traceback": [
376
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
377
+ "\u001b[0;31mInternalError\u001b[0m Traceback (most recent call last)",
378
+ "Cell \u001b[0;32mIn [12], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# train the model\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train_encoded\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m15\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m32\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalidation_data\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mX_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_test_encoded\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
379
+ "File \u001b[0;32m/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py:67\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e: \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 66\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 69\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
380
+ "File \u001b[0;32m/usr/local/lib/python3.9/dist-packages/tensorflow/python/eager/execute.py:54\u001b[0m, in \u001b[0;36mquick_execute\u001b[0;34m(op_name, num_outputs, inputs, attrs, ctx, name)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m ctx\u001b[38;5;241m.\u001b[39mensure_initialized()\n\u001b[0;32m---> 54\u001b[0m tensors \u001b[38;5;241m=\u001b[39m pywrap_tfe\u001b[38;5;241m.\u001b[39mTFE_Py_Execute(ctx\u001b[38;5;241m.\u001b[39m_handle, device_name, op_name,\n\u001b[1;32m 55\u001b[0m inputs, attrs, num_outputs)\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m core\u001b[38;5;241m.\u001b[39m_NotOkStatusException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
381
+ "\u001b[0;31mInternalError\u001b[0m: Graph execution error:\n\nDetected at node 'sequential/dense/MatMul' defined at (most recent call last):\n File \"/usr/lib/python3.9/runpy.py\", line 197, in _run_module_as_main\n return _run_code(code, main_globals, None,\n File \"/usr/lib/python3.9/runpy.py\", line 87, in _run_code\n exec(code, run_globals)\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel_launcher.py\", line 17, in <module>\n app.launch_new_instance()\n File \"/usr/local/lib/python3.9/dist-packages/traitlets/config/application.py\", line 1041, in launch_instance\n app.start()\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/kernelapp.py\", line 712, in start\n self.io_loop.start()\n File \"/usr/local/lib/python3.9/dist-packages/tornado/platform/asyncio.py\", line 199, in start\n self.asyncio_loop.run_forever()\n File \"/usr/lib/python3.9/asyncio/base_events.py\", line 601, in run_forever\n self._run_once()\n File \"/usr/lib/python3.9/asyncio/base_events.py\", line 1905, in _run_once\n handle._run()\n File \"/usr/lib/python3.9/asyncio/events.py\", line 80, in _run\n self._context.run(self._callback, *self._args)\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py\", line 510, in dispatch_queue\n await self.process_one()\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py\", line 499, in process_one\n await dispatch(*args)\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py\", line 406, in dispatch_shell\n await result\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py\", line 730, in execute_request\n reply_content = await reply_content\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/ipkernel.py\", line 383, in do_execute\n res = shell.run_cell(\n File \"/usr/local/lib/python3.9/dist-packages/ipykernel/zmqshell.py\", line 528, in run_cell\n return super().run_cell(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py\", line 2885, in run_cell\n result = self._run_cell(\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py\", line 2940, in _run_cell\n return runner(coro)\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/async_helpers.py\", line 129, in _pseudo_sync_runner\n coro.send(None)\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py\", line 3139, in run_cell_async\n has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py\", line 3318, in run_ast_nodes\n if await self.run_code(code, result, async_=asy):\n File \"/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py\", line 3378, in run_code\n exec(code_obj, self.user_global_ns, self.user_ns)\n File \"/tmp/ipykernel_517/2691274767.py\", line 2, in <module>\n model.fit(X_train, y_train_encoded, epochs=15, batch_size=32, validation_data=(X_test, y_test_encoded))\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 64, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 1409, in fit\n tmp_logs = self.train_function(iterator)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 1051, in train_function\n return step_function(self, iterator)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 1040, in step_function\n outputs = 
model.distribute_strategy.run(run_step, args=(data,))\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 1030, in run_step\n outputs = model.train_step(data)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 889, in train_step\n y_pred = self(x, training=True)\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 64, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/training.py\", line 490, in __call__\n return super().__call__(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 64, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/base_layer.py\", line 1014, in __call__\n outputs = call_fn(inputs, *args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 92, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/sequential.py\", line 374, in call\n return super(Sequential, self).call(inputs, training=training, mask=mask)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/functional.py\", line 458, in call\n return self._run_internal_graph(\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/functional.py\", line 596, in _run_internal_graph\n outputs = node.layer(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 64, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/engine/base_layer.py\", line 1014, in __call__\n outputs = call_fn(inputs, *args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py\", line 92, in error_handler\n return fn(*args, **kwargs)\n File \"/usr/local/lib/python3.9/dist-packages/keras/layers/core/dense.py\", line 221, in call\n outputs = tf.matmul(a=inputs, b=self.kernel)\nNode: 'sequential/dense/MatMul'\nAttempting to perform BLAS operation using StreamExecutor without BLAS support\n\t [[{{node sequential/dense/MatMul}}]] [Op:__inference_train_function_727]"
382
+ ]
383
+ }
384
+ ],
385
+ "source": [
386
+ "# train the model\n",
387
+ "model.fit(X_train, y_train_encoded, epochs=15, batch_size=32, validation_data=(X_test, y_test_encoded))"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": 48,
393
+ "id": "24e17bec-2fbe-400f-9273-a5abe823f193",
394
+ "metadata": {
395
+ "execution": {
396
+ "iopub.execute_input": "2024-03-31T14:01:20.843282Z",
397
+ "iopub.status.busy": "2024-03-31T14:01:20.842873Z",
398
+ "iopub.status.idle": "2024-03-31T14:01:21.062240Z",
399
+ "shell.execute_reply": "2024-03-31T14:01:21.061522Z",
400
+ "shell.execute_reply.started": "2024-03-31T14:01:20.843253Z"
401
+ }
402
+ },
403
+ "outputs": [
404
+ {
405
+ "name": "stdout",
406
+ "output_type": "stream",
407
+ "text": [
408
+ "63/63 [==============================] - 0s 2ms/step - loss: 0.5242 - accuracy: 0.8580\n"
409
+ ]
410
+ },
411
+ {
412
+ "data": {
413
+ "text/plain": [
414
+ "0.8579999804496765"
415
+ ]
416
+ },
417
+ "execution_count": 48,
418
+ "metadata": {},
419
+ "output_type": "execute_result"
420
+ }
421
+ ],
422
+ "source": [
423
+ "# Evaluate the model\n",
424
+ "loss, accuracy = model.evaluate(X_test, y_test_encoded)\n",
425
+ "accuracy"
426
+ ]
427
+ }
428
+ ],
429
+ "metadata": {
430
+ "kernelspec": {
431
+ "display_name": "Python 3 (ipykernel)",
432
+ "language": "python",
433
+ "name": "python3"
434
+ },
435
+ "language_info": {
436
+ "codemirror_mode": {
437
+ "name": "ipython",
438
+ "version": 3
439
+ },
440
+ "file_extension": ".py",
441
+ "mimetype": "text/x-python",
442
+ "name": "python",
443
+ "nbconvert_exporter": "python",
444
+ "pygments_lexer": "ipython3",
445
+ "version": "3.9.16"
446
+ }
447
+ },
448
+ "nbformat": 4,
449
+ "nbformat_minor": 5
450
+ }
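Editor's note: the training cell in this notebook died with `CUBLAS_STATUS_NOT_INITIALIZED`, which the TensorFlow log itself attributes to the framework pre-allocating GPU memory. A hedged sketch of the usual mitigations, to be run before any model code touches the GPU (both calls are standard `tf.config` APIs):

```python
import tensorflow as tf

# option 1: let TensorFlow grow GPU memory on demand instead of grabbing
# it all at start-up, leaving cuBLAS room to initialize; this must run
# before the GPU is first used in the process
for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)

# option 2: hide the GPU entirely and fall back to CPU training
# tf.config.set_visible_devices([], "GPU")
```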
notes/imsoumyaneel-sentiment_analysis_llama2.ipynb ADDED
@@ -0,0 +1,648 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "id": "2b46cd0f-ae6d-4781-8dab-89df1f880ada",
7
+ "metadata": {
8
+ "execution": {
9
+ "iopub.execute_input": "2024-03-31T16:38:33.012523Z",
10
+ "iopub.status.busy": "2024-03-31T16:38:33.012195Z",
11
+ "iopub.status.idle": "2024-03-31T16:38:41.339903Z",
12
+ "shell.execute_reply": "2024-03-31T16:38:41.339283Z",
13
+ "shell.execute_reply.started": "2024-03-31T16:38:33.012515Z"
14
+ }
15
+ },
16
+ "outputs": [
17
+ {
18
+ "name": "stderr",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "Using custom data configuration imsoumyaneel--sentiment-analysis-llama2-406c8d12ee6e98f7\n"
22
+ ]
23
+ },
24
+ {
25
+ "name": "stdout",
26
+ "output_type": "stream",
27
+ "text": [
28
+ "Downloading and preparing dataset csv/imsoumyaneel--sentiment-analysis-llama2 to /root/.cache/huggingface/datasets/imsoumyaneel___csv/imsoumyaneel--sentiment-analysis-llama2-406c8d12ee6e98f7/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...\n"
29
+ ]
30
+ },
31
+ {
32
+ "data": {
33
+ "application/vnd.jupyter.widget-view+json": {
34
+ "model_id": "1b57dcefac4c41bd8afdc69d582d21ae",
35
+ "version_major": 2,
36
+ "version_minor": 0
37
+ },
38
+ "text/plain": [
39
+ "Downloading data files: 0%| | 0/1 [00:00<?, ?it/s]"
40
+ ]
41
+ },
42
+ "metadata": {},
43
+ "output_type": "display_data"
44
+ },
45
+ {
46
+ "data": {
47
+ "application/vnd.jupyter.widget-view+json": {
48
+ "model_id": "73fbc9718d8540d3a795a552e943aa7f",
49
+ "version_major": 2,
50
+ "version_minor": 0
51
+ },
52
+ "text/plain": [
53
+ "Downloading data: 0%| | 0.00/173M [00:00<?, ?B/s]"
54
+ ]
55
+ },
56
+ "metadata": {},
57
+ "output_type": "display_data"
58
+ },
59
+ {
60
+ "data": {
61
+ "application/vnd.jupyter.widget-view+json": {
62
+ "model_id": "5b85187c04ff42baa213ed0026e2fdd6",
63
+ "version_major": 2,
64
+ "version_minor": 0
65
+ },
66
+ "text/plain": [
67
+ "Extracting data files: 0%| | 0/1 [00:00<?, ?it/s]"
68
+ ]
69
+ },
70
+ "metadata": {},
71
+ "output_type": "display_data"
72
+ },
73
+ {
74
+ "data": {
75
+ "application/vnd.jupyter.widget-view+json": {
76
+ "model_id": "176a9bfaff1b4ff0ae8e6a701054e277",
77
+ "version_major": 2,
78
+ "version_minor": 0
79
+ },
80
+ "text/plain": [
81
+ "0 tables [00:00, ? tables/s]"
82
+ ]
83
+ },
84
+ "metadata": {},
85
+ "output_type": "display_data"
86
+ },
87
+ {
88
+ "name": "stderr",
89
+ "output_type": "stream",
90
+ "text": [
91
+ "/usr/local/lib/python3.9/dist-packages/datasets/download/streaming_download_manager.py:695: FutureWarning: the 'mangle_dupe_cols' keyword is deprecated and will be removed in a future version. Please take steps to stop the use of 'mangle_dupe_cols'\n",
92
+ " return pd.read_csv(xopen(filepath_or_buffer, \"rb\", use_auth_token=use_auth_token), **kwargs)\n"
93
+ ]
94
+ },
95
+ {
96
+ "name": "stdout",
97
+ "output_type": "stream",
98
+ "text": [
99
+ "Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/imsoumyaneel___csv/imsoumyaneel--sentiment-analysis-llama2-406c8d12ee6e98f7/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.\n"
100
+ ]
101
+ },
102
+ {
103
+ "data": {
104
+ "application/vnd.jupyter.widget-view+json": {
105
+ "model_id": "a74c347ebc614136b0a2ad549ef01e3d",
106
+ "version_major": 2,
107
+ "version_minor": 0
108
+ },
109
+ "text/plain": [
110
+ " 0%| | 0/1 [00:00<?, ?it/s]"
111
+ ]
112
+ },
113
+ "metadata": {},
114
+ "output_type": "display_data"
115
+ },
116
+ {
117
+ "data": {
118
+ "text/plain": [
119
+ "DatasetDict({\n",
120
+ " train: Dataset({\n",
121
+ " features: ['sentence', 'label', 'text'],\n",
122
+ " num_rows: 598298\n",
123
+ " })\n",
124
+ "})"
125
+ ]
126
+ },
127
+ "execution_count": 3,
128
+ "metadata": {},
129
+ "output_type": "execute_result"
130
+ }
131
+ ],
132
+ "source": [
133
+ "from datasets import load_dataset\n",
134
+ "\n",
135
+ "# dataset = load_dataset(\"dair-ai/emotion\")\n",
136
+ "dataset = load_dataset(\"imsoumyaneel/sentiment-analysis-llama2\")\n",
137
+ "dataset"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 4,
143
+ "id": "cbdbcb02-bcc5-4928-8eff-e238437f004b",
144
+ "metadata": {
145
+ "execution": {
146
+ "iopub.execute_input": "2024-03-31T16:38:41.341147Z",
147
+ "iopub.status.busy": "2024-03-31T16:38:41.340979Z",
148
+ "iopub.status.idle": "2024-03-31T16:38:42.665731Z",
149
+ "shell.execute_reply": "2024-03-31T16:38:42.665184Z",
150
+ "shell.execute_reply.started": "2024-03-31T16:38:41.341132Z"
151
+ }
152
+ },
153
+ "outputs": [
154
+ {
155
+ "name": "stdout",
156
+ "output_type": "stream",
157
+ "text": [
158
+ "<class 'pandas.core.frame.DataFrame'>\n",
159
+ "RangeIndex: 598298 entries, 0 to 598297\n",
160
+ "Data columns (total 4 columns):\n",
161
+ " # Column Non-Null Count Dtype \n",
162
+ "--- ------ -------------- ----- \n",
163
+ " 0 sentence 598298 non-null object\n",
164
+ " 1 label 598298 non-null object\n",
165
+ " 2 text 598298 non-null object\n",
166
+ " 3 new_label 598298 non-null object\n",
167
+ "dtypes: object(4)\n",
168
+ "memory usage: 18.3+ MB\n",
169
+ "<class 'pandas.core.frame.DataFrame'>\n",
170
+ "Int64Index: 478638 entries, 352227 to 559736\n",
171
+ "Data columns (total 4 columns):\n",
172
+ " # Column Non-Null Count Dtype \n",
173
+ "--- ------ -------------- ----- \n",
174
+ " 0 sentence 478638 non-null object\n",
175
+ " 1 label 478638 non-null object\n",
176
+ " 2 text 478638 non-null object\n",
177
+ " 3 new_label 478638 non-null object\n",
178
+ "dtypes: object(4)\n",
179
+ "memory usage: 18.3+ MB\n",
180
+ "<class 'pandas.core.frame.DataFrame'>\n",
181
+ "Int64Index: 119660 entries, 0 to 598297\n",
182
+ "Data columns (total 4 columns):\n",
183
+ " # Column Non-Null Count Dtype \n",
184
+ "--- ------ -------------- ----- \n",
185
+ " 0 sentence 119660 non-null object\n",
186
+ " 1 label 119660 non-null object\n",
187
+ " 2 text 119660 non-null object\n",
188
+ " 3 new_label 119660 non-null object\n",
189
+ "dtypes: object(4)\n",
190
+ "memory usage: 4.6+ MB\n"
191
+ ]
192
+ },
193
+ {
194
+ "data": {
195
+ "text/html": [
196
+ "<div>\n",
197
+ "<style scoped>\n",
198
+ " .dataframe tbody tr th:only-of-type {\n",
199
+ " vertical-align: middle;\n",
200
+ " }\n",
201
+ "\n",
202
+ " .dataframe tbody tr th {\n",
203
+ " vertical-align: top;\n",
204
+ " }\n",
205
+ "\n",
206
+ " .dataframe thead th {\n",
207
+ " text-align: right;\n",
208
+ " }\n",
209
+ "</style>\n",
210
+ "<table border=\"1\" class=\"dataframe\">\n",
211
+ " <thead>\n",
212
+ " <tr style=\"text-align: right;\">\n",
213
+ " <th></th>\n",
214
+ " <th>sentence</th>\n",
215
+ " <th>label</th>\n",
216
+ " <th>text</th>\n",
217
+ " <th>new_label</th>\n",
218
+ " </tr>\n",
219
+ " </thead>\n",
220
+ " <tbody>\n",
221
+ " <tr>\n",
222
+ " <th>0</th>\n",
223
+ " <td>I'll throw out the garbage .</td>\n",
224
+ " <td>neutral</td>\n",
225
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
226
+ " <td>1</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>1</th>\n",
230
+ " <td>So Dick , how about getting some coffee for to...</td>\n",
231
+ " <td>joy</td>\n",
232
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
233
+ " <td>0</td>\n",
234
+ " </tr>\n",
235
+ " <tr>\n",
236
+ " <th>2</th>\n",
237
+ " <td>Come on , you can at least try a little , besi...</td>\n",
238
+ " <td>neutral</td>\n",
239
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
240
+ " <td>1</td>\n",
241
+ " </tr>\n",
242
+ " <tr>\n",
243
+ " <th>3</th>\n",
244
+ " <td>What ’ s wrong with that ? Cigarette is the th...</td>\n",
245
+ " <td>anger</td>\n",
246
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
247
+ " <td>3</td>\n",
248
+ " </tr>\n",
249
+ " <tr>\n",
250
+ " <th>4</th>\n",
251
+ " <td>Not for me , Dick .</td>\n",
252
+ " <td>neutral</td>\n",
253
+ " <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
254
+ " <td>1</td>\n",
255
+ " </tr>\n",
256
+ " </tbody>\n",
257
+ "</table>\n",
258
+ "</div>"
259
+ ],
260
+ "text/plain": [
261
+ " sentence label \\\n",
262
+ "0 I'll throw out the garbage . neutral \n",
263
+ "1 So Dick , how about getting some coffee for to... joy \n",
264
+ "2 Come on , you can at least try a little , besi... neutral \n",
265
+ "3 What ’ s wrong with that ? Cigarette is the th... anger \n",
266
+ "4 Not for me , Dick . neutral \n",
267
+ "\n",
268
+ " text new_label \n",
269
+ "0 ###Human:\\nyou are a sentiment analist. guess ... 1 \n",
270
+ "1 ###Human:\\nyou are a sentiment analist. guess ... 0 \n",
271
+ "2 ###Human:\\nyou are a sentiment analist. guess ... 1 \n",
272
+ "3 ###Human:\\nyou are a sentiment analist. guess ... 3 \n",
273
+ "4 ###Human:\\nyou are a sentiment analist. guess ... 1 "
274
+ ]
275
+ },
276
+ "execution_count": 4,
277
+ "metadata": {},
278
+ "output_type": "execute_result"
279
+ }
280
+ ],
281
+ "source": [
282
+ "import pandas as pd\n",
283
+ "\n",
284
+ "complete_dataset = dataset['train'].to_pandas()\n",
285
+ "complete_dataset['new_label'] = complete_dataset['label'].map({'joy': '0', 'neutral': '1', 'sadness': '2', 'anger': '3', 'fear': '4', 'love': '5', 'surprise': '6'}).values\n",
286
+ "\n",
287
+ "train_dataset = complete_dataset.sample(frac=0.8,random_state=200)\n",
288
+ "test_dataset = complete_dataset.drop(train_dataset.index)\n",
289
+ "\n",
290
+ "complete_dataset.info()\n",
291
+ "train_dataset.info()\n",
292
+ "test_dataset.info()\n",
293
+ "\n",
294
+ "complete_dataset.head()"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": 5,
300
+ "id": "0cfc926c-70df-4089-9b2e-f201faa223df",
301
+ "metadata": {
302
+ "execution": {
303
+ "iopub.execute_input": "2024-03-31T16:38:42.666938Z",
304
+ "iopub.status.busy": "2024-03-31T16:38:42.666761Z",
305
+ "iopub.status.idle": "2024-03-31T16:38:46.106948Z",
306
+ "shell.execute_reply": "2024-03-31T16:38:46.106267Z",
307
+ "shell.execute_reply.started": "2024-03-31T16:38:42.666926Z"
308
+ }
309
+ },
310
+ "outputs": [],
311
+ "source": [
312
+ "# imports for model creation\n",
313
+ "import tensorflow as tf\n",
314
+ "from keras import layers\n",
315
+ "from keras import losses\n",
316
+ "import keras\n",
317
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
318
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": 6,
324
+ "id": "eb4bab6b-ae99-4fae-bf0b-ca91be630db3",
325
+ "metadata": {
326
+ "execution": {
327
+ "iopub.execute_input": "2024-03-31T16:38:46.108849Z",
328
+ "iopub.status.busy": "2024-03-31T16:38:46.108483Z",
329
+ "iopub.status.idle": "2024-03-31T16:39:01.033885Z",
330
+ "shell.execute_reply": "2024-03-31T16:39:01.033311Z",
331
+ "shell.execute_reply.started": "2024-03-31T16:38:46.108831Z"
332
+ }
333
+ },
334
+ "outputs": [],
335
+ "source": [
336
+ "tokenizer = Tokenizer()\n",
337
+ "tokenizer.fit_on_texts(complete_dataset['sentence'])\n",
338
+ "\n",
339
+ "vocab_size = len(tokenizer.word_index) + 1\n",
340
+ "max_length = 200 # max words in a sentence\n",
341
+ "embedding_dim = 50 # TODO: need to adjust accordinglys\n",
342
+ "\n",
343
+ "X_train = tokenizer.texts_to_sequences(train_dataset['sentence'])\n",
344
+ "X_train = pad_sequences(X_train, maxlen=max_length, padding='post')\n",
345
+ "\n",
346
+ "X_test = tokenizer.texts_to_sequences(test_dataset['sentence'])\n",
347
+ "X_test = pad_sequences(X_test, maxlen=max_length, padding='post')\n",
348
+ "\n",
349
+ "y_train = train_dataset['new_label']\n",
350
+ "y_test = test_dataset['new_label']\n",
351
+ "\n",
352
+ "from keras.utils import to_categorical\n",
353
+ "\n",
354
+ "num_classes = 7 # Assuming you have 3 classes\n",
355
+ "y_train_encoded = to_categorical(y_train, num_classes=num_classes)\n",
356
+ "y_test_encoded = to_categorical(y_test, num_classes=num_classes)"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "code",
361
+ "execution_count": 7,
362
+ "id": "d7202d74-95c7-4bb2-aea5-54481dfcafd6",
363
+ "metadata": {
364
+ "execution": {
365
+ "iopub.execute_input": "2024-03-31T16:39:01.035022Z",
366
+ "iopub.status.busy": "2024-03-31T16:39:01.034846Z",
367
+ "iopub.status.idle": "2024-03-31T16:39:01.038541Z",
368
+ "shell.execute_reply": "2024-03-31T16:39:01.038020Z",
369
+ "shell.execute_reply.started": "2024-03-31T16:39:01.035006Z"
370
+ }
371
+ },
372
+ "outputs": [
373
+ {
374
+ "name": "stdout",
375
+ "output_type": "stream",
376
+ "text": [
377
+ "(478638, 200)\n",
378
+ "(119660, 200)\n",
379
+ "(478638,)\n",
380
+ "(119660,)\n"
381
+ ]
382
+ }
383
+ ],
384
+ "source": [
385
+ "labels = complete_dataset['label']\n",
386
+ "\n",
387
+ "print(X_train.shape)\n",
388
+ "print(X_test.shape)\n",
389
+ "print(y_train.shape)\n",
390
+ "print(y_test.shape)"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 8,
396
+ "id": "738e3137-7ea4-4e71-9395-773e537083cf",
397
+ "metadata": {
398
+ "execution": {
399
+ "iopub.execute_input": "2024-03-31T16:39:01.039687Z",
400
+ "iopub.status.busy": "2024-03-31T16:39:01.039206Z",
401
+ "iopub.status.idle": "2024-03-31T16:39:02.148732Z",
402
+ "shell.execute_reply": "2024-03-31T16:39:02.148025Z",
403
+ "shell.execute_reply.started": "2024-03-31T16:39:01.039671Z"
404
+ }
405
+ },
406
+ "outputs": [],
407
+ "source": [
408
+ "# Build the model\n",
409
+ "model = keras.Sequential([\n",
410
+ " keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),\n",
411
+ " keras.layers.GlobalAveragePooling1D(),\n",
412
+ " keras.layers.Dense(32, activation='relu'),\n",
413
+ " keras.layers.Dense(7, activation='sigmoid')\n",
414
+ "])"
415
+ ]
416
+ },
417
+ {
418
+ "cell_type": "code",
419
+ "execution_count": 9,
420
+ "id": "cd072f39-99e6-44f0-8c7f-106a0055c43b",
421
+ "metadata": {
422
+ "execution": {
423
+ "iopub.execute_input": "2024-03-31T16:39:02.150108Z",
424
+ "iopub.status.busy": "2024-03-31T16:39:02.149634Z",
425
+ "iopub.status.idle": "2024-03-31T16:39:02.159085Z",
426
+ "shell.execute_reply": "2024-03-31T16:39:02.158478Z",
427
+ "shell.execute_reply.started": "2024-03-31T16:39:02.150090Z"
428
+ }
429
+ },
430
+ "outputs": [],
431
+ "source": [
432
+ "# Compile the model\n",
433
+ "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": 10,
439
+ "id": "72ad6548-5d1c-4221-88c7-014dcbaea0ee",
440
+ "metadata": {
441
+ "execution": {
442
+ "iopub.execute_input": "2024-03-31T16:39:02.160402Z",
443
+ "iopub.status.busy": "2024-03-31T16:39:02.159930Z",
444
+ "iopub.status.idle": "2024-03-31T16:39:02.162845Z",
445
+ "shell.execute_reply": "2024-03-31T16:39:02.162259Z",
446
+ "shell.execute_reply.started": "2024-03-31T16:39:02.160382Z"
447
+ }
448
+ },
449
+ "outputs": [],
450
+ "source": [
451
+ "# split the dataset into train and test\n",
452
+ "# from sklearn.model_selection import train_test_split\n",
453
+ "\n",
454
+ "# X_train, X_test, y_train, y_test = train_test_split(, labels, test_size=0.3, random_state=42, shuffle=True)\n",
455
+ "# X_train"
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "code",
460
+ "execution_count": 12,
461
+ "id": "9267da90-7a84-49d1-94d0-04a2cd3062e0",
462
+ "metadata": {
463
+ "execution": {
464
+ "iopub.execute_input": "2024-03-31T16:39:34.346301Z",
465
+ "iopub.status.busy": "2024-03-31T16:39:34.345554Z",
466
+ "iopub.status.idle": "2024-03-31T16:48:40.470989Z",
467
+ "shell.execute_reply": "2024-03-31T16:48:40.470195Z",
468
+ "shell.execute_reply.started": "2024-03-31T16:39:34.346268Z"
469
+ }
470
+ },
471
+ "outputs": [
472
+ {
473
+ "name": "stdout",
474
+ "output_type": "stream",
475
+ "text": [
476
+ "Epoch 1/10\n",
477
+ "14958/14958 [==============================] - 55s 4ms/step - loss: 0.4894 - accuracy: 0.8447 - val_loss: 0.4174 - val_accuracy: 0.8586\n",
478
+ "Epoch 2/10\n",
479
+ "14958/14958 [==============================] - 54s 4ms/step - loss: 0.3798 - accuracy: 0.8692 - val_loss: 0.3835 - val_accuracy: 0.8651\n",
480
+ "Epoch 3/10\n",
481
+ "14958/14958 [==============================] - 54s 4ms/step - loss: 0.3453 - accuracy: 0.8761 - val_loss: 0.3638 - val_accuracy: 0.8655\n",
482
+ "Epoch 4/10\n",
483
+ "14958/14958 [==============================] - 54s 4ms/step - loss: 0.3166 - accuracy: 0.8810 - val_loss: 0.3513 - val_accuracy: 0.8645\n",
484
+ "Epoch 5/10\n",
485
+ "14958/14958 [==============================] - 55s 4ms/step - loss: 0.2941 - accuracy: 0.8848 - val_loss: 0.3548 - val_accuracy: 0.8669\n",
486
+ "Epoch 6/10\n",
487
+ "14958/14958 [==============================] - 54s 4ms/step - loss: 0.2789 - accuracy: 0.8881 - val_loss: 0.3423 - val_accuracy: 0.8654\n",
488
+ "Epoch 7/10\n",
489
+ "14958/14958 [==============================] - 55s 4ms/step - loss: 0.2675 - accuracy: 0.8909 - val_loss: 0.3447 - val_accuracy: 0.8646\n",
490
+ "Epoch 8/10\n",
491
+ "14958/14958 [==============================] - 55s 4ms/step - loss: 0.2590 - accuracy: 0.8937 - val_loss: 0.3418 - val_accuracy: 0.8658\n",
492
+ "Epoch 9/10\n",
493
+ "14958/14958 [==============================] - 55s 4ms/step - loss: 0.2511 - accuracy: 0.8963 - val_loss: 0.3417 - val_accuracy: 0.8636\n",
494
+ "Epoch 10/10\n",
495
+ "14958/14958 [==============================] - 54s 4ms/step - loss: 0.2446 - accuracy: 0.8981 - val_loss: 0.3639 - val_accuracy: 0.8604\n"
496
+ ]
497
+ },
498
+ {
499
+ "data": {
500
+ "text/plain": [
501
+ "<keras.callbacks.History at 0x7f8c901c6280>"
502
+ ]
503
+ },
504
+ "execution_count": 12,
505
+ "metadata": {},
506
+ "output_type": "execute_result"
507
+ }
508
+ ],
509
+ "source": [
510
+ "# train the model\n",
511
+ "model.fit(X_train, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test, y_test_encoded))"
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "code",
516
+ "execution_count": 14,
517
+ "id": "24e17bec-2fbe-400f-9273-a5abe823f193",
518
+ "metadata": {
519
+ "execution": {
520
+ "iopub.execute_input": "2024-03-31T16:57:03.280928Z",
521
+ "iopub.status.busy": "2024-03-31T16:57:03.280193Z",
522
+ "iopub.status.idle": "2024-03-31T16:57:09.194519Z",
523
+ "shell.execute_reply": "2024-03-31T16:57:09.193928Z",
524
+ "shell.execute_reply.started": "2024-03-31T16:57:03.280897Z"
525
+ }
526
+ },
527
+ "outputs": [
528
+ {
529
+ "name": "stdout",
530
+ "output_type": "stream",
531
+ "text": [
532
+ "3740/3740 [==============================] - 6s 2ms/step - loss: 0.3639 - accuracy: 0.8604\n"
533
+ ]
534
+ },
535
+ {
536
+ "data": {
537
+ "text/plain": [
538
+ "0.8604295253753662"
539
+ ]
540
+ },
541
+ "execution_count": 14,
542
+ "metadata": {},
543
+ "output_type": "execute_result"
544
+ }
545
+ ],
546
+ "source": [
547
+ "# Evaluate the model\n",
548
+ "loss, accuracy = model.evaluate(X_test, y_test_encoded)\n",
549
+ "accuracy"
550
+ ]
551
+ },
552
+ {
553
+ "cell_type": "code",
554
+ "execution_count": 16,
555
+ "id": "15fbbb09-ffdf-41d3-ba11-8877aa2c078e",
556
+ "metadata": {
557
+ "execution": {
558
+ "iopub.execute_input": "2024-03-31T17:01:34.387114Z",
559
+ "iopub.status.busy": "2024-03-31T17:01:34.386216Z",
560
+ "iopub.status.idle": "2024-03-31T17:01:34.528910Z",
561
+ "shell.execute_reply": "2024-03-31T17:01:34.528157Z",
562
+ "shell.execute_reply.started": "2024-03-31T17:01:34.387078Z"
563
+ }
564
+ },
565
+ "outputs": [],
566
+ "source": [
567
+ "# save the model\n",
568
+ "import os\n",
569
+ "try:\n",
570
+ " model.save(\"../models/sentimental-analysis-llama2.keras\")\n",
571
+ "except FileNotFoundError:\n",
572
+ " os.mkdir(\"../models\")\n",
573
+ " model.save(\"../models/sentimental-analysis-llama2.keras\")"
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": 22,
579
+ "id": "792f0680-5a32-4c46-b5b4-eb6795b51aeb",
580
+ "metadata": {
581
+ "execution": {
582
+ "iopub.execute_input": "2024-03-31T17:04:41.901658Z",
583
+ "iopub.status.busy": "2024-03-31T17:04:41.901124Z",
584
+ "iopub.status.idle": "2024-03-31T17:04:41.948670Z",
585
+ "shell.execute_reply": "2024-03-31T17:04:41.948177Z",
586
+ "shell.execute_reply.started": "2024-03-31T17:04:41.901637Z"
587
+ }
588
+ },
589
+ "outputs": [
590
+ {
591
+ "name": "stdout",
592
+ "output_type": "stream",
593
+ "text": [
594
+ "1/1 [==============================] - 0s 17ms/step\n"
595
+ ]
596
+ },
597
+ {
598
+ "data": {
599
+ "text/plain": [
600
+ "array([[0.809063 , 0.78246254, 0.02547726, 0.03657908, 0.00648503,\n",
601
+ " 0.02069169, 0.07264358]], dtype=float32)"
602
+ ]
603
+ },
604
+ "execution_count": 22,
605
+ "metadata": {},
606
+ "output_type": "execute_result"
607
+ }
608
+ ],
609
+ "source": [
610
+ "def preprocess_text(text):\n",
611
+ " # Tokenize the text\n",
612
+ " tokenized_text = tokenizer.texts_to_sequences([text])\n",
613
+ " # Pad sequences to the same length as training data\n",
614
+ " padded_text = pad_sequences(tokenized_text, maxlen=max_length, padding='post')\n",
615
+ " return padded_text\n",
616
+ "\n",
617
+ "# Preprocess the custom input text\n",
618
+ "preprocessed_text = preprocess_text(\"this is good\")\n",
619
+ "\n",
620
+ "# Make predictions\n",
621
+ "predictions = model.predict(preprocessed_text)\n",
622
+ "\n",
623
+ "predictions"
624
+ ]
625
+ }
626
+ ],
627
+ "metadata": {
628
+ "kernelspec": {
629
+ "display_name": "Python 3 (ipykernel)",
630
+ "language": "python",
631
+ "name": "python3"
632
+ },
633
+ "language_info": {
634
+ "codemirror_mode": {
635
+ "name": "ipython",
636
+ "version": 3
637
+ },
638
+ "file_extension": ".py",
639
+ "mimetype": "text/x-python",
640
+ "name": "python",
641
+ "nbconvert_exporter": "python",
642
+ "pygments_lexer": "ipython3",
643
+ "version": "3.9.16"
644
+ }
645
+ },
646
+ "nbformat": 4,
647
+ "nbformat_minor": 5
648
+ }
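Editor's note: the model head above uses `Dense(7, activation='sigmoid')` with `categorical_crossentropy`; the conventional pairing for mutually exclusive classes is `softmax`, which is worth flagging even though the notebook trains as-is. The raw prediction vector from the last cell can also be mapped back to a label name. A small sketch, assuming the `predictions` array and the label order used when `new_label` was created:

```python
import numpy as np

# label order mirrors the map applied to complete_dataset['label']:
# joy=0, neutral=1, sadness=2, anger=3, fear=4, love=5, surprise=6
label_names = ["joy", "neutral", "sadness", "anger", "fear", "love", "surprise"]

# pick the highest-scoring class for the single preprocessed sentence
predicted = label_names[int(np.argmax(predictions[0]))]
print(predicted)  # "joy" for the sample output shown above (index 0 is largest)
```

For a proper probability distribution over the 7 classes, the head would be `keras.layers.Dense(7, activation='softmax')`; sigmoid scores each class independently, which is why the sample outputs do not sum to 1.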
notes/model_utilizer.ipynb ADDED
@@ -0,0 +1,84 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 13,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
10
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
11
+ "from tensorflow import keras\n",
12
+ "from pathlib import Path as pp\n",
13
+ "\n",
14
+ "import os\n",
15
+ "notebook_path = os.getcwd()"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 19,
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "ename": "ValueError",
25
+ "evalue": "File not found: filepath=../imsoumyaneel-sentiment_analysis_llama2.keras. Please ensure the file is an accessible `.keras` zip file.",
26
+ "output_type": "error",
27
+ "traceback": [
28
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
29
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
30
+ "Cell \u001b[0;32mIn[19], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m padded_text\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# load model\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mkeras\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m../imsoumyaneel-sentiment_analysis_llama2.keras\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Replace \"your_model.h5\" with the path to your trained model\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;66;03m# Preprocess the custom input text\u001b[39;00m\n\u001b[1;32m 18\u001b[0m preprocessed_text \u001b[38;5;241m=\u001b[39m preprocess_text(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mi love this thing!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
31
+ "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/keras/src/saving/saving_api.py:185\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m(filepath, custom_objects, compile, safe_mode)\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m legacy_h5_format\u001b[38;5;241m.\u001b[39mload_model_from_hdf5(filepath)\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(filepath)\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.keras\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 185\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 186\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile not found: filepath=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilepath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 187\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease ensure the file is an accessible `.keras` \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 188\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mzip file.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 189\u001b[0m )\n\u001b[1;32m 190\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 191\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 192\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile format not supported: filepath=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilepath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mKeras 3 only supports V3 `.keras` files and \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 202\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmight have a different name).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 203\u001b[0m )\n",
32
+ "\u001b[0;31mValueError\u001b[0m: File not found: filepath=../imsoumyaneel-sentiment_analysis_llama2.keras. Please ensure the file is an accessible `.keras` zip file."
33
+ ]
34
+ }
35
+ ],
36
+ "source": [
37
+ "tokenizer = Tokenizer()\n",
38
+ "max_length = 200\n",
39
+ "models_dir = pp(pp(notebook_path).parent, 'models')\n",
40
+ "\n",
41
+ "\n",
42
+ "# use model\n",
43
+ "def preprocess_text(text):\n",
44
+ " # Tokenize the text\n",
45
+ " tokenized_text = tokenizer.texts_to_sequences([text])\n",
46
+ " # Pad sequences to the same length as training data\n",
47
+ " padded_text = pad_sequences(tokenized_text, maxlen=max_length, padding='post')\n",
48
+ " return padded_text\n",
49
+ "\n",
50
+ "# load model\n",
51
+ "model = keras.models.load_model('../imsoumyaneel-sentiment_analysis_llama2.keras') # Replace \"your_model.h5\" with the path to your trained model\n",
52
+ "\n",
53
+ "# Preprocess the custom input text\n",
54
+ "preprocessed_text = preprocess_text(\"i love this thing!\")\n",
55
+ "\n",
56
+ "# Make predictions\n",
57
+ "predictions = model.predict(preprocessed_text)\n",
58
+ "\n",
59
+ "predictions[0][0]"
60
+ ]
61
+ }
62
+ ],
63
+ "metadata": {
64
+ "kernelspec": {
65
+ "display_name": ".venv",
66
+ "language": "python",
67
+ "name": "python3"
68
+ },
69
+ "language_info": {
70
+ "codemirror_mode": {
71
+ "name": "ipython",
72
+ "version": 3
73
+ },
74
+ "file_extension": ".py",
75
+ "mimetype": "text/x-python",
76
+ "name": "python",
77
+ "nbconvert_exporter": "python",
78
+ "pygments_lexer": "ipython3",
79
+ "version": "3.10.12"
80
+ }
81
+ },
82
+ "nbformat": 4,
83
+ "nbformat_minor": 2
84
+ }
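Editor's note: the `ValueError` above comes from hard-coding `../imsoumyaneel-sentiment_analysis_llama2.keras` even though the cell already computes `models_dir`, and the training notebooks save under `../models/sentimental-analysis-llama2.keras`. A sketch of loading via the computed path (the file name is assumed to match whichever model was actually saved):

```python
from pathlib import Path as pp
from tensorflow import keras

# point load_model at the directory the training notebooks write to,
# instead of a hard-coded relative path one level up
model_path = pp(models_dir, "sentimental-analysis-llama2.keras")
model = keras.models.load_model(model_path)
```

Note also that the fresh `Tokenizer()` in this notebook is never fitted, so `texts_to_sequences` will return empty sequences even once the model loads; the tokenizer fitted during training needs to be persisted (e.g. pickled) and reloaded for predictions to be meaningful.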
notes/references/reference_for_model_creation.ipynb ADDED
@@ -0,0 +1,326 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "\n",
8
+ "Person: \n",
9
+ "- my dataset csv has two columns content and category, content is just comments as string and ccategory is the label like positive or nagative\n",
10
+ "\n",
11
+ "i want to create a binary classification sentimental analysis model using tensorflow, \n",
12
+ "i have seperated my dataset into 70-3o train and test dataset\n",
13
+ "how do i tokenize and create/train my model and test it\n",
14
+ "\n",
15
+ "ChatGPT: \n",
16
+ "- To create a binary classification sentiment analysis model using TensorFlow, you can follow these steps: \n",
17
+ "1. **Import Necessary Libraries** : Import TensorFlow, NumPy, and any other required libraries. \n",
18
+ "2. **Load the Dataset** : Load your CSV dataset into memory using libraries like Pandas. \n",
19
+ "3. **Preprocess the Data** : Tokenize the text data, convert it to sequences, pad sequences to make them uniform in length, and encode the labels. \n",
20
+ "4. **Build the Model** : Create a neural network model using TensorFlow's Keras API. \n",
21
+ "5. **Compile the Model** : Specify the loss function, optimizer, and metrics for training. \n",
22
+ "6. **Train the Model** : Fit the model to your training data. \n",
23
+ "7. **Evaluate the Model** : Test the trained model on your test dataset to evaluate its performance.\n",
24
+ "\n",
25
+ "Here's a Python code example to guide you through these steps:\n",
26
+ "\n",
27
+ "```python\n",
28
+ "import pandas as pd\n",
29
+ "import numpy as np\n",
30
+ "import tensorflow as tf\n",
31
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
32
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
33
+ "\n",
34
+ "# Step 2: Load the dataset\n",
35
+ "df = pd.read_csv(\"your_dataset.csv\")\n",
36
+ "\n",
37
+ "# Step 3: Preprocess the data\n",
38
+ "tokenizer = Tokenizer()\n",
39
+ "tokenizer.fit_on_texts(df['content'])\n",
40
+ "\n",
41
+ "vocab_size = len(tokenizer.word_index) + 1\n",
42
+ "max_length = 100 # adjust as needed\n",
43
+ "embedding_dim = 50 # adjust as needed\n",
44
+ "\n",
45
+ "X = tokenizer.texts_to_sequences(df['content'])\n",
46
+ "X = pad_sequences(X, maxlen=max_length, padding='post')\n",
47
+ "\n",
48
+ "# Encode labels\n",
49
+ "labels = pd.get_dummies(df['category'])\n",
50
+ "\n",
51
+ "# Split into train and test datasets\n",
52
+ "from sklearn.model_selection import train_test_split\n",
53
+ "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)\n",
54
+ "\n",
55
+ "# Step 4: Build the model\n",
56
+ "model = tf.keras.Sequential([\n",
57
+ " tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),\n",
58
+ " tf.keras.layers.GlobalAveragePooling1D(),\n",
59
+ " tf.keras.layers.Dense(16, activation='relu'),\n",
60
+ " tf.keras.layers.Dense(1, activation='sigmoid')\n",
61
+ "])\n",
62
+ "\n",
63
+ "# Step 5: Compile the model\n",
64
+ "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
65
+ "\n",
66
+ "# Step 6: Train the model\n",
67
+ "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))\n",
68
+ "\n",
69
+ "# Step 7: Evaluate the model\n",
70
+ "loss, accuracy = model.evaluate(X_test, y_test)\n",
71
+ "print(\"Test Accuracy:\", accuracy)\n",
72
+ "```\n",
73
+ "\n",
74
+ "\n",
75
+ "\n",
76
+ "This code assumes that your labels are already one-hot encoded. Adjust the architecture, hyperparameters, and preprocessing steps as needed for your specific dataset and requirements.\n",
77
+ "\n",
78
+ "Person: \n",
79
+ "- ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(None, 2), output.shape=(None, 1)\n",
80
+ "\n",
81
+ "ChatGPT: \n",
82
+ "- The error you're encountering suggests that there's a mismatch between the shape of the target labels and the output of your model. Since you're performing binary classification, the output layer of your model should have one neuron with a sigmoid activation function. However, your target labels seem to have two columns, which might be causing the issue.\n",
83
+ "\n",
84
+ "To fix this, you need to ensure that your target labels match the shape expected by the model. Since you're performing binary classification, you only need one output neuron, and your target labels should be encoded accordingly.\n",
85
+ "\n",
86
+ "Here's how you can adjust your code to fix the issue:\n",
87
+ "1. Ensure your target labels are encoded as a single column for binary classification.\n",
88
+ "2. Modify the output layer of your model to have one neuron with a sigmoid activation function.\n",
89
+ "3. Use binary cross-entropy as the loss function.\n",
90
+ "\n",
91
+ "Here's the modified code:\n",
92
+ "\n",
93
+ "```python\n",
94
+ "# Step 2: Load the dataset\n",
95
+ "df = pd.read_csv(\"your_dataset.csv\")\n",
96
+ "\n",
97
+ "# Step 3: Preprocess the data\n",
98
+ "tokenizer = Tokenizer()\n",
99
+ "tokenizer.fit_on_texts(df['content'])\n",
100
+ "\n",
101
+ "vocab_size = len(tokenizer.word_index) + 1\n",
102
+ "max_length = 100 # adjust as needed\n",
103
+ "embedding_dim = 50 # adjust as needed\n",
104
+ "\n",
105
+ "X = tokenizer.texts_to_sequences(df['content'])\n",
106
+ "X = pad_sequences(X, maxlen=max_length, padding='post')\n",
107
+ "\n",
108
+ "# Encode labels\n",
109
+ "labels = df['category'].map({'positive': 1, 'negative': 0}) # assuming 'positive' is labeled as 1 and 'negative' as 0\n",
110
+ "\n",
111
+ "# Split into train and test datasets\n",
112
+ "from sklearn.model_selection import train_test_split\n",
113
+ "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)\n",
114
+ "\n",
115
+ "# Step 4: Build the model\n",
116
+ "model = tf.keras.Sequential([\n",
117
+ " tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),\n",
118
+ " tf.keras.layers.GlobalAveragePooling1D(),\n",
119
+ " tf.keras.layers.Dense(16, activation='relu'),\n",
120
+ " tf.keras.layers.Dense(1, activation='sigmoid') # output layer with one neuron for binary classification\n",
121
+ "])\n",
122
+ "\n",
123
+ "# Step 5: Compile the model\n",
124
+ "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # binary cross-entropy for binary classification\n",
125
+ "\n",
126
+ "# Step 6: Train the model\n",
127
+ "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))\n",
128
+ "\n",
129
+ "# Step 7: Evaluate the model\n",
130
+ "loss, accuracy = model.evaluate(X_test, y_test)\n",
131
+ "print(\"Test Accuracy:\", accuracy)\n",
132
+ "```\n",
133
+ "\n",
134
+ "\n",
135
+ "\n",
136
+ "This should resolve the shape mismatch issue and allow you to train and evaluate your binary classification sentiment analysis model.\n",
137
+ "\n",
138
+ "Person: \n",
139
+ "- what do you mean by Ensure your target labels are encoded as a single column for binary classification.\n",
140
+ "\n",
141
+ "ChatGPT: \n",
142
+ "- In binary classification tasks, where you're trying to classify data into two categories (e.g., positive and negative sentiment), it's common practice to encode the target labels using a single column. Each label is represented as either 0 or 1, where 0 typically represents one class (e.g., negative sentiment) and 1 represents the other class (e.g., positive sentiment).\n",
143
+ "\n",
144
+ "For example, if your original dataset looks like this:\n\n| content | category |\n| --- | --- |\n| \"great product\" | positive |\n| \"terrible service\" | negative |\n",
145
+ "\n",
146
+ "You need to encode the 'category' column to convert the labels into numeric values. You might choose to encode 'positive' as 1 and 'negative' as 0, resulting in a single column for the target labels:\n\n| content | category (encoded) |\n| --- | --- |\n| \"great product\" | 1 |\n| \"terrible service\" | 0 |\n",
147
+ "\n",
148
+ "This ensures that each target label is represented as a single number, making it compatible with the output of your neural network model, which typically consists of one neuron for binary classification.\n",
149
+ "\n",
150
+ "In the provided code, the line `labels = df['category'].map({'positive': 1, 'negative': 0})` accomplishes this encoding. It maps 'positive' to 1 and 'negative' to 0, creating a single column of numeric labels suitable for binary classification.\n",
151
+ "\n",
152
+ "Person: \n",
153
+ "- ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).\n",
154
+ "\n",
155
+ "ChatGPT: \n",
156
+ "- The error you're encountering suggests that there's an issue with the data type of your target labels when passing them to TensorFlow. TensorFlow expects the target labels to be in a compatible format, typically as NumPy arrays or TensorFlow tensors.\n",
157
+ "\n",
158
+ "In the provided code, the target labels are converted to a Pandas Series with integer values (0s and 1s). However, TensorFlow expects the target labels to be either NumPy arrays or TensorFlow tensors.\n",
159
+ "\n",
160
+ "To resolve this issue, you can explicitly convert the Pandas Series to a NumPy array before passing it to TensorFlow. Here's how you can do it:\n",
161
+ "\n",
162
+ "```python\n",
163
+ "# Encode labels and convert to NumPy array\n",
164
+ "labels = df['category'].map({'positive': 1, 'negative': 0}).values # Convert to NumPy array\n",
165
+ "\n",
166
+ "# Split into train and test datasets\n",
167
+ "from sklearn.model_selection import train_test_split\n",
168
+ "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)\n",
169
+ "\n",
170
+ "# Step 4: Build the model\n",
171
+ "model = tf.keras.Sequential([\n",
172
+ " tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),\n",
173
+ " tf.keras.layers.GlobalAveragePooling1D(),\n",
174
+ " tf.keras.layers.Dense(16, activation='relu'),\n",
175
+ " tf.keras.layers.Dense(1, activation='sigmoid') # output layer with one neuron for binary classification\n",
176
+ "])\n",
177
+ "\n",
178
+ "# Step 5: Compile the model\n",
179
+ "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # binary cross-entropy for binary classification\n",
180
+ "\n",
181
+ "# Step 6: Train the model\n",
182
+ "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))\n",
183
+ "\n",
184
+ "# Step 7: Evaluate the model\n",
185
+ "loss, accuracy = model.evaluate(X_test, y_test)\n",
186
+ "print(\"Test Accuracy:\", accuracy)\n",
187
+ "```\n",
188
+ "\n",
189
+ "\n",
190
+ "\n",
191
+ "By converting the Pandas Series to a NumPy array with `.values`, you ensure that the target labels are in a compatible format for TensorFlow, which should resolve the ValueError.\n",
192
+ "\n",
193
+ "Person: \n",
194
+ "- ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int). \n",
195
+ "\n",
196
+ "still getting\n",
197
+ "\n",
198
+ "ChatGPT: \n",
199
+ "- The error message \"Failed to convert a NumPy array to a Tensor (Unsupported object type int)\" suggests that TensorFlow is encountering an issue with the data type of your target labels during training or evaluation. This could occur if TensorFlow expects a certain data type for the target labels but receives a different one.\n",
200
+ "\n",
201
+ "To resolve this issue, you can explicitly convert the target labels to a compatible data type for TensorFlow. Typically, TensorFlow expects target labels to be of type `float32` or `int32`.\n",
202
+ "\n",
203
+ "Let's convert the target labels to the appropriate data type before passing them to TensorFlow:\n",
204
+ "\n",
205
+ "```python\n",
206
+ "# Convert target labels to float32\n",
207
+ "labels = df['category'].map({'positive': 1, 'negative': 0}).astype('float32').values\n",
208
+ "\n",
209
+ "# Split into train and test datasets\n",
210
+ "from sklearn.model_selection import train_test_split\n",
211
+ "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)\n",
212
+ "\n",
213
+ "# Step 4: Build the model\n",
214
+ "model = tf.keras.Sequential([\n",
215
+ " tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),\n",
216
+ " tf.keras.layers.GlobalAveragePooling1D(),\n",
217
+ " tf.keras.layers.Dense(16, activation='relu'),\n",
218
+ " tf.keras.layers.Dense(1, activation='sigmoid') # output layer with one neuron for binary classification\n",
219
+ "])\n",
220
+ "\n",
221
+ "# Step 5: Compile the model\n",
222
+ "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # binary cross-entropy for binary classification\n",
223
+ "\n",
224
+ "# Step 6: Train the model\n",
225
+ "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))\n",
226
+ "\n",
227
+ "# Step 7: Evaluate the model\n",
228
+ "loss, accuracy = model.evaluate(X_test, y_test)\n",
229
+ "print(\"Test Accuracy:\", accuracy)\n",
230
+ "```\n",
231
+ "\n",
232
+ "\n",
233
+ "\n",
234
+ "By explicitly converting the target labels to `float32` using `.astype('float32')`, you ensure that they are in a compatible data type for TensorFlow. This should resolve the ValueError you encountered.\n",
235
+ "\n",
236
+ "Person: \n",
237
+ "- how to use the model with custom input text\n",
238
+ "\n",
239
+ "ChatGPT: \n",
240
+ "- Once you've trained your sentiment analysis model, you can use it to predict the sentiment of custom input text. Here's how you can do it:\n",
241
+ "1. Preprocess the custom input text to make it compatible with the model.\n",
242
+ "2. Use the trained model to predict the sentiment of the preprocessed input text.\n",
243
+ "\n",
244
+ "Here's a step-by-step guide on how to achieve this:\n",
245
+ "\n",
246
+ "```python\n",
247
+ "import tensorflow as tf\n",
248
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
249
+ "import numpy as np\n",
250
+ "\n",
251
+ "# Assuming you've already trained and saved your model, load the model\n",
252
+ "model = tf.keras.models.load_model(\"your_model.h5\") # Replace \"your_model.h5\" with the path to your trained model\n",
253
+ "\n",
254
+ "# Function to preprocess input text\n",
255
+ "def preprocess_text(text):\n",
256
+ " # Tokenize the text\n",
257
+ " tokenized_text = tokenizer.texts_to_sequences([text])\n",
258
+ " # Pad sequences to the same length as training data\n",
259
+ " padded_text = pad_sequences(tokenized_text, maxlen=max_length, padding='post')\n",
260
+ " return padded_text\n",
261
+ "\n",
262
+ "# Example custom input text\n",
263
+ "custom_text = \"I really enjoyed the product. It exceeded my expectations!\"\n",
264
+ "\n",
265
+ "# Preprocess the custom input text\n",
266
+ "preprocessed_text = preprocess_text(custom_text)\n",
267
+ "\n",
268
+ "# Make predictions\n",
269
+ "predictions = model.predict(preprocessed_text)\n",
270
+ "\n",
271
+ "# Convert prediction to sentiment label\n",
272
+ "sentiment_label = \"positive\" if predictions[0][0] > 0.5 else \"negative\"\n",
273
+ "\n",
274
+ "print(\"Predicted Sentiment:\", sentiment_label)\n",
275
+ "```\n",
276
+ "\n",
277
+ "\n",
278
+ "\n",
279
+ "In this example: \n",
280
+ "- The `preprocess_text()` function preprocesses the custom input text by tokenizing and padding it so it matches the model's input format. \n",
281
+ "- `model.predict()` is used to make predictions on the preprocessed input text.\n",
282
+ "- The threshold of 0.5 is used to classify the sentiment. If the predicted probability is greater than 0.5, the sentiment is classified as positive; otherwise, it's classified as negative.\n",
283
+ "\n",
284
+ "Replace `\"your_model.h5\"` with the path to your trained model file. You also need to have the `tokenizer`, `max_length`, and any other preprocessing steps used during training available in order to preprocess the custom input text correctly.\n",
285
+ "\n",
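+ "A minimal sketch of one way to keep those available, assuming `tokenizer` is the instance fitted during training (the file name `tokenizer.json` is just an example):\n",
+ "\n",
+ "```python\n",
+ "from tensorflow.keras.preprocessing.text import tokenizer_from_json\n",
+ "\n",
+ "# after training: persist the fitted tokenizer alongside the model\n",
+ "with open(\"tokenizer.json\", \"w\") as f:\n",
+ "    f.write(tokenizer.to_json())\n",
+ "\n",
+ "# at inference time: restore it instead of instantiating a fresh, empty Tokenizer()\n",
+ "with open(\"tokenizer.json\") as f:\n",
+ "    tokenizer = tokenizer_from_json(f.read())\n",
+ "```\n",
+ "\n",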
286
+ "Person: \n",
287
+ "- how to save model\n",
288
+ "\n",
289
+ "ChatGPT: \n",
290
+ "- To save your TensorFlow/Keras model after training, you can use the `save()` method. This method allows you to save the entire model including its architecture, weights, and training configuration in the Hierarchical Data Format (HDF5) file format. Here's how you can save your model:\n",
291
+ "\n",
292
+ "```python\n",
293
+ "# Assuming you've already trained your model\n",
294
+ "# model = ... # your trained model\n",
295
+ "\n",
296
+ "# Save the model\n",
297
+ "model.save(\"your_model.h5\")\n",
298
+ "```\n",
299
+ "\n",
300
+ "\n",
301
+ "\n",
302
+ "This will save your model to a file named `\"your_model.h5\"` in the current directory. You can specify a different directory or path if needed.\n",
303
+ "\n",
304
+ "You can then load your saved model later using `tf.keras.models.load_model()` as shown in the previous example:\n",
305
+ "\n",
306
+ "```python\n",
307
+ "import tensorflow as tf\n",
308
+ "\n",
309
+ "# Load the saved model\n",
310
+ "model = tf.keras.models.load_model(\"your_model.h5\")\n",
311
+ "```\n",
312
+ "\n",
313
+ "\n",
314
+ "\n",
315
+ "This will load the entire model, including its architecture, weights, and training configuration, allowing you to make predictions or further train the model if needed.\n",
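+ "\n",
+ "Note: `.h5` is the legacy HDF5 format; Keras 3 prefers the native `.keras` zip format (the ValueError shown earlier in this repo's `BC_model_base.ipynb` makes the same point). A minimal sketch, assuming `model` is a trained Keras model:\n",
+ "\n",
+ "```python\n",
+ "# save in the native Keras v3 format (a .keras zip archive)\n",
+ "model.save(\"your_model.keras\")\n",
+ "\n",
+ "# reload; the relative path must resolve from the current working directory\n",
+ "model = tf.keras.models.load_model(\"your_model.keras\")\n",
+ "```\n"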
316
+ ]
317
+ }
318
+ ],
319
+ "metadata": {
320
+ "language_info": {
321
+ "name": "python"
322
+ }
323
+ },
324
+ "nbformat": 4,
325
+ "nbformat_minor": 2
326
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ datasets
2
+ keras
3
+ pandas
4
+ tf_keras
5
+ tensorflow
6
+ transformers
8
+ tensorrt
9
+ tensorflow[and-cuda]