JThosantad committed on
Commit
fe4dee2
1 Parent(s): 852bf17

Upload 2 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ imdb_reviews.csv filter=lfs diff=lfs merge=lfs -text
WorkshopSentimentsAna-std.ipynb ADDED
@@ -0,0 +1,847 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "c454c018-02b7-4c3d-a21f-411748963a3f",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Workshop: Sentiment Analysis"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "2eda2e01-dfc4-42a6-9b6a-5cdf39fbce78",
14
+ "metadata": {},
15
+ "source": [
16
+ "<div>\n",
17
+ "<img src=\"https://lh3.googleusercontent.com/pw/ADCreHdzakFbNdHwBE1ZrwOiNCQibViWOir9DF9Dv4fbZEdWpx4mzFOT_RxkUGLTyDW7fQ0OwEyNQwqllupbvm0WiU0RNuFs-kWx1fTIvjiSkPGE5m64PilOIeApxQLwX_rl-JU7uYT-ROxdppIsJimCeos=w406-h451-s-no-gm?authuser=0\" width=\"390\"/> \n",
18
+ "</div>"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "7ef9db65-1fda-4fc6-8bb9-bc52bdbb9529",
25
+ "metadata": {
26
+ "tags": []
27
+ },
28
+ "outputs": [],
29
+ "source": [
30
+ "# !pip install nltk\n",
31
+ "# !pip install transformers "
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "id": "1a0b8ed9-f240-47b4-aa62-0cf48bdd7868",
37
+ "metadata": {
38
+ "jp-MarkdownHeadingCollapsed": true,
39
+ "tags": []
40
+ },
41
+ "source": [
42
+ "## Rule-Based Approaches\n",
43
+ "\n",
44
+ "- **Lexicon-Based Methods**: Use sentiment lexicons or dictionaries that contain words annotated with their sentiment polarity (positive, negative, neutral).\n",
45
+ "- **Pattern Matching**: Identify sentiment based on predefined patterns or rules in the text.\n"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "9f7f14b4-60ba-4a92-a9d0-a124e62fe03b",
52
+ "metadata": {
53
+ "tags": []
54
+ },
55
+ "outputs": [],
56
+ "source": [
57
+ "import nltk\n",
58
+ "from nltk.tokenize import word_tokenize\n",
59
+ "from nltk.corpus import stopwords\n",
60
+ "\n",
61
+ "# nltk.download('stopwords')\n",
62
+ "# nltk.download('punkt')"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": null,
68
+ "id": "8a25f60f-f202-49cd-b965-e3ebb1676786",
69
+ "metadata": {
70
+ "tags": []
71
+ },
72
+ "outputs": [],
73
+ "source": [
74
+ "print(stopwords.words('english'))"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "id": "7652d6d2-ba4c-4d02-bfe3-313b6e0f24a5",
81
+ "metadata": {
82
+ "tags": []
83
+ },
84
+ "outputs": [],
85
+ "source": [
86
+ "text = \"I had a good experience with the product. Highly recommended!\""
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "id": "53fc7d50-59fa-4bec-9ae4-b93a1a3847f1",
93
+ "metadata": {
94
+ "tags": []
95
+ },
96
+ "outputs": [],
97
+ "source": [
98
+ "tokens = word_tokenize(text.lower())"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": null,
104
+ "id": "faac761f-912e-44f7-b7b0-626baaea6a56",
105
+ "metadata": {
106
+ "tags": []
107
+ },
108
+ "outputs": [],
109
+ "source": [
110
+ "print(tokens)"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": null,
116
+ "id": "9f6543a2-76f4-4993-b535-f90e50bada72",
117
+ "metadata": {
118
+ "tags": []
119
+ },
120
+ "outputs": [],
121
+ "source": [
122
+ "stop_words = set(stopwords.words('english'))"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "id": "4d7f529d-f006-48db-a092-2262f17cb3cd",
129
+ "metadata": {
130
+ "tags": []
131
+ },
132
+ "outputs": [],
133
+ "source": [
134
+ "tokens = [word for word in tokens if word.isalnum() and word not in stop_words] #alnum = alphanumeric"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "id": "4acfb41c-615d-4e8b-92dc-3f73a4188402",
141
+ "metadata": {
142
+ "tags": []
143
+ },
144
+ "outputs": [],
145
+ "source": [
146
+ "print(tokens)"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": null,
152
+ "id": "c3cfd1cc-3f30-43de-a469-dec0b3816313",
153
+ "metadata": {},
154
+ "outputs": [],
155
+ "source": []
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "id": "aed2ad01-27e5-45e3-a55c-63084966a482",
161
+ "metadata": {
162
+ "tags": []
163
+ },
164
+ "outputs": [],
165
+ "source": [
166
+ "# Sample positive and negative words\n",
167
+ "positive_words = set(['good', 'awesome', 'excellent', 'happy', 'positive'])\n",
168
+ "negative_words = set(['bad', 'terrible', 'poor', 'unhappy', 'negative'])\n",
169
+ "\n",
170
+ "def rule_based_sentiment_analysis(text):\n",
171
+ " # Tokenize the text\n",
172
+ " tokens = word_tokenize(text.lower())\n",
173
+ "\n",
174
+ " # Remove stopwords\n",
175
+ " stop_words = set(stopwords.words('english'))\n",
176
+ " tokens = [word for word in tokens if word.isalnum() and word not in stop_words] #alnum = alphanumeric\n",
177
+ "\n",
178
+ " # Calculate sentiment score\n",
179
+ " sentiment_score = sum(1 for word in tokens if word in positive_words) - sum(1 for word in tokens if word in negative_words)\n",
180
+ "\n",
181
+ " # Classify sentiment\n",
182
+ " if sentiment_score > 0:\n",
183
+ " return 'Positive'\n",
184
+ " elif sentiment_score < 0:\n",
185
+ " return 'Negative'\n",
186
+ " else:\n",
187
+ " return 'Neutral'\n",
188
+ "\n",
189
+ "# Example usage\n",
190
+ "text_to_analyze = \"I had a good experience with the product. Highly recommended!\"\n",
191
+ "sentiment_result = rule_based_sentiment_analysis(text_to_analyze)\n",
192
+ "print(f\"Sentiment: {sentiment_result}\")"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "markdown",
197
+ "id": "21764069-0b07-4b3e-8103-b2ab464a9182",
198
+ "metadata": {
199
+ "tags": []
200
+ },
201
+ "source": [
202
+ "## Machine Learning Approaches"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "markdown",
207
+ "id": "dc739c8a-a453-43d1-bdc5-ad10d823d748",
208
+ "metadata": {
209
+ "jp-MarkdownHeadingCollapsed": true,
210
+ "tags": []
211
+ },
212
+ "source": [
213
+ "### Import packages"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "id": "7e030b97-e111-45ea-b00f-09a360f3400e",
220
+ "metadata": {
221
+ "tags": []
222
+ },
223
+ "outputs": [],
224
+ "source": [
225
+ "import pandas as pd\n",
226
+ "from sklearn.pipeline import Pipeline\n",
227
+ "from sklearn.utils import shuffle\n",
228
+ "from sklearn.model_selection import train_test_split\n",
229
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
230
+ "# from sklearn.svm import SVC\n",
231
+ "from sklearn.naive_bayes import MultinomialNB\n",
232
+ "from sklearn.metrics import classification_report, confusion_matrix\n",
233
+ "\n"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "markdown",
238
+ "id": "54c4fe66-f52f-487f-bfd5-0ea6e05206ce",
239
+ "metadata": {
240
+ "tags": []
241
+ },
242
+ "source": [
243
+ "### TF-IDF vectorizer"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "markdown",
248
+ "id": "3f5b7e92-5de4-4894-b2be-47dac1cf2482",
249
+ "metadata": {},
250
+ "source": [
251
+ "\n",
252
+ "<div>\n",
253
+ "<img src=\"https://www.kdnuggets.com/wp-content/uploads/awan_convert_text_documents_tfidf_matrix_tfidfvectorizer_3.png\" width=\"590\"/> \n",
254
+ "</div>\n",
255
+ "\n",
256
+ "\n",
257
+ "Image source: https://www.kdnuggets.com/2022/09/convert-text-documents-tfidf-matrix-tfidfvectorizer.html\n",
258
+ "\n",
259
+ "\n",
260
+ "\n",
261
+ "\n"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "markdown",
266
+ "id": "9bd125fc-11fd-414a-b8f0-ff7ef628fb94",
267
+ "metadata": {
268
+ "jp-MarkdownHeadingCollapsed": true,
269
+ "tags": []
270
+ },
271
+ "source": [
272
+ "##### Example on Small data"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": null,
278
+ "id": "8a61fdce-6544-4774-bc29-265bf4afaa90",
279
+ "metadata": {
280
+ "tags": []
281
+ },
282
+ "outputs": [],
283
+ "source": [
284
+ "\n",
285
+ "\n",
286
+ "# Sample data\n",
287
+ "documents = [\n",
288
+ " \"This is the first document.\",\n",
289
+ " \"This document is the second document.\",\n",
290
+ " \"And this is the third one.\",\n",
291
+ " \"Is this the first document?\"\n",
292
+ "]"
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "code",
297
+ "execution_count": null,
298
+ "id": "5794027b-2bee-46d9-9b4d-9cbaa7c4120f",
299
+ "metadata": {
300
+ "tags": []
301
+ },
302
+ "outputs": [],
303
+ "source": [
304
+ "# Create a DataFrame for better visualization\n",
305
+ "df = pd.DataFrame({'Text': documents})"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "code",
310
+ "execution_count": null,
311
+ "id": "b49d5272-0383-4e39-910b-87276c4ffca2",
312
+ "metadata": {
313
+ "tags": []
314
+ },
315
+ "outputs": [],
316
+ "source": [
317
+ "# TF-IDF vectorization\n",
318
+ "vectorizer = TfidfVectorizer()\n",
319
+ "tfidf_matrix = vectorizer.fit_transform(df['Text'].tolist())"
320
+ ]
321
+ },
322
+ {
323
+ "cell_type": "code",
324
+ "execution_count": null,
325
+ "id": "46c0b47d-80ab-498b-91a2-7202f1c429fd",
326
+ "metadata": {
327
+ "tags": []
328
+ },
329
+ "outputs": [],
330
+ "source": [
331
+ "# Convert the TF-IDF matrix to a DataFrame\n",
332
+ "tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": null,
338
+ "id": "91c2bee0-5bb6-44b9-a609-1f3d0e891ad4",
339
+ "metadata": {
340
+ "tags": []
341
+ },
342
+ "outputs": [],
343
+ "source": [
344
+ "# Print the original data\n",
345
+ "print(\"Original Data:\")\n",
346
+ "print(df)"
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "code",
351
+ "execution_count": null,
352
+ "id": "24c4a522-8ef4-4001-ada6-031a043b9a54",
353
+ "metadata": {
354
+ "tags": []
355
+ },
356
+ "outputs": [],
357
+ "source": [
358
+ "print(tfidf_matrix)"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": null,
364
+ "id": "6feb5892-284f-43d1-ab7b-5b13dbfadd0b",
365
+ "metadata": {
366
+ "tags": []
367
+ },
368
+ "outputs": [],
369
+ "source": [
370
+ "# Print the TF-IDF matrix\n",
371
+ "print(\"\\nTF-IDF Matrix:\")\n",
372
+ "print(tfidf_df)"
373
+ ]
374
+ },
375
+ {
376
+ "cell_type": "markdown",
377
+ "id": "6802c239-edfa-462e-99ea-31386fd7aed4",
378
+ "metadata": {
379
+ "tags": []
380
+ },
381
+ "source": [
382
+ "## Naive Bayes classifier trained on the TF-IDF features."
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "markdown",
387
+ "id": "3accf6f8-6cae-4265-8d5d-fb5d40a07a2d",
388
+ "metadata": {},
389
+ "source": [
390
+ "<div>\n",
391
+ "<img src=\"fig_bayes-nw.png\" width=\"800\"/> \n",
392
+ "</div>\n"
393
+ ]
394
+ },
395
+ {
396
+ "cell_type": "markdown",
397
+ "id": "9062063a-557b-4971-ad84-e3601b1a520e",
398
+ "metadata": {
399
+ "jp-MarkdownHeadingCollapsed": true,
400
+ "tags": []
401
+ },
402
+ "source": [
403
+ "### Read data/Preparation"
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "execution_count": null,
409
+ "id": "8d2eab09-03c7-441e-9c78-0c2e069f4d25",
410
+ "metadata": {
411
+ "tags": []
412
+ },
413
+ "outputs": [],
414
+ "source": [
415
+ "# df = pd.read_csv(\"Womens_Clothing_E_Commerce_Reviews.csv\")\n",
416
+ "df = pd.read_csv(\"imdb_reviews.csv\")"
417
+ ]
418
+ },
419
+ {
420
+ "cell_type": "code",
421
+ "execution_count": null,
422
+ "id": "aca597f3-c8da-4314-990e-253d5ed719da",
423
+ "metadata": {
424
+ "tags": []
425
+ },
426
+ "outputs": [],
427
+ "source": [
428
+ "df.shape"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "code",
433
+ "execution_count": null,
434
+ "id": "7d8131e4-4a69-45af-aa12-335c926e308f",
435
+ "metadata": {
436
+ "tags": []
437
+ },
438
+ "outputs": [],
439
+ "source": [
440
+ "df.head(3)"
441
+ ]
442
+ },
443
+ {
444
+ "cell_type": "code",
445
+ "execution_count": null,
446
+ "id": "43a27caf-779b-4bd1-a3cf-fa641021172e",
447
+ "metadata": {
448
+ "tags": []
449
+ },
450
+ "outputs": [],
451
+ "source": [
452
+ "df['label'].unique()"
453
+ ]
454
+ },
455
+ {
456
+ "cell_type": "code",
457
+ "execution_count": null,
458
+ "id": "c72dd5ec-59b2-4c7f-a8fb-fdade866984d",
459
+ "metadata": {
460
+ "tags": []
461
+ },
462
+ "outputs": [],
463
+ "source": [
464
+ "df['label'].unique()"
465
+ ]
466
+ },
467
+ {
468
+ "cell_type": "code",
469
+ "execution_count": null,
470
+ "id": "ba556f9b-da1c-4d13-8d70-563e0bd528a1",
471
+ "metadata": {
472
+ "tags": []
473
+ },
474
+ "outputs": [],
475
+ "source": [
476
+ "df.isna().sum()"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "markdown",
481
+ "id": "819c31c3-873d-4d31-a21a-759059bd4c6d",
482
+ "metadata": {
483
+ "jp-MarkdownHeadingCollapsed": true,
484
+ "tags": []
485
+ },
486
+ "source": [
487
+ "### Split the dataset into training and testing sets"
488
+ ]
489
+ },
490
+ {
491
+ "cell_type": "code",
492
+ "execution_count": null,
493
+ "id": "6ca318a2-26d7-446e-8324-6660171f239d",
494
+ "metadata": {
495
+ "tags": []
496
+ },
497
+ "outputs": [],
498
+ "source": [
499
+ "train_data, test_data, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.3, random_state=42)"
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "code",
504
+ "execution_count": null,
505
+ "id": "f0cfc8fc-49e5-4c88-bb33-8084dcf00100",
506
+ "metadata": {
507
+ "tags": []
508
+ },
509
+ "outputs": [],
510
+ "source": [
511
+ "print(train_data)"
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "code",
516
+ "execution_count": null,
517
+ "id": "51d0a415-4982-43dd-8864-c189ba6826f4",
518
+ "metadata": {
519
+ "tags": []
520
+ },
521
+ "outputs": [],
522
+ "source": [
523
+ "print(train_labels)"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "markdown",
528
+ "id": "42987cdb-4cdf-46df-95d8-7c2b2824c1ee",
529
+ "metadata": {
530
+ "jp-MarkdownHeadingCollapsed": true,
531
+ "tags": []
532
+ },
533
+ "source": [
534
+ "### Create a pipeline"
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": null,
540
+ "id": "06ffd548-c333-4c1a-87ce-9699ddd116ee",
541
+ "metadata": {
542
+ "tags": []
543
+ },
544
+ "outputs": [],
545
+ "source": [
546
+ "sentiment_pipeline = Pipeline([\n",
547
+ " ('tfidf', TfidfVectorizer()),\n",
548
+ " ('nb', MultinomialNB())\n",
549
+ "])"
550
+ ]
551
+ },
552
+ {
553
+ "cell_type": "markdown",
554
+ "id": "6bafa7cd-8d0b-4725-bd40-4a3b04634fab",
555
+ "metadata": {
556
+ "jp-MarkdownHeadingCollapsed": true,
557
+ "tags": []
558
+ },
559
+ "source": [
560
+ "### Train the model using the pipeline"
561
+ ]
562
+ },
563
+ {
564
+ "cell_type": "code",
565
+ "execution_count": null,
566
+ "id": "712dea09-52c2-4a9f-8bf9-3cbb273fe4b5",
567
+ "metadata": {
568
+ "tags": []
569
+ },
570
+ "outputs": [],
571
+ "source": [
572
+ "sentiment_pipeline.fit(train_data, train_labels)\n"
573
+ ]
574
+ },
575
+ {
576
+ "cell_type": "markdown",
577
+ "id": "4c95c599-ae0d-433f-9ed5-856fd9fa35e0",
578
+ "metadata": {
579
+ "jp-MarkdownHeadingCollapsed": true,
580
+ "tags": []
581
+ },
582
+ "source": [
583
+ "### Make predictions on the test set"
584
+ ]
585
+ },
586
+ {
587
+ "cell_type": "code",
588
+ "execution_count": null,
589
+ "id": "37ae9eda-4a02-4f40-bdeb-ecb8ea67f9d3",
590
+ "metadata": {
591
+ "tags": []
592
+ },
593
+ "outputs": [],
594
+ "source": [
595
+ "predictions = sentiment_pipeline.predict(test_data)"
596
+ ]
597
+ },
598
+ {
599
+ "cell_type": "markdown",
600
+ "id": "a33458e2-90cb-4c94-b977-8cc8ea5a273e",
601
+ "metadata": {
602
+ "jp-MarkdownHeadingCollapsed": true,
603
+ "tags": []
604
+ },
605
+ "source": [
606
+ "### Evaluate the model"
607
+ ]
608
+ },
609
+ {
610
+ "cell_type": "code",
611
+ "execution_count": null,
612
+ "id": "9ad90567-93d2-4090-81be-5c77f41e379a",
613
+ "metadata": {
614
+ "tags": []
615
+ },
616
+ "outputs": [],
617
+ "source": [
618
+ "\n",
619
+ "report = classification_report(test_labels, predictions)\n",
620
+ "\n",
621
+ "print(\"Classification Report:\\n\", report)"
622
+ ]
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "execution_count": null,
627
+ "id": "ef002e29-d065-4825-a076-3d23fdfa7b59",
628
+ "metadata": {
629
+ "tags": []
630
+ },
631
+ "outputs": [],
632
+ "source": [
633
+ "cm = confusion_matrix(test_labels, predictions)\n",
634
+ "cm"
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "markdown",
639
+ "id": "6e7729bb-a833-4feb-bd2a-b04a2741bd70",
640
+ "metadata": {
641
+ "jp-MarkdownHeadingCollapsed": true,
642
+ "tags": []
643
+ },
644
+ "source": [
645
+ "## Hugging Face: Pre-trained sentiment analysis model\n",
646
+ "\n",
647
+ "https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english"
648
+ ]
649
+ },
650
+ {
651
+ "cell_type": "code",
652
+ "execution_count": null,
653
+ "id": "9afad444-c2cc-4f3d-b49d-07a723be6154",
654
+ "metadata": {
655
+ "tags": []
656
+ },
657
+ "outputs": [],
658
+ "source": [
659
+ "\n",
660
+ "from transformers import pipeline\n",
661
+ "sentiment_analyzer = pipeline('sentiment-analysis', model =\"distilbert-base-uncased-finetuned-sst-2-english\") #, revision =\"af0f99b\")\n",
662
+ "data = [\"I love you\", \"I hate you\"]\n",
663
+ "sentiment_analyzer(data)\n"
664
+ ]
665
+ },
666
+ {
667
+ "cell_type": "code",
668
+ "execution_count": null,
669
+ "id": "4987efd9-8ca8-40b1-90cc-ff361207fb8f",
670
+ "metadata": {
671
+ "tags": []
672
+ },
673
+ "outputs": [],
674
+ "source": [
675
+ "result = sentiment_analyzer(\"I love using this model!\")\n",
676
+ "print(result)"
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "markdown",
681
+ "id": "68436dda-e3c3-499d-b390-60443f9a1796",
682
+ "metadata": {
683
+ "jp-MarkdownHeadingCollapsed": true,
684
+ "tags": []
685
+ },
686
+ "source": [
687
+ "## Hugging Face: Thai sentiment analysis model"
688
+ ]
689
+ },
690
+ {
691
+ "cell_type": "markdown",
692
+ "id": "72a9f8e0-12bf-403b-8b78-e381a65e9eaa",
693
+ "metadata": {},
694
+ "source": [
695
+ "### model=\"poom-sci/WangchanBERTa-finetuned-sentiment\"\n",
696
+ "\n",
697
+ "https://huggingface.co/poom-sci/WangchanBERTa-finetuned-sentiment"
698
+ ]
699
+ },
700
+ {
701
+ "cell_type": "code",
702
+ "execution_count": null,
703
+ "id": "d698825b-3bd7-4370-871f-ac6e5fe5fe47",
704
+ "metadata": {
705
+ "tags": []
706
+ },
707
+ "outputs": [],
708
+ "source": [
709
+ "from transformers import pipeline\n",
710
+ "\n",
711
+ "sentiment_analyzer = pipeline('sentiment-analysis', model=\"poom-sci/WangchanBERTa-finetuned-sentiment\")#, revision=\"b78d071\")\n",
712
+ "\n",
713
+ "data = [\"อร่อยจัดๆ\", \"รอนานแท้\"]\n",
714
+ "sentiment_analyzer(data)\n"
715
+ ]
716
+ },
717
+ {
718
+ "cell_type": "code",
719
+ "execution_count": null,
720
+ "id": "87d815d4-135c-471e-93ee-cacc93653d4e",
721
+ "metadata": {
722
+ "tags": []
723
+ },
724
+ "outputs": [],
725
+ "source": [
726
+ "sentiment_analyzer(\"ข้าวบูด\")"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "code",
731
+ "execution_count": null,
732
+ "id": "60f5c43a-6cb7-47f1-85c5-751e91599ad9",
733
+ "metadata": {},
734
+ "outputs": [],
735
+ "source": []
736
+ },
737
+ {
738
+ "cell_type": "markdown",
739
+ "id": "f894a4bd-1f04-4126-aa8d-e0211b41687e",
740
+ "metadata": {
741
+ "jp-MarkdownHeadingCollapsed": true,
742
+ "tags": []
743
+ },
744
+ "source": [
745
+ "## Deploy on Streamlit Sharing\n",
746
+ "\n",
747
+ "https://share.streamlit.io/ or https://huggingface.co/spaces\n",
748
+ "\n",
749
+ "https://docs.streamlit.io/library/api-reference\n",
750
+ "\n",
751
+ "https://github.com/\n",
752
+ "\n"
753
+ ]
754
+ },
755
+ {
756
+ "cell_type": "code",
757
+ "execution_count": null,
758
+ "id": "dfd5baee-dc74-4f6d-84be-52a2b89d0f28",
759
+ "metadata": {
760
+ "tags": []
761
+ },
762
+ "outputs": [],
763
+ "source": [
764
+ "\n",
765
+ "%%writefile app_senti.py\n",
766
+ "\n",
767
+ "\n",
768
+ "import streamlit as st\n",
769
+ "from transformers import pipeline\n",
770
+ "\n",
771
+ "# Load the sentiment analysis model\n",
772
+ "model_name = \"poom-sci/WangchanBERTa-finetuned-sentiment\"\n",
773
+ "sentiment_analyzer = pipeline('sentiment-analysis', model=model_name)\n",
774
+ "\n",
775
+ "# Streamlit app\n",
776
+ "st.title(\"Thai Sentiment Analysis App\")\n",
777
+ "\n",
778
+ "# Input text\n",
779
+ "text_input = st.text_area(\"Enter Thai text for sentiment analysis\", \"ขอความเห็นหน่อย... \")\n",
780
+ "\n",
781
+ "# Button to trigger analysis\n",
782
+ "if st.button(\"Analyze Sentiment\"):\n",
783
+ " # Analyze sentiment using the model\n",
784
+ " results = sentiment_analyzer([text_input])\n",
785
+ "\n",
786
+ " # Extract sentiment and score\n",
787
+ " sentiment = results[0]['label']\n",
788
+ " score = results[0]['score']\n",
789
+ " \n",
790
+ "\n",
791
+ " # Display result as progress bars\n",
792
+ " st.subheader(\"Sentiment Analysis Result:\")\n",
793
+ "\n",
794
+ " if sentiment == 'pos':\n",
795
+ " st.success(f\"Positive Sentiment (Score: {score:.2f})\")\n",
796
+ " st.progress(score)\n",
797
+ " elif sentiment == 'neg':\n",
798
+ " st.error(f\"Negative Sentiment (Score: {score:.2f})\")\n",
799
+ " st.progress(score)\n",
800
+ " else:\n",
801
+ " st.warning(f\"Neutral Sentiment (Score: {score:.2f})\")\n",
802
+ " st.progress(score)\n"
803
+ ]
804
+ },
805
+ {
806
+ "cell_type": "code",
807
+ "execution_count": null,
808
+ "id": "70111967-b904-4f18-a8d0-0c8701ec35ab",
809
+ "metadata": {},
810
+ "outputs": [],
811
+ "source": [
812
+ "%%writefile requirements.txt\n",
813
+ "transformers\n",
814
+ "torch\n"
815
+ ]
816
+ },
817
+ {
818
+ "cell_type": "code",
819
+ "execution_count": null,
820
+ "id": "88001002-587d-403d-ab65-d060bde9d42d",
821
+ "metadata": {},
822
+ "outputs": [],
823
+ "source": []
824
+ }
825
+ ],
826
+ "metadata": {
827
+ "kernelspec": {
828
+ "display_name": "Python 3 (ipykernel)",
829
+ "language": "python",
830
+ "name": "python3"
831
+ },
832
+ "language_info": {
833
+ "codemirror_mode": {
834
+ "name": "ipython",
835
+ "version": 3
836
+ },
837
+ "file_extension": ".py",
838
+ "mimetype": "text/x-python",
839
+ "name": "python",
840
+ "nbconvert_exporter": "python",
841
+ "pygments_lexer": "ipython3",
842
+ "version": "3.11.3"
843
+ }
844
+ },
845
+ "nbformat": 4,
846
+ "nbformat_minor": 5
847
+ }
imdb_reviews.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94752f1d468e32222c9190c99dc1758a2e81ec6ad5e76528fe4ce31d3edd495c
3
+ size 66262304