danielcd99 AndreMitri commited on
Commit
63de343
1 Parent(s): c1392df

Add RottenTomatos, arrumando tokenizador, arrumando paths (#2)

Browse files

- New test dataset RottenTomatos (38e8124e601981c9365e270f4280be3a1c3c8ad4)
- Fix df path (e8eb8f6f97767645897831d4af668d4d110896f9)
- Fix df path (80007d324957c1861539cf7e02e19a09d587249a)
- Fix dataset, removing NaN (0242ff9ad7ce675cc230718cb5094560340c93c5)
- Add 200 token limit to pipeline (46d3e20480dc07e8b5f7560a567f0d14ed58baa4)
- Metricas de avaliacao do bert (f9d1d3a7273e5ec1316359f57e7bec256bc87807)
- Update file sample of 3000 (8589dd2e12c4325a0b60c6915a49eeaed0eff88c)
- Changed APP dataset from IMDB to ROTTEN TOMATOS (2fc85f6b2905ce73c89f09387ea3bd06fd90078f)
- Remove large files and add to .gitignore (20e262b9fe79e080909f81114b3faf2ab5d672d0)
- Track data/rotten_tomatos.csv with Git LFS (a35354d8c54380430e3b9ee13609f8744953784f)


Co-authored-by: Andre Guarnier De Mitri <AndreMitri@users.noreply.huggingface.co>

.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  data/imdb_reviews.csv filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  data/imdb_reviews.csv filter=lfs diff=lfs merge=lfs -text
37
+ data/rotten_tomatos.csv filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -2,10 +2,10 @@ import streamlit as st
2
  import pandas as pd
3
  from preprocess_data import preprocess_text,get_stopwords
4
  from datasets import load_dataset
5
- from transformers import pipeline
6
  from wordnet import wordnet_pipeline
7
 
8
- dataset = load_dataset('danielcd99/imdb')
9
 
10
  dataframes = {}
11
  for split in dataset.keys():
@@ -17,6 +17,9 @@ for split in dataset.keys():
17
  MODEL_PATH = 'danielcd99/BERT_imdb'
18
 
19
  def load_pipeline():
 
 
 
20
  pipe=pipeline(
21
  "text-classification",
22
  model=MODEL_PATH
 
2
  import pandas as pd
3
  from preprocess_data import preprocess_text,get_stopwords
4
  from datasets import load_dataset
5
+ from transformers import AutoTokenizer, pipeline
6
  from wordnet import wordnet_pipeline
7
 
8
+ dataset = load_dataset('AndreMitri/rotten_tomatos')
9
 
10
  dataframes = {}
11
  for split in dataset.keys():
 
17
  MODEL_PATH = 'danielcd99/BERT_imdb'
18
 
19
  def load_pipeline():
20
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
21
+ tokenizer.model_max_length = 200
22
+
23
  pipe=pipeline(
24
  "text-classification",
25
  model=MODEL_PATH
data/rotten_tomatos.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aab34e6b1357deec1bfca34bcb3b5ad44f56b5064f9222e4e2e60a6e97bd1a5
3
+ size 446463
notebooks_explicativos/Estatistico.ipynb CHANGED
@@ -402,7 +402,7 @@
402
  }
403
  ],
404
  "source": [
405
- "db = pd.read_csv('imdb_reviews.csv')\n",
406
  "db.head(5)"
407
  ]
408
  },
 
402
  }
403
  ],
404
  "source": [
405
+ "db = pd.read_csv('../data/imdb_reviews.csv')\n",
406
  "db.head(5)"
407
  ]
408
  },
notebooks_explicativos/Neural_Bert.ipynb CHANGED
@@ -75,7 +75,7 @@
75
  },
76
  {
77
  "cell_type": "code",
78
- "execution_count": 3,
79
  "metadata": {
80
  "colab": {
81
  "base_uri": "https://localhost:8080/",
@@ -149,13 +149,13 @@
149
  "4 Petter Mattei's \"Love in the Time of Money\" is... positive"
150
  ]
151
  },
152
- "execution_count": 3,
153
  "metadata": {},
154
  "output_type": "execute_result"
155
  }
156
  ],
157
  "source": [
158
- "df_reviews = pd.read_csv('imdb_reviews.csv')\n",
159
  "df_reviews.head()"
160
  ]
161
  },
@@ -538,7 +538,7 @@
538
  },
539
  {
540
  "cell_type": "code",
541
- "execution_count": 10,
542
  "metadata": {
543
  "colab": {
544
  "base_uri": "https://localhost:8080/"
@@ -551,8 +551,8 @@
551
  "name": "stderr",
552
  "output_type": "stream",
553
  "text": [
554
- "c:\\Users\\andre\\1JUPYTER\\dt_labs\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
555
- " from .autonotebook import tqdm as notebook_tqdm\n"
556
  ]
557
  }
558
  ],
@@ -574,7 +574,7 @@
574
  },
575
  {
576
  "cell_type": "code",
577
- "execution_count": 13,
578
  "metadata": {
579
  "id": "LKEjDZCHpk4e"
580
  },
@@ -887,7 +887,6 @@
887
  },
888
  "outputs": [],
889
  "source": [
890
- "\n",
891
  "# Load both accuracy and f1 metrics\n",
892
  "accuracy_metric = evaluate.load(\"accuracy\")\n",
893
  "f1_metric = evaluate.load(\"f1\")\n",
@@ -1175,7 +1174,7 @@
1175
  },
1176
  {
1177
  "cell_type": "code",
1178
- "execution_count": 34,
1179
  "metadata": {
1180
  "id": "lOHVSyfJJ8zK"
1181
  },
@@ -1188,7 +1187,7 @@
1188
  },
1189
  {
1190
  "cell_type": "code",
1191
- "execution_count": 35,
1192
  "metadata": {
1193
  "id": "t-T7hDZ2J1Qk"
1194
  },
@@ -1261,6 +1260,253 @@
1261
  "source": [
1262
  "get_prediction(\"This movie is awesome!\")"
1263
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1264
  }
1265
  ],
1266
  "metadata": {
@@ -1283,7 +1529,7 @@
1283
  "name": "python",
1284
  "nbconvert_exporter": "python",
1285
  "pygments_lexer": "ipython3",
1286
- "version": "3.10.11"
1287
  }
1288
  },
1289
  "nbformat": 4,
 
75
  },
76
  {
77
  "cell_type": "code",
78
+ "execution_count": 25,
79
  "metadata": {
80
  "colab": {
81
  "base_uri": "https://localhost:8080/",
 
149
  "4 Petter Mattei's \"Love in the Time of Money\" is... positive"
150
  ]
151
  },
152
+ "execution_count": 25,
153
  "metadata": {},
154
  "output_type": "execute_result"
155
  }
156
  ],
157
  "source": [
158
+ "df_reviews = pd.read_csv('../data/imdb_reviews.csv')\n",
159
  "df_reviews.head()"
160
  ]
161
  },
 
538
  },
539
  {
540
  "cell_type": "code",
541
+ "execution_count": 22,
542
  "metadata": {
543
  "colab": {
544
  "base_uri": "https://localhost:8080/"
 
551
  "name": "stderr",
552
  "output_type": "stream",
553
  "text": [
554
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
555
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
556
  ]
557
  }
558
  ],
 
574
  },
575
  {
576
  "cell_type": "code",
577
+ "execution_count": 36,
578
  "metadata": {
579
  "id": "LKEjDZCHpk4e"
580
  },
 
887
  },
888
  "outputs": [],
889
  "source": [
 
890
  "# Load both accuracy and f1 metrics\n",
891
  "accuracy_metric = evaluate.load(\"accuracy\")\n",
892
  "f1_metric = evaluate.load(\"f1\")\n",
 
1174
  },
1175
  {
1176
  "cell_type": "code",
1177
+ "execution_count": 27,
1178
  "metadata": {
1179
  "id": "lOHVSyfJJ8zK"
1180
  },
 
1187
  },
1188
  {
1189
  "cell_type": "code",
1190
+ "execution_count": 28,
1191
  "metadata": {
1192
  "id": "t-T7hDZ2J1Qk"
1193
  },
 
1260
  "source": [
1261
  "get_prediction(\"This movie is awesome!\")"
1262
  ]
1263
+ },
1264
+ {
1265
+ "cell_type": "markdown",
1266
+ "metadata": {},
1267
+ "source": [
1268
+ "# Avaliação do modelo em novos dados\n",
1269
+ "Avaliação realizada em outro dataset, as reviews do RottenTomatoes"
1270
+ ]
1271
+ },
1272
+ {
1273
+ "cell_type": "code",
1274
+ "execution_count": 30,
1275
+ "metadata": {},
1276
+ "outputs": [
1277
+ {
1278
+ "name": "stderr",
1279
+ "output_type": "stream",
1280
+ "text": [
1281
+ "[nltk_data] Downloading package stopwords to\n",
1282
+ "[nltk_data] C:\\Users\\andre\\AppData\\Roaming\\nltk_data...\n",
1283
+ "[nltk_data] Package stopwords is already up-to-date!\n"
1284
+ ]
1285
+ },
1286
+ {
1287
+ "data": {
1288
+ "text/html": [
1289
+ "<div>\n",
1290
+ "<style scoped>\n",
1291
+ " .dataframe tbody tr th:only-of-type {\n",
1292
+ " vertical-align: middle;\n",
1293
+ " }\n",
1294
+ "\n",
1295
+ " .dataframe tbody tr th {\n",
1296
+ " vertical-align: top;\n",
1297
+ " }\n",
1298
+ "\n",
1299
+ " .dataframe thead th {\n",
1300
+ " text-align: right;\n",
1301
+ " }\n",
1302
+ "</style>\n",
1303
+ "<table border=\"1\" class=\"dataframe\">\n",
1304
+ " <thead>\n",
1305
+ " <tr style=\"text-align: right;\">\n",
1306
+ " <th></th>\n",
1307
+ " <th>review</th>\n",
1308
+ " <th>sentiment</th>\n",
1309
+ " <th>bert_results</th>\n",
1310
+ " </tr>\n",
1311
+ " </thead>\n",
1312
+ " <tbody>\n",
1313
+ " <tr>\n",
1314
+ " <th>651</th>\n",
1315
+ " <td>The film is content as it is to run clever one...</td>\n",
1316
+ " <td>negative</td>\n",
1317
+ " <td>Positive</td>\n",
1318
+ " </tr>\n",
1319
+ " <tr>\n",
1320
+ " <th>2205</th>\n",
1321
+ " <td>&amp;#91;Has&amp;#93; a surprising and somewhat disapp...</td>\n",
1322
+ " <td>negative</td>\n",
1323
+ " <td>Positive</td>\n",
1324
+ " </tr>\n",
1325
+ " <tr>\n",
1326
+ " <th>362</th>\n",
1327
+ " <td>Absurdly over-rated...</td>\n",
1328
+ " <td>negative</td>\n",
1329
+ " <td>Negative</td>\n",
1330
+ " </tr>\n",
1331
+ " <tr>\n",
1332
+ " <th>2784</th>\n",
1333
+ " <td>A rare bird, not because of what it's like but...</td>\n",
1334
+ " <td>negative</td>\n",
1335
+ " <td>Positive</td>\n",
1336
+ " </tr>\n",
1337
+ " <tr>\n",
1338
+ " <th>1914</th>\n",
1339
+ " <td>Lord of Illusions is also quite repulsive, as ...</td>\n",
1340
+ " <td>negative</td>\n",
1341
+ " <td>Positive</td>\n",
1342
+ " </tr>\n",
1343
+ " <tr>\n",
1344
+ " <th>...</th>\n",
1345
+ " <td>...</td>\n",
1346
+ " <td>...</td>\n",
1347
+ " <td>...</td>\n",
1348
+ " </tr>\n",
1349
+ " <tr>\n",
1350
+ " <th>2230</th>\n",
1351
+ " <td>The movie is completely innocuous, passably en...</td>\n",
1352
+ " <td>negative</td>\n",
1353
+ " <td>Positive</td>\n",
1354
+ " </tr>\n",
1355
+ " <tr>\n",
1356
+ " <th>2354</th>\n",
1357
+ " <td>A mud-simple horror trudge set in a swamp colo...</td>\n",
1358
+ " <td>negative</td>\n",
1359
+ " <td>Negative</td>\n",
1360
+ " </tr>\n",
1361
+ " <tr>\n",
1362
+ " <th>2404</th>\n",
1363
+ " <td>Just plain generic.</td>\n",
1364
+ " <td>negative</td>\n",
1365
+ " <td>Negative</td>\n",
1366
+ " </tr>\n",
1367
+ " <tr>\n",
1368
+ " <th>720</th>\n",
1369
+ " <td>Ulmer brings an enormous amount of impressioni...</td>\n",
1370
+ " <td>positive</td>\n",
1371
+ " <td>Negative</td>\n",
1372
+ " </tr>\n",
1373
+ " <tr>\n",
1374
+ " <th>527</th>\n",
1375
+ " <td>In their directorial debut, Britt Poulton and ...</td>\n",
1376
+ " <td>negative</td>\n",
1377
+ " <td>Negative</td>\n",
1378
+ " </tr>\n",
1379
+ " </tbody>\n",
1380
+ "</table>\n",
1381
+ "<p>3000 rows × 3 columns</p>\n",
1382
+ "</div>"
1383
+ ],
1384
+ "text/plain": [
1385
+ " review sentiment bert_results\n",
1386
+ "651 The film is content as it is to run clever one... negative Positive\n",
1387
+ "2205 &#91;Has&#93; a surprising and somewhat disapp... negative Positive\n",
1388
+ "362 Absurdly over-rated... negative Negative\n",
1389
+ "2784 A rare bird, not because of what it's like but... negative Positive\n",
1390
+ "1914 Lord of Illusions is also quite repulsive, as ... negative Positive\n",
1391
+ "... ... ... ...\n",
1392
+ "2230 The movie is completely innocuous, passably en... negative Positive\n",
1393
+ "2354 A mud-simple horror trudge set in a swamp colo... negative Negative\n",
1394
+ "2404 Just plain generic. negative Negative\n",
1395
+ "720 Ulmer brings an enormous amount of impressioni... positive Negative\n",
1396
+ "527 In their directorial debut, Britt Poulton and ... negative Negative\n",
1397
+ "\n",
1398
+ "[3000 rows x 3 columns]"
1399
+ ]
1400
+ },
1401
+ "execution_count": 30,
1402
+ "metadata": {},
1403
+ "output_type": "execute_result"
1404
+ }
1405
+ ],
1406
+ "source": [
1407
+ "import pandas as pd\n",
1408
+ "from preprocess_data import preprocess_text,get_stopwords\n",
1409
+ "from transformers import AutoTokenizer, pipeline\n",
1410
+ "\n",
1411
+ "df = pd.read_csv('../data/rotten_tomatos.csv')\n",
1412
+ "\n",
1413
+ "MODEL_PATH = 'danielcd99/BERT_imdb'\n",
1414
+ "\n",
1415
+ "def load_pipeline():\n",
1416
+ " tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)\n",
1417
+ " tokenizer.model_max_length = 200\n",
1418
+ "\n",
1419
+ " pipe=pipeline(\n",
1420
+ " \"text-classification\",\n",
1421
+ " model=MODEL_PATH\n",
1422
+ " )\n",
1423
+ " return pipe\n",
1424
+ "\n",
1425
+ "pipe = load_pipeline()\n",
1426
+ "get_stopwords()\n",
1427
+ "df['preprocessed_review'] = df['review'].copy()\n",
1428
+ "df['preprocessed_review'] = df['preprocessed_review'].apply(preprocess_text)\n",
1429
+ " \n",
1430
+ "predictions = []\n",
1431
+ "for review in df['preprocessed_review']:\n",
1432
+ " try:\n",
1433
+ " label = pipe(review)[0]['label']\n",
1434
+ " except:\n",
1435
+ " print(\"Ocorreu um erro de carregamento, tente novamente!\")\n",
1436
+ " \n",
1437
+ " if label == 'LABEL_0':\n",
1438
+ " predictions.append(0)\n",
1439
+ " else:\n",
1440
+ " predictions.append(1)\n",
1441
+ "\n",
1442
+ "df['bert_results'] = predictions\n",
1443
+ "\n",
1444
+ "cols = ['review','sentiment', 'bert_results']\n",
1445
+ "df = df[cols]\n",
1446
+ "df"
1447
+ ]
1448
+ },
1449
+ {
1450
+ "cell_type": "code",
1451
+ "execution_count": 31,
1452
+ "metadata": {},
1453
+ "outputs": [
1454
+ {
1455
+ "name": "stdout",
1456
+ "output_type": "stream",
1457
+ "text": [
1458
+ "Precision: 0.8066\n",
1459
+ "Recall: 0.8449\n",
1460
+ "F1 Score: 0.8253\n"
1461
+ ]
1462
+ },
1463
+ {
1464
+ "data": {
1465
+ "image/png": "",
1466
+ "text/plain": [
1467
+ "<Figure size 800x600 with 2 Axes>"
1468
+ ]
1469
+ },
1470
+ "metadata": {},
1471
+ "output_type": "display_data"
1472
+ }
1473
+ ],
1474
+ "source": [
1475
+ "from sklearn.metrics import confusion_matrix, precision_recall_fscore_support\n",
1476
+ "import matplotlib.pyplot as plt\n",
1477
+ "import seaborn as sns\n",
1478
+ "\n",
1479
+ "# Mapear 'Positive' para 1 e 'Negative' para 0 em 'sentiment'\n",
1480
+ "df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})\n",
1481
+ "df['bert_results'] = df['bert_results'].map({'Positive': 1, 'Negative': 0})\n",
1482
+ "\n",
1483
+ "# Calcular métricas de avaliação: precision, recall, f1-score\n",
1484
+ "precision, recall, f1_score, _ = precision_recall_fscore_support(df['sentiment'], df['bert_results'], average='binary')\n",
1485
+ "\n",
1486
+ "print(f\"Precision: {precision:.4f}\")\n",
1487
+ "print(f\"Recall: {recall:.4f}\")\n",
1488
+ "print(f\"F1 Score: {f1_score:.4f}\")\n",
1489
+ "\n",
1490
+ "# Calcular e plotar a matriz de confusão\n",
1491
+ "cm = confusion_matrix(df['sentiment'], df['bert_results'])\n",
1492
+ "plt.figure(figsize=(8, 6))\n",
1493
+ "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])\n",
1494
+ "plt.xlabel('Predicted')\n",
1495
+ "plt.ylabel('True')\n",
1496
+ "plt.title('Confusion Matrix')\n",
1497
+ "plt.show()"
1498
+ ]
1499
+ },
1500
+ {
1501
+ "cell_type": "markdown",
1502
+ "metadata": {},
1503
+ "source": [
1504
+ "Após ajustar o modelo BERT utilizando a base de dados do IMDb, avaliamos o modelo nos dados do Rotten Tomatoes e obtivemos as seguintes métricas de desempenho:\n",
1505
+ "\n",
1506
+ "Precision: 0.8066 --- Recall: 0.8449 --- F1 Score: 0.8253\n",
1507
+ "\n",
1508
+ "Essas métricas indicam que o modelo BERT, finetunado com dados do IMDb, conseguiu classificar de forma bastante precisa os sentimentos das reviews do Rotten Tomatoes, um dataset que ele não viu durante o treinamento."
1509
+ ]
1510
  }
1511
  ],
1512
  "metadata": {
 
1529
  "name": "python",
1530
  "nbconvert_exporter": "python",
1531
  "pygments_lexer": "ipython3",
1532
+ "version": "3.11.7"
1533
  }
1534
  },
1535
  "nbformat": 4,
notebooks_explicativos/Simbolico.ipynb CHANGED
@@ -465,7 +465,7 @@
465
  }
466
  ],
467
  "source": [
468
- "df = pd.read_csv('imdb.csv')\n",
469
  "df.head(5)"
470
  ]
471
  },
 
465
  }
466
  ],
467
  "source": [
468
+ "df = pd.read_csv('../data/imdb.csv')\n",
469
  "df.head(5)"
470
  ]
471
  },