{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "id": "xJ3YIFEAxbfz", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "7348ee14-40c9-4afd-a51a-8b48147924c0" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2024-08-18 15:03:31-- https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-train.txt\n", "Resolving alt.qcri.org (alt.qcri.org)... 37.186.61.205\n", "Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:443... connected.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: http://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-train.txt [following]\n", "--2024-08-18 15:03:32-- http://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-train.txt\n", "Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:80... connected.\n", "HTTP request sent, awaiting response... 302 Moved Temporarily\n", "Location: https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-train.txt [following]\n", "--2024-08-18 15:03:32-- https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-train.txt\n", "Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 1502506 (1.4M) [text/plain]\n", "Saving to: ‘OSACT2022-sharedTask-train.txt.2’\n", "\n", "OSACT2022-sharedTas 100%[===================>] 1.43M 1.19MB/s in 1.2s \n", "\n", "2024-08-18 15:03:34 (1.19 MB/s) - ‘OSACT2022-sharedTask-train.txt.2’ saved [1502506/1502506]\n", "\n", "--2024-08-18 15:03:34-- https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-dev.txt\n", "Resolving alt.qcri.org (alt.qcri.org)... 37.186.61.205\n", "Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:443... connected.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: http://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-dev.txt [following]\n", "--2024-08-18 15:03:34-- http://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-dev.txt\n", "Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:80... connected.\n", "HTTP request sent, awaiting response... 302 Moved Temporarily\n", "Location: https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-dev.txt [following]\n", "--2024-08-18 15:03:34-- https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-dev.txt\n", "Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 211355 (206K) [text/plain]\n", "Saving to: ‘OSACT2022-sharedTask-dev.txt.2’\n", "\n", "OSACT2022-sharedTas 100%[===================>] 206.40K 626KB/s in 0.3s \n", "\n", "2024-08-18 15:03:35 (626 KB/s) - ‘OSACT2022-sharedTask-dev.txt.2’ saved [211355/211355]\n", "\n", "--2024-08-18 15:03:36-- https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-test-tweets.txt\n", "Resolving alt.qcri.org (alt.qcri.org)... 37.186.61.205\n", "Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:443... connected.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: http://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-test-tweets.txt [following]\n", "--2024-08-18 15:03:36-- http://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-test-tweets.txt\n", "Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:80... connected.\n", "HTTP request sent, awaiting response... 
302 Moved Temporarily\n", "Location: https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-test-tweets.txt [following]\n", "--2024-08-18 15:03:36-- https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-test-tweets.txt\n", "Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 348638 (340K) [text/plain]\n", "Saving to: ‘OSACT2022-sharedTask-test-tweets.txt.2’\n", "\n", "OSACT2022-sharedTas 100%[===================>] 340.47K 705KB/s in 0.5s \n", "\n", "2024-08-18 15:03:37 (705 KB/s) - ‘OSACT2022-sharedTask-test-tweets.txt.2’ saved [348638/348638]\n", "\n", "--2024-08-18 15:03:37-- https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-test-taskA-gold-labels.txt\n", "Resolving alt.qcri.org (alt.qcri.org)... 37.186.61.205\n", "Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 16780 (16K) [text/plain]\n", "Saving to: ‘OSACT2022-sharedTask-test-taskA-gold-labels.txt.2’\n", "\n", "OSACT2022-sharedTas 100%[===================>] 16.39K --.-KB/s in 0s \n", "\n", "2024-08-18 15:03:38 (174 MB/s) - ‘OSACT2022-sharedTask-test-taskA-gold-labels.txt.2’ saved [16780/16780]\n", "\n" ] } ], "source": [ "!wget \"https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-train.txt\"\n", "!wget \"https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-dev.txt\"\n", "!wget \"https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-test-tweets.txt\"\n", "!wget \"https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-test-taskA-gold-labels.txt\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "7ltyuVxxzGRT", "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "a8d27b74-d188-4bb6-bf8b-070f4ab2d7b8" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 1 @USER ردينا ع التطنز 😏👊🏻 OFF \\\n", "0 2 وصارت فطاير البقالات غذاء صحي 👎🏻 URL NOT_OFF \n", "1 3 @USER روحي لبريده تلقين اشباه كثير بس ماحد زيك... OFF \n", "2 4 @USER مش باين حاجه خالص 😣مش عارف بقى 😔 NOT_OFF \n", "3 5 #اليوم_الاثنين👏 يقولك :%90 من المسلمي... NOT_OFF \n", "4 6 حمدلله ماحطها في فمي اساسا😷🤢 URL NOT_OFF \n", "... ... ... ... \n", "8881 8883 @USER الله يلعنهم 🤢 OFF \n", "8882 8884 واحد سال زوجته بعد كم سنة زواج:- حبيبتي كم... NOT_OFF \n", "8883 8885 @USER يالله روح زي الشاطر واحذف الشو الي سويته... OFF \n", "8884 8886 لـمـا الـكـلاب تـهـوهـو عـلـيـك🐶🐶 وأنـت_اللـى_... NOT_OFF \n", "8885 8887 #بايع_الكليجاالله ياخذكم ي بنات خلوني ... NOT_OFF \n", "\n", " NOT_HS NOT_VLG NOT_VIO \n", "0 NOT_HS NOT_VLG NOT_VIO \n", "1 NOT_HS NOT_VLG NOT_VIO \n", "2 NOT_HS NOT_VLG NOT_VIO \n", "3 NOT_HS NOT_VLG NOT_VIO \n", "4 NOT_HS NOT_VLG NOT_VIO \n", "... ... ... ... \n", "8881 NOT_HS NOT_VLG NOT_VIO \n", "8882 NOT_HS NOT_VLG NOT_VIO \n", "8883 NOT_HS NOT_VLG NOT_VIO \n", "8884 NOT_HS NOT_VLG NOT_VIO \n", "8885 NOT_HS NOT_VLG NOT_VIO \n", "\n", "[8886 rows x 6 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1@USER ردينا ع التطنز 😏👊🏻OFFNOT_HSNOT_VLGNOT_VIO
02وصارت فطاير البقالات غذاء صحي 👎🏻 URLNOT_OFFNOT_HSNOT_VLGNOT_VIO
13@USER روحي لبريده تلقين اشباه كثير بس ماحد زيك...OFFNOT_HSNOT_VLGNOT_VIO
24@USER مش باين حاجه خالص 😣<LF>مش عارف بقى 😔NOT_OFFNOT_HSNOT_VLGNOT_VIO
35#اليوم_الاثنين<LF><LF>👏 يقولك :%90 من المسلمي...NOT_OFFNOT_HSNOT_VLGNOT_VIO
46حمدلله ماحطها في فمي اساسا😷🤢 URLNOT_OFFNOT_HSNOT_VLGNOT_VIO
.....................
88818883@USER الله يلعنهم 🤢OFFNOT_HSNOT_VLGNOT_VIO
88828884واحد سال زوجته بعد كم سنة زواج:<LF>- حبيبتي كم...NOT_OFFNOT_HSNOT_VLGNOT_VIO
88838885@USER يالله روح زي الشاطر واحذف الشو الي سويته...OFFNOT_HSNOT_VLGNOT_VIO
88848886لـمـا الـكـلاب تـهـوهـو عـلـيـك🐶🐶 وأنـت_اللـى_...NOT_OFFNOT_HSNOT_VLGNOT_VIO
88858887#بايع_الكليجا<LF><LF>الله ياخذكم ي بنات خلوني ...NOT_OFFNOT_HSNOT_VLGNOT_VIO
\n", "

8886 rows × 6 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "train_data", "summary": "{\n \"name\": \"train_data\",\n \"rows\": 8886,\n \"fields\": [\n {\n \"column\": \"1\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2565,\n \"min\": 2,\n \"max\": 8887,\n \"num_unique_values\": 8886,\n \"samples\": [\n 8738,\n 1890,\n 4986\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"@USER \\u0631\\u062f\\u064a\\u0646\\u0627 \\u0639 \\u0627\\u0644\\u062a\\u0637\\u0646\\u0632 \\ud83d\\ude0f\\ud83d\\udc4a\\ud83c\\udffb\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8886,\n \"samples\": [\n \"@USER \\u0644\\u0627 \\u0648\\u0627\\u0644\\u0644\\u0647 \\u0645\\u0627 \\u0623\\u0631\\u0641\\u0636 \\u060c \\u0628\\u0633 \\u0644\\u0627\\u0632\\u0645 \\u062a\\u0643\\u0648\\u0646 \\u0648\\u064a\\u0627\\u064a \\u0634\\u0631\\u064a\\u0643\\u0629 \\u0627\\u0644\\u062d\\u064a\\u0627\\u0629 \\ud83d\\ude0c\\u270b\\ud83c\\udffb \\u0639\\u0634\\u0627\\u0646 \\u0646\\u0635\\u064a\\u0631 \\u0637\\u0631\\u0632\\u0627\\u0646 \\u0648\\u0637\\u0631\\u0632\\u0627\\u0646\\u0647 \\ud83d\\udc35\",\n \"\\u0639\\u0646 \\u0634\\u0639\\u0648\\u0631 \\u0623\\u062e\\u062f \\u0645\\u063a\\u0627\\u062f\\u0631\\u0629 \\u0639\\u0627\\u0644\\u0669 \\u0639\\u0634\\u0627\\u0646 \\u0645\\u0648\\u0639\\u062f \\u0627\\u0644\\u062f\\u0643\\u062a\\u0648\\u0631 \\u0639\\u0627\\u0644\\u0669:\\u0663\\u0660 \\u0648 \\u0623\\u0648\\u0635\\u0644 \\u0669:\\u0661\\u0665 \\u0648\\u062a\\u0635\\u064a\\u0631 \\u0661\\u0660:\\u0663\\u0660 \\u0648\\u0644\\u0633\\u0647 \\u0645\\u0627 \\u0627\\u062c\\u0649 \\u0645\\u0648\\u0639\\u062f\\u064a!!!!!!!! \\ud83d\\ude21 URL\",\n \"@USER \\u0644\\u0627 \\u0644\\u0627 \\u0627\\u0644\\u0648\\u0636\\u0639 \\u0632\\u0627\\u062f \\u0639\\u0646 \\u062d\\u062f\\u0647 \\ud83d\\udc4a\\ud83c\\udffc\\ud83c\\udfc3\\ud83c\\udffb\\u200d\\u2640\\ufe0f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"OFF\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"OFF\",\n \"NOT_OFF\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NOT_HS\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"NOT_HS\",\n \"HS2\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NOT_VLG\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"VLG\",\n \"NOT_VLG\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NOT_VIO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"VIO\",\n \"NOT_VIO\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 3 } ], "source": [ "import pandas as pd\n", "import csv\n", "train_data = pd.read_csv(\"OSACT2022-sharedTask-train.txt\", sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", "dev_data = pd.read_csv(\"OSACT2022-sharedTask-dev.txt\", sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", "test_data = pd.read_csv(\"OSACT2022-sharedTask-test-tweets.txt\", sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", "train_data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "F7Jy__azzGVC", "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "1f05a317-dd92-4c3b-c69f-68ff9ce85479" }, "outputs": [ { "output_type": "execute_result", 
"data": { "text/plain": [ " @USER ردينا ع التطنز 😏👊🏻 OFF\n", "0 وصارت فطاير البقالات غذاء صحي 👎🏻 URL NOT_OFF\n", "1 @USER روحي لبريده تلقين اشباه كثير بس ماحد زيك... OFF\n", "2 @USER مش باين حاجه خالص 😣مش عارف بقى 😔 NOT_OFF\n", "3 #اليوم_الاثنين👏 يقولك :%90 من المسلمي... NOT_OFF\n", "4 حمدلله ماحطها في فمي اساسا😷🤢 URL NOT_OFF\n", "... ... ...\n", "8881 @USER الله يلعنهم 🤢 OFF\n", "8882 واحد سال زوجته بعد كم سنة زواج:- حبيبتي كم... NOT_OFF\n", "8883 @USER يالله روح زي الشاطر واحذف الشو الي سويته... OFF\n", "8884 لـمـا الـكـلاب تـهـوهـو عـلـيـك🐶🐶 وأنـت_اللـى_... NOT_OFF\n", "8885 #بايع_الكليجاالله ياخذكم ي بنات خلوني ... NOT_OFF\n", "\n", "[8886 rows x 2 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
@USER ردينا ع التطنز 😏👊🏻OFF
0وصارت فطاير البقالات غذاء صحي 👎🏻 URLNOT_OFF
1@USER روحي لبريده تلقين اشباه كثير بس ماحد زيك...OFF
2@USER مش باين حاجه خالص 😣<LF>مش عارف بقى 😔NOT_OFF
3#اليوم_الاثنين<LF><LF>👏 يقولك :%90 من المسلمي...NOT_OFF
4حمدلله ماحطها في فمي اساسا😷🤢 URLNOT_OFF
.........
8881@USER الله يلعنهم 🤢OFF
8882واحد سال زوجته بعد كم سنة زواج:<LF>- حبيبتي كم...NOT_OFF
8883@USER يالله روح زي الشاطر واحذف الشو الي سويته...OFF
8884لـمـا الـكـلاب تـهـوهـو عـلـيـك🐶🐶 وأنـت_اللـى_...NOT_OFF
8885#بايع_الكليجا<LF><LF>الله ياخذكم ي بنات خلوني ...NOT_OFF
\n", "

8886 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "train_data", "summary": "{\n \"name\": \"train_data\",\n \"rows\": 8886,\n \"fields\": [\n {\n \"column\": \"@USER \\u0631\\u062f\\u064a\\u0646\\u0627 \\u0639 \\u0627\\u0644\\u062a\\u0637\\u0646\\u0632 \\ud83d\\ude0f\\ud83d\\udc4a\\ud83c\\udffb\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8886,\n \"samples\": [\n \"@USER \\u0644\\u0627 \\u0648\\u0627\\u0644\\u0644\\u0647 \\u0645\\u0627 \\u0623\\u0631\\u0641\\u0636 \\u060c \\u0628\\u0633 \\u0644\\u0627\\u0632\\u0645 \\u062a\\u0643\\u0648\\u0646 \\u0648\\u064a\\u0627\\u064a \\u0634\\u0631\\u064a\\u0643\\u0629 \\u0627\\u0644\\u062d\\u064a\\u0627\\u0629 \\ud83d\\ude0c\\u270b\\ud83c\\udffb \\u0639\\u0634\\u0627\\u0646 \\u0646\\u0635\\u064a\\u0631 \\u0637\\u0631\\u0632\\u0627\\u0646 \\u0648\\u0637\\u0631\\u0632\\u0627\\u0646\\u0647 \\ud83d\\udc35\",\n \"\\u0639\\u0646 \\u0634\\u0639\\u0648\\u0631 \\u0623\\u062e\\u062f \\u0645\\u063a\\u0627\\u062f\\u0631\\u0629 \\u0639\\u0627\\u0644\\u0669 \\u0639\\u0634\\u0627\\u0646 \\u0645\\u0648\\u0639\\u062f \\u0627\\u0644\\u062f\\u0643\\u062a\\u0648\\u0631 \\u0639\\u0627\\u0644\\u0669:\\u0663\\u0660 \\u0648 \\u0623\\u0648\\u0635\\u0644 \\u0669:\\u0661\\u0665 \\u0648\\u062a\\u0635\\u064a\\u0631 \\u0661\\u0660:\\u0663\\u0660 \\u0648\\u0644\\u0633\\u0647 \\u0645\\u0627 \\u0627\\u062c\\u0649 \\u0645\\u0648\\u0639\\u062f\\u064a!!!!!!!! \\ud83d\\ude21 URL\",\n \"@USER \\u0644\\u0627 \\u0644\\u0627 \\u0627\\u0644\\u0648\\u0636\\u0639 \\u0632\\u0627\\u062f \\u0639\\u0646 \\u062d\\u062f\\u0647 \\ud83d\\udc4a\\ud83c\\udffc\\ud83c\\udfc3\\ud83c\\udffb\\u200d\\u2640\\ufe0f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"OFF\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"OFF\",\n \"NOT_OFF\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 4 } ], "source": [ "train_data = train_data.drop(columns=['1', 'NOT_HS', 'NOT_VLG' , 'NOT_VIO'])\n", "train_data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "crd119tuzGZx", "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "9ed78838-a108-497d-c635-5745a7b6e087" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Text label\n", "0 وصارت فطاير البقالات غذاء صحي 👎🏻 URL NOT_OFF\n", "1 @USER روحي لبريده تلقين اشباه كثير بس ماحد زيك... OFF\n", "2 @USER مش باين حاجه خالص 😣مش عارف بقى 😔 NOT_OFF\n", "3 #اليوم_الاثنين👏 يقولك :%90 من المسلمي... NOT_OFF\n", "4 حمدلله ماحطها في فمي اساسا😷🤢 URL NOT_OFF\n", "... ... ...\n", "8881 @USER الله يلعنهم 🤢 OFF\n", "8882 واحد سال زوجته بعد كم سنة زواج:- حبيبتي كم... NOT_OFF\n", "8883 @USER يالله روح زي الشاطر واحذف الشو الي سويته... OFF\n", "8884 لـمـا الـكـلاب تـهـوهـو عـلـيـك🐶🐶 وأنـت_اللـى_... NOT_OFF\n", "8885 #بايع_الكليجاالله ياخذكم ي بنات خلوني ... NOT_OFF\n", "\n", "[8886 rows x 2 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlabel
0وصارت فطاير البقالات غذاء صحي 👎🏻 URLNOT_OFF
1@USER روحي لبريده تلقين اشباه كثير بس ماحد زيك...OFF
2@USER مش باين حاجه خالص 😣<LF>مش عارف بقى 😔NOT_OFF
3#اليوم_الاثنين<LF><LF>👏 يقولك :%90 من المسلمي...NOT_OFF
4حمدلله ماحطها في فمي اساسا😷🤢 URLNOT_OFF
.........
8881@USER الله يلعنهم 🤢OFF
8882واحد سال زوجته بعد كم سنة زواج:<LF>- حبيبتي كم...NOT_OFF
8883@USER يالله روح زي الشاطر واحذف الشو الي سويته...OFF
8884لـمـا الـكـلاب تـهـوهـو عـلـيـك🐶🐶 وأنـت_اللـى_...NOT_OFF
8885#بايع_الكليجا<LF><LF>الله ياخذكم ي بنات خلوني ...NOT_OFF
\n", "

8886 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "train_data", "summary": "{\n \"name\": \"train_data\",\n \"rows\": 8886,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8886,\n \"samples\": [\n \"@USER \\u0644\\u0627 \\u0648\\u0627\\u0644\\u0644\\u0647 \\u0645\\u0627 \\u0623\\u0631\\u0641\\u0636 \\u060c \\u0628\\u0633 \\u0644\\u0627\\u0632\\u0645 \\u062a\\u0643\\u0648\\u0646 \\u0648\\u064a\\u0627\\u064a \\u0634\\u0631\\u064a\\u0643\\u0629 \\u0627\\u0644\\u062d\\u064a\\u0627\\u0629 \\ud83d\\ude0c\\u270b\\ud83c\\udffb \\u0639\\u0634\\u0627\\u0646 \\u0646\\u0635\\u064a\\u0631 \\u0637\\u0631\\u0632\\u0627\\u0646 \\u0648\\u0637\\u0631\\u0632\\u0627\\u0646\\u0647 \\ud83d\\udc35\",\n \"\\u0639\\u0646 \\u0634\\u0639\\u0648\\u0631 \\u0623\\u062e\\u062f \\u0645\\u063a\\u0627\\u062f\\u0631\\u0629 \\u0639\\u0627\\u0644\\u0669 \\u0639\\u0634\\u0627\\u0646 \\u0645\\u0648\\u0639\\u062f \\u0627\\u0644\\u062f\\u0643\\u062a\\u0648\\u0631 \\u0639\\u0627\\u0644\\u0669:\\u0663\\u0660 \\u0648 \\u0623\\u0648\\u0635\\u0644 \\u0669:\\u0661\\u0665 \\u0648\\u062a\\u0635\\u064a\\u0631 \\u0661\\u0660:\\u0663\\u0660 \\u0648\\u0644\\u0633\\u0647 \\u0645\\u0627 \\u0627\\u062c\\u0649 \\u0645\\u0648\\u0639\\u062f\\u064a!!!!!!!! \\ud83d\\ude21 URL\",\n \"@USER \\u0644\\u0627 \\u0644\\u0627 \\u0627\\u0644\\u0648\\u0636\\u0639 \\u0632\\u0627\\u062f \\u0639\\u0646 \\u062d\\u062f\\u0647 \\ud83d\\udc4a\\ud83c\\udffc\\ud83c\\udfc3\\ud83c\\udffb\\u200d\\u2640\\ufe0f\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"OFF\",\n \"NOT_OFF\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 5 } ], "source": [ "train_data = train_data.rename(columns={\"@USER ردينا ع التطنز 😏👊🏻\": \"Text\"})\n", "train_data = train_data.rename(columns={\"OFF\": \"label\"})\n", "train_data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "SakRj7VGzc6t", "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "05f6082b-f0cc-4879-df68-32e1c70e0902" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 8888 @USER افطرت عليك بعقاء واثنين من فروخها الجن 🔪😂 NOT_OFF \\\n", "0 8889 #داليا_مباركمادري ليش تقرفت😷 NOT_OFF \n", "1 8890 RT @USER ابديت السناب الجديد ❌حاس الناس ح... NOT_OFF \n", "2 8891 @USER هييه والله وااايدد 🔪🔪🔪💔💔 NOT_OFF \n", "3 8892 اكيد اخس شي 😤 URL NOT_OFF \n", "4 8893 مابي شي الحين غير فراشي😣 NOT_OFF \n", "... ... ... ... \n", "1264 10153 @USER روما محظوظين بذا المدرب بيروتي يسحب في ر... NOT_OFF \n", "1265 10154 @USER هلا والله بالحبهلا لولو❤️😙...\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
8888@USER افطرت عليك بعقاء واثنين من فروخها الجن 🔪😂NOT_OFFNOT_HSNOT_VLGNOT_VIO
08889#داليا_مبارك<LF>مادري ليش تقرفت😷NOT_OFFNOT_HSNOT_VLGNOT_VIO
18890RT @USER ابديت السناب الجديد ❌<LF>حاس الناس ح...NOT_OFFNOT_HSNOT_VLGNOT_VIO
28891@USER هييه والله وااايدد 🔪🔪🔪💔💔NOT_OFFNOT_HSNOT_VLGNOT_VIO
38892اكيد اخس شي 😤 URLNOT_OFFNOT_HSNOT_VLGNOT_VIO
48893مابي شي الحين غير فراشي😣NOT_OFFNOT_HSNOT_VLGNOT_VIO
.....................
126410153@USER روما محظوظين بذا المدرب بيروتي يسحب في ر...NOT_OFFNOT_HSNOT_VLGNOT_VIO
126510154@USER هلا والله بالحب<LF>هلا لولو❤️😙<LF>...<LF...OFFNOT_HSNOT_VLGNOT_VIO
126610155رينز فاز 😡😡😡😡😡😡😡 يعنني اوه شوفو العرض الأسطوري...OFFNOT_HSNOT_VLGNOT_VIO
126710156@USER ييييع والله شيء يلوع الكبد مريضات الله ي...OFFNOT_HSNOT_VLGNOT_VIO
126810157@USER تحسينها ع كليجه م اكلت شي واضح من الصوت 😷😷NOT_OFFNOT_HSNOT_VLGNOT_VIO
\n", "

1269 rows × 6 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", " \n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "dev_data", "summary": "{\n \"name\": \"dev_data\",\n \"rows\": 1269,\n \"fields\": [\n {\n \"column\": \"8888\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 366,\n \"min\": 8889,\n \"max\": 10157,\n \"num_unique_values\": 1269,\n \"samples\": [\n 10102,\n 9692,\n 9311\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"@USER \\u0627\\u0641\\u0637\\u0631\\u062a \\u0639\\u0644\\u064a\\u0643 \\u0628\\u0639\\u0642\\u0627\\u0621 \\u0648\\u0627\\u062b\\u0646\\u064a\\u0646 \\u0645\\u0646 \\u0641\\u0631\\u0648\\u062e\\u0647\\u0627 \\u0627\\u0644\\u062c\\u0646 \\ud83d\\udd2a\\ud83d\\ude02\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1269,\n \"samples\": [\n \"\\u0644\\u0648 \\u0643\\u0627\\u0646 \\u0627\\u0644\\u0627\\u0646\\u062a\\u0638\\u0627\\u0631 \\u0631\\u062c\\u0644\\u0627\\u064b \\u0644\\u0642\\u062a\\u0644\\u062a\\u0647 \\ud83d\\ude21\",\n \"@USER \\u0639\\u0646\\u062f\\u064a \\u0637\\u0641\\u0644\\u0647 \\u0635\\u063a\\u064a\\u0631\\u0629 \\u0643\\u0644 \\u0645\\u0627 \\u0633\\u0648\\u062a \\u0645\\u0635\\u064a\\u0628\\u0647 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d\\u0631\\u0645\\u062a \\u0627\\u0644\\u0645\\u0641\\u062a\\u0627\\u062d \\u0645\\u0639 \\u0627\\u0644\\u0634\\u0628\\u0627\\u0643\\u060c \\u0644\\u064a\\u0634 \\u064a\\u0627\\u0628\\u0627\\u0628\\u0627 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d\\u0631\\u0645\\u062a \\u0645\\u0639\\u062c\\u0648\\u0646 \\u0627\\u0644\\u0627\\u0633\\u0646\\u0627\\u0646 \\u0644\\u064a\\u0634 \\u064a\\u0627 \\u0628\\u0627\\u0628\\u0627 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d\\u0632\\u0639\\u0644\\u062a \\u0639\\u0644\\u064a \\u0642\\u0627\\u0644\\u062a \\u0644\\u064a \\u0643\\u0644 \\u062a\\u0631\\u0627\\u0628\\u0627\\u0646\\u0635\\u062f\\u0645\\u062a \\u0645\\u062f\\u0631\\u064a \\u0645\\u0646 \\u0641\\u064a\\u0646 \\u062c\\u0627\\u064a\\u0628\\u0629 \\u0647\\u0627\\u0644\\u0643\\u0644\\u0645\\u0629\\u0642\\u0644\\u062a \\u0627\\u064a\\u0634 \\u0627\\u064a\\u0634 \\ud83d\\ude20 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d \\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\",\n \"#\\u0628\\u0627\\u064a\\u0639_\\u0627\\u0644\\u0643\\u0644\\u064a\\u062c\\u0627\\u0627\\u0646\\u0627 \\u0627\\u0644\\u0628\\u0646\\u062a \\u0627\\u0644\\u0648\\u062d\\u064a\\u062f\\u0647 \\u0627\\u0644\\u064a \\u0627\\u0634\\u0648\\u0641 \\u0628\\u0627\\u064a\\u0639 \\u0627\\u0644\\u0643\\u0644\\u064a\\u062c\\u0627 ( \\u062c\\u064a\\u0643\\u0631 ) \\ud83e\\udd2e\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NOT_OFF\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"OFF\",\n \"NOT_OFF\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NOT_HS\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"NOT_HS\",\n \"HS3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NOT_VLG\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"VLG\",\n \"NOT_VLG\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NOT_VIO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"VIO\",\n \"NOT_VIO\"\n 
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 6 } ], "source": [ "dev_data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "JtryssNFzfr1" }, "outputs": [], "source": [ "dev_data = dev_data.drop(columns=['8888', 'NOT_HS', 'NOT_VLG' , 'NOT_VIO'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "xL3h20zRzgW6", "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "60a29bf1-5bb1-4ad9-de44-94abea53f377" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Text label\n", "0 #داليا_مباركمادري ليش تقرفت😷 NOT_OFF\n", "1 RT @USER ابديت السناب الجديد ❌حاس الناس ح... NOT_OFF\n", "2 @USER هييه والله وااايدد 🔪🔪🔪💔💔 NOT_OFF\n", "3 اكيد اخس شي 😤 URL NOT_OFF\n", "4 مابي شي الحين غير فراشي😣 NOT_OFF\n", "... ... ...\n", "1264 @USER روما محظوظين بذا المدرب بيروتي يسحب في ر... NOT_OFF\n", "1265 @USER هلا والله بالحبهلا لولو❤️😙...\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlabel
0#داليا_مبارك<LF>مادري ليش تقرفت😷NOT_OFF
1RT @USER ابديت السناب الجديد ❌<LF>حاس الناس ح...NOT_OFF
2@USER هييه والله وااايدد 🔪🔪🔪💔💔NOT_OFF
3اكيد اخس شي 😤 URLNOT_OFF
4مابي شي الحين غير فراشي😣NOT_OFF
.........
1264@USER روما محظوظين بذا المدرب بيروتي يسحب في ر...NOT_OFF
1265@USER هلا والله بالحب<LF>هلا لولو❤️😙<LF>...<LF...OFF
1266رينز فاز 😡😡😡😡😡😡😡 يعنني اوه شوفو العرض الأسطوري...OFF
1267@USER ييييع والله شيء يلوع الكبد مريضات الله ي...OFF
1268@USER تحسينها ع كليجه م اكلت شي واضح من الصوت 😷😷NOT_OFF
\n", "

1269 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", " \n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "dev_data", "summary": "{\n \"name\": \"dev_data\",\n \"rows\": 1269,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1269,\n \"samples\": [\n \"\\u0644\\u0648 \\u0643\\u0627\\u0646 \\u0627\\u0644\\u0627\\u0646\\u062a\\u0638\\u0627\\u0631 \\u0631\\u062c\\u0644\\u0627\\u064b \\u0644\\u0642\\u062a\\u0644\\u062a\\u0647 \\ud83d\\ude21\",\n \"@USER \\u0639\\u0646\\u062f\\u064a \\u0637\\u0641\\u0644\\u0647 \\u0635\\u063a\\u064a\\u0631\\u0629 \\u0643\\u0644 \\u0645\\u0627 \\u0633\\u0648\\u062a \\u0645\\u0635\\u064a\\u0628\\u0647 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d\\u0631\\u0645\\u062a \\u0627\\u0644\\u0645\\u0641\\u062a\\u0627\\u062d \\u0645\\u0639 \\u0627\\u0644\\u0634\\u0628\\u0627\\u0643\\u060c \\u0644\\u064a\\u0634 \\u064a\\u0627\\u0628\\u0627\\u0628\\u0627 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d\\u0631\\u0645\\u062a \\u0645\\u0639\\u062c\\u0648\\u0646 \\u0627\\u0644\\u0627\\u0633\\u0646\\u0627\\u0646 \\u0644\\u064a\\u0634 \\u064a\\u0627 \\u0628\\u0627\\u0628\\u0627 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d\\u0632\\u0639\\u0644\\u062a \\u0639\\u0644\\u064a \\u0642\\u0627\\u0644\\u062a \\u0644\\u064a \\u0643\\u0644 \\u062a\\u0631\\u0627\\u0628\\u0627\\u0646\\u0635\\u062f\\u0645\\u062a \\u0645\\u062f\\u0631\\u064a \\u0645\\u0646 \\u0641\\u064a\\u0646 \\u062c\\u0627\\u064a\\u0628\\u0629 \\u0647\\u0627\\u0644\\u0643\\u0644\\u0645\\u0629\\u0642\\u0644\\u062a \\u0627\\u064a\\u0634 \\u0627\\u064a\\u0634 \\ud83d\\ude20 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d \\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\\u0647\",\n \"#\\u0628\\u0627\\u064a\\u0639_\\u0627\\u0644\\u0643\\u0644\\u064a\\u062c\\u0627\\u0627\\u0646\\u0627 \\u0627\\u0644\\u0628\\u0646\\u062a \\u0627\\u0644\\u0648\\u062d\\u064a\\u062f\\u0647 \\u0627\\u0644\\u064a \\u0627\\u0634\\u0648\\u0641 \\u0628\\u0627\\u064a\\u0639 \\u0627\\u0644\\u0643\\u0644\\u064a\\u062c\\u0627 ( \\u062c\\u064a\\u0643\\u0631 ) \\ud83e\\udd2e\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"OFF\",\n \"NOT_OFF\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 8 } ], "source": [ "dev_data = dev_data.rename(columns={\"@USER افطرت عليك بعقاء واثنين من فروخها الجن 🔪😂\": \"Text\"})\n", "dev_data = dev_data.rename(columns={\"NOT_OFF\": \"label\"})\n", "dev_data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "NjrUivmwzgb6", "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "86b4f0eb-25b6-432a-8c03-af7560e772bd" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 10158 @USER هتهزر معايا ولا ايه 😡😡😡😡\n", "0 10159 مشفتش العرض بتاعهم لا مش مهتمة لا😩🐸😂 URL\n", "1 10160 RT @USER عندما تكون لوحدك تحس انك لحالك صح 😊\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
10158@USER هتهزر معايا ولا ايه 😡😡😡😡
010159مشفتش العرض بتاعهم لا مش مهتمة لا😩🐸😂 URL
110160RT @USER عندما تكون لوحدك تحس انك لحالك صح 😊<L...
210161RT @USER ماشاء الله الرجال باين عليه محترم <LF...
310162@USER شسالفة احد يفهمني 😤
410163@USER اقووووول استريح عاد احتفالاتنا تحط اغاني...
.........
253512694قله حيا وين اهلهم ذولي الله لايبلانا لهالدرجه ...
253612695RT @USER ثم الطحلبه🐸🐸🐸 URL
253712696يا وجه الله 😷 من اليوم ورايح شاورما انسى 🔪 URL
253812697@USER متخلف حتى الحلال حرمتوه 😷
253912698@USER حنا خقينا على بنت رئيس مو على بياع كليجا...
\n", "

2540 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", " \n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "test_data", "summary": "{\n \"name\": \"test_data\",\n \"rows\": 2540,\n \"fields\": [\n {\n \"column\": \"10158\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 733,\n \"min\": 10159,\n \"max\": 12698,\n \"num_unique_values\": 2540,\n \"samples\": [\n 11716,\n 10293,\n 11798\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"@USER \\u0647\\u062a\\u0647\\u0632\\u0631 \\u0645\\u0639\\u0627\\u064a\\u0627 \\u0648\\u0644\\u0627 \\u0627\\u064a\\u0647 \\ud83d\\ude21\\ud83d\\ude21\\ud83d\\ude21\\ud83d\\ude21\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2540,\n \"samples\": [\n \"\\u064a\\u0627\\u0643\\u0631\\u0647\\u064a \\u0644\\u0647 \\u0627\\u0644\\u063a\\u064a\\u0627\\u0645\\u0647 \\ud83d\\ude37\\ud83d\\udc94\\ud83d\\udc94\\ud83d\\udc94\\ud83d\\udc94 URL\",\n \"RT @USER \\u0633\\u0639\\u0648\\u062f\\u064a\\u0647 \\u0631\\u0627\\u0643\\u0628\\u0647 \\u0632\\u0628 \\u0633\\u0648\\u0627\\u0642\\u0647\\u0627\\ud83d\\udc60 URL\",\n \"\\u0634\\u0648\\u0636\\u0639 \\u0627\\u0644\\u062c\\u0648 \\u0645\\u0639\\u064a \\u0643\\u0644 \\u0645\\u0627 \\u0627\\u063a\\u064a\\u0628 \\u064a\\u0632\\u064a\\u0646 \\u0644\\u0647\\u062f\\u0631\\u062c\\u0629 \\u0634\\u0624\\u0645 \\u0635\\u0631\\u062a \\u0639\\u0644\\u0649 \\u0627\\u0644\\u062d\\u064a\\u0627\\u0629\\ud83d\\ude21 URL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 9 } ], "source": [ "test_data" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "id": "LJsmEXA9zggW" }, "outputs": [], "source": [ "test_data = test_data.drop(columns=['10158'])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "jT_O7vSbzgio", "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "38e99acb-dcf4-48dd-ea7d-67e4637e2f4f" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Text\n", "0 مشفتش العرض بتاعهم لا مش مهتمة لا😩🐸😂 URL\n", "1 RT @USER عندما تكون لوحدك تحس انك لحالك صح 😊\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Text
0مشفتش العرض بتاعهم لا مش مهتمة لا😩🐸😂 URL
1RT @USER عندما تكون لوحدك تحس انك لحالك صح 😊<L...
2RT @USER ماشاء الله الرجال باين عليه محترم <LF...
3@USER شسالفة احد يفهمني 😤
4@USER اقووووول استريح عاد احتفالاتنا تحط اغاني...
......
2535قله حيا وين اهلهم ذولي الله لايبلانا لهالدرجه ...
2536RT @USER ثم الطحلبه🐸🐸🐸 URL
2537يا وجه الله 😷 من اليوم ورايح شاورما انسى 🔪 URL
2538@USER متخلف حتى الحلال حرمتوه 😷
2539@USER حنا خقينا على بنت رئيس مو على بياع كليجا...
\n", "

2540 rows × 1 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", " \n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "test_data", "summary": "{\n \"name\": \"test_data\",\n \"rows\": 2540,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2540,\n \"samples\": [\n \"\\u064a\\u0627\\u0643\\u0631\\u0647\\u064a \\u0644\\u0647 \\u0627\\u0644\\u063a\\u064a\\u0627\\u0645\\u0647 \\ud83d\\ude37\\ud83d\\udc94\\ud83d\\udc94\\ud83d\\udc94\\ud83d\\udc94 URL\",\n \"RT @USER \\u0633\\u0639\\u0648\\u062f\\u064a\\u0647 \\u0631\\u0627\\u0643\\u0628\\u0647 \\u0632\\u0628 \\u0633\\u0648\\u0627\\u0642\\u0647\\u0627\\ud83d\\udc60 URL\",\n \"\\u0634\\u0648\\u0636\\u0639 \\u0627\\u0644\\u062c\\u0648 \\u0645\\u0639\\u064a \\u0643\\u0644 \\u0645\\u0627 \\u0627\\u063a\\u064a\\u0628 \\u064a\\u0632\\u064a\\u0646 \\u0644\\u0647\\u062f\\u0631\\u062c\\u0629 \\u0634\\u0624\\u0645 \\u0635\\u0631\\u062a \\u0639\\u0644\\u0649 \\u0627\\u0644\\u062d\\u064a\\u0627\\u0629\\ud83d\\ude21 URL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 11 } ], "source": [ "test_data = test_data.rename(columns={\"@USER هتهزر معايا ولا ايه 😡😡😡😡\": \"Text\"})\n", "test_data" ] }, { "cell_type": "code", "source": [ "test_labels = pd.read_csv(\"OSACT2022-sharedTask-test-taskA-gold-labels.txt\", sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", "test_labels = test_labels.rename(columns={\"NOT_OFF\": \"label\"})\n", "test_data = test_data.join(test_labels)\n", "test_data" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "UY1EuBsegOr3", "outputId": "67459fb8-db65-42f3-cefe-9265da8916c3" }, "execution_count": 12, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Text label\n", "0 مشفتش العرض بتاعهم لا مش مهتمة لا😩🐸😂 URL NOT_OFF\n", "1 RT @USER عندما تكون لوحدك تحس انك لحالك صح 😊\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlabel
0مشفتش العرض بتاعهم لا مش مهتمة لا😩🐸😂 URLNOT_OFF
1RT @USER عندما تكون لوحدك تحس انك لحالك صح 😊<L...NOT_OFF
2RT @USER ماشاء الله الرجال باين عليه محترم <LF...NOT_OFF
3@USER شسالفة احد يفهمني 😤NOT_OFF
4@USER اقووووول استريح عاد احتفالاتنا تحط اغاني...NOT_OFF
.........
2535قله حيا وين اهلهم ذولي الله لايبلانا لهالدرجه ...OFF
2536RT @USER ثم الطحلبه🐸🐸🐸 URLOFF
2537يا وجه الله 😷 من اليوم ورايح شاورما انسى 🔪 URLNOT_OFF
2538@USER متخلف حتى الحلال حرمتوه 😷OFF
2539@USER حنا خقينا على بنت رئيس مو على بياع كليجا...NOT_OFF
\n", "

2540 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", " \n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "test_data", "summary": "{\n \"name\": \"test_data\",\n \"rows\": 2540,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2540,\n \"samples\": [\n \"\\u064a\\u0627\\u0643\\u0631\\u0647\\u064a \\u0644\\u0647 \\u0627\\u0644\\u063a\\u064a\\u0627\\u0645\\u0647 \\ud83d\\ude37\\ud83d\\udc94\\ud83d\\udc94\\ud83d\\udc94\\ud83d\\udc94 URL\",\n \"RT @USER \\u0633\\u0639\\u0648\\u062f\\u064a\\u0647 \\u0631\\u0627\\u0643\\u0628\\u0647 \\u0632\\u0628 \\u0633\\u0648\\u0627\\u0642\\u0647\\u0627\\ud83d\\udc60 URL\",\n \"\\u0634\\u0648\\u0636\\u0639 \\u0627\\u0644\\u062c\\u0648 \\u0645\\u0639\\u064a \\u0643\\u0644 \\u0645\\u0627 \\u0627\\u063a\\u064a\\u0628 \\u064a\\u0632\\u064a\\u0646 \\u0644\\u0647\\u062f\\u0631\\u062c\\u0629 \\u0634\\u0624\\u0645 \\u0635\\u0631\\u062a \\u0639\\u0644\\u0649 \\u0627\\u0644\\u062d\\u064a\\u0627\\u0629\\ud83d\\ude21 URL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"OFF\",\n \"NOT_OFF\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "markdown", "metadata": { "id": "zo3bQIomz58b" }, "source": [ "# **DOWNLOADING A LIST OF ARABIC STOPWORDS**" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "GEIXZTykzgkt", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "bef05ca9-90f6-42eb-90ea-ec513181d1c8" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2024-08-18 15:03:42-- https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 11468 (11K) [text/plain]\n", "Saving to: ‘stop_list_1177.txt.2’\n", "\n", "stop_list_1177.txt. 100%[===================>] 11.20K --.-KB/s in 0s \n", "\n", "2024-08-18 15:03:43 (89.4 MB/s) - ‘stop_list_1177.txt.2’ saved [11468/11468]\n", "\n" ] } ], "source": [ "# Alharbi, Alaa, and Mark Lee. \"Kawarith: an Arabic Twitter Corpus for Crisis Events.\"\n", "# Proceedings of the Sixth Arabic Natural Language Processing Workshop. 
2021\n", "\n", "!wget https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt\n", "arabic_stop_words = []\n", "with open ('./stop_list_1177.txt',encoding='utf-8') as f :\n", " for word in f.readlines() :\n", " arabic_stop_words.append(word.split(\"\\n\")[0])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "cp8SmC170cqH", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c3844a28-7993-44e2-b5c0-56d0fa85e4de" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] } ], "source": [ "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import WordPunctTokenizer\n", "from nltk.stem.isri import ISRIStemmer\n", "import string\n", "import re\n", "from bs4 import BeautifulSoup\n", "nltk.download('stopwords')\n", "\n", "\n", "tok = WordPunctTokenizer()\n", "\n", "def normalize_arabic(text):\n", " text = re.sub(\"[إأآا]\", \"ا\", text)\n", " text = re.sub(\"ى\", \"ي\", text)\n", " text = re.sub(\"ؤ\", \"ء\", text)\n", " text = re.sub(\"ئ\", \"ء\", text)\n", " text = re.sub(\"ة\", \"ه\", text)\n", " text = re.sub(\"گ\", \"ك\", text)\n", " return text\n", "\n", "\n", "def remove_diacritics(text):\n", " arabic_diacritics = re.compile(\"\"\"\n", " ّ | # Tashdid\n", " َ | # Fatha\n", " ً | # Tanwin Fath\n", " ُ | # Damma\n", " ٌ | # Tanwin Damm\n", " ِ | # Kasra\n", " ٍ | # Tanwin Kasr\n", " ْ | # Sukun\n", " ـ # Tatwil/Kashida\n", " \"\"\", re.VERBOSE)\n", " return re.sub(arabic_diacritics, '', text)\n", "\n", "\n", "def remove_punctuations(text):\n", " arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:\"؟.,'{}~¦+|!”…“–ـ'''\n", " english_punctuations = string.punctuation\n", " punctuations_list = arabic_punctuations + english_punctuations\n", " translator = str.maketrans('', '', punctuations_list)\n", " return text.translate(translator)\n", "\n", "\n", "def remove_repeating_char(text):\n", " # return re.sub(r'(.)\\1+', r'\\1', text) # keep only 1 repeat\n", " return re.sub(r'(.)\\1+', r'\\1\\1', text) # keep 2 repeat\n", "\n", "def remove_stop_words(text):\n", " word_list = nltk.tokenize.wordpunct_tokenize(text.lower())\n", " word_list = [ w for w in word_list if not w in arabic_stop_words]\n", " return (\" \".join(word_list)).strip()\n", "\n", "\n", "\n", "def remove_non_arabic_letters(text):\n", " text = re.sub(r'([@A-Za-z0-9_]+)|#|http\\S+', ' ', text) # removes non arabic letters\n", " text = re.sub(r'ـــــــــــــ', '', text) # removes non arabic letters\n", " return text\n", "\n", "\n", "\n", "\n", "def clean_str(text):\n", " text = remove_non_arabic_letters(text)\n", " text = remove_punctuations(text)\n", " text = remove_diacritics(text)\n", " text = remove_repeating_char(text)\n", " # text = remove_stop_words(text)\n", "\n", " # Extract text from HTML tags, especially when dealing with data from 𝕏 (Twitter)\n", " soup = BeautifulSoup(text, 'lxml')\n", " souped = soup.get_text()\n", " pat1 = r'@[A-Za-z0-9]+'\n", " pat2 = r'https?://[A-Za-z0-9./]+'\n", " combined_pat = r'|'.join((pat1, pat2))\n", " stripped = re.sub(combined_pat, '', souped)\n", " try:\n", " clean = stripped.decode(\"utf-8-sig\").replace(u\"\\ufffd\", \"?\")\n", " except:\n", " clean = stripped\n", "\n", " words = tok.tokenize(clean)\n", " return (\" \".join(words)).strip()" ] }, { "cell_type": "markdown", "metadata": { "id": "KU_cHcIf2H_V" }, "source": [ "## **applying preprocessing 
on our dataset**" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "ouoJlEWv0c0M", "colab": { "base_uri": "https://localhost:8080/", "height": 241 }, "outputId": "0ff531cb-be6e-47bc-adaf-2e10989e214f" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cleaning and parsing the training dataset...\n", "\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " Text label\n", "0 وصارت فطاير البقالات غذاء صحي 👎🏻 NOT_OFF\n", "1 روحي لبريده تلقين اشباه كثير بس ماحد زيكم مشفو... OFF\n", "2 مش باين حاجه خالص 😣 مش عارف بقى 😔 NOT_OFF\n", "3 اليوم الاثنين 👏 يقولك من المسلمين عندهم خاله ا... NOT_OFF\n", "4 حمدلله ماحطها في فمي اساسا 😷🤢 NOT_OFF" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlabel
0وصارت فطاير البقالات غذاء صحي 👎🏻NOT_OFF
1روحي لبريده تلقين اشباه كثير بس ماحد زيكم مشفو...OFF
2مش باين حاجه خالص 😣 مش عارف بقى 😔NOT_OFF
3اليوم الاثنين 👏 يقولك من المسلمين عندهم خاله ا...NOT_OFF
4حمدلله ماحطها في فمي اساسا 😷🤢NOT_OFF
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "train_data", "summary": "{\n \"name\": \"train_data\",\n \"rows\": 8886,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8870,\n \"samples\": [\n \"\\u0647\\u0647 \\u0645\\u0648 \\u0628\\u0646\\u062a\\u064a \\ud83d\\udc4a\\ud83d\\udc4a\\ud83d\\ude02 \\u0645\\u0627\\u0632\\u0644\\u062a \\u0639\\u0630\\u0628\\u0627\\u0621 \\u0648\\u0627\\u0641\\u062a\\u062e\\u0631 \\ud83d\\ude0c\\ud83d\\ude02\",\n \"\\u064a\\u0627\\u0631\\u0628 \\u0635\\u0628\\u0631\\u0646\\u064a \\u0639\\u0644\\u0649 \\u0644\\u0648\\u0645\\u064a \\ud83d\\udc94\\ud83d\\udc94 \\u064a\\u0635\\u062d\\u064a\\u0646\\u064a \\u0645\\u0646 \\u0639\\u0632 \\u0646\\u0648\\u0645\\u064a \\u0628\\u0627\\u0644\\u0636\\u0631\\u0628 \\u0639\\u0644\\u0649 \\u0631\\u0627\\u0633\\u064a \\ud83d\\ude21\\ud83d\\udc4a\",\n \"\\u0648\\u0627\\u0644\\u0644\\u0647 \\u0627\\u0644\\u0646\\u0627\\u0642\\u0635\\u0647 \\u0627\\u0645\\u0643 \\u0644\\u0644\\u0627\\u0633\\u0641 \\u062a\\u062d\\u062a\\u0627\\u062c \\u0645\\u0646\\u0643\\u0645 \\u0627\\u0646 \\u062a\\u0643\\u0645\\u0644\\u0648\\u0647\\u0627 \\u0627\\u0645\\u0627 \\u0646\\u062d\\u0646 \\u0648\\u0627\\u0644\\u0644\\u0647 \\u0627\\u0644\\u062d\\u0645\\u062f \\u0643\\u0627\\u0645\\u0644\\u064a\\u0646 \\u0639\\u0642\\u0644 \\u064a\\u0627\\u0646\\u0627\\u0642\\u0635 \\u0627\\u0644\\u0639\\u0642\\u0644 \\u0644\\u0627 \\u0648\\u0627\\u0639\\u0644\\u0627\\u0645\\u064a \\ud83e\\udd22\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"OFF\",\n \"NOT_OFF\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 15 } ], "source": [ "print(\"Cleaning and parsing the training dataset...\\n\")\n", "\n", "train_data[\"Text\"] = train_data[\"Text\"].apply(lambda x: clean_str(x))\n", "\n", "train_data.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "Gv3DF9UE0c3N", "colab": { "base_uri": "https://localhost:8080/", "height": 241 }, "outputId": "39a88f27-303c-4928-d1da-3e47ab1a7ade" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cleaning and parsing the development dataset...\n", "\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " Text label\n", "0 داليا مبارك مادري ليش تقرفت 😷 NOT_OFF\n", "1 ابديت السناب الجديد ❌ حاس الناس حوس أشوف مشاهي... NOT_OFF\n", "2 هييه والله واايدد 🔪🔪💔💔 NOT_OFF\n", "3 اكيد اخس شي 😤 NOT_OFF\n", "4 مابي شي الحين غير فراشي 😣 NOT_OFF" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlabel
0داليا مبارك مادري ليش تقرفت 😷NOT_OFF
1ابديت السناب الجديد ❌ حاس الناس حوس أشوف مشاهي...NOT_OFF
2هييه والله واايدد 🔪🔪💔💔NOT_OFF
3اكيد اخس شي 😤NOT_OFF
4مابي شي الحين غير فراشي 😣NOT_OFF
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "dev_data", "summary": "{\n \"name\": \"dev_data\",\n \"rows\": 1269,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1268,\n \"samples\": [\n \"\\u0644\\u0648 \\u0643\\u0627\\u0646 \\u0627\\u0644\\u0627\\u0646\\u062a\\u0638\\u0627\\u0631 \\u0631\\u062c\\u0644\\u0627 \\u0644\\u0642\\u062a\\u0644\\u062a\\u0647 \\ud83d\\ude21\",\n \"\\u0639\\u0646\\u062f\\u064a \\u0637\\u0641\\u0644\\u0647 \\u0635\\u063a\\u064a\\u0631\\u0629 \\u0643\\u0644 \\u0645\\u0627 \\u0633\\u0648\\u062a \\u0645\\u0635\\u064a\\u0628\\u0647 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d \\u0631\\u0645\\u062a \\u0627\\u0644\\u0645\\u0641\\u062a\\u0627\\u062d \\u0645\\u0639 \\u0627\\u0644\\u0634\\u0628\\u0627\\u0643 \\u0644\\u064a\\u0634 \\u064a\\u0627\\u0628\\u0627\\u0628\\u0627 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d \\u0631\\u0645\\u062a \\u0645\\u0639\\u062c\\u0648\\u0646 \\u0627\\u0644\\u0627\\u0633\\u0646\\u0627\\u0646 \\u0644\\u064a\\u0634 \\u064a\\u0627 \\u0628\\u0627\\u0628\\u0627 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d \\u0632\\u0639\\u0644\\u062a \\u0639\\u0644\\u064a \\u0642\\u0627\\u0644\\u062a \\u0644\\u064a \\u0643\\u0644 \\u062a\\u0631\\u0627\\u0628 \\u0627\\u0646\\u0635\\u062f\\u0645\\u062a \\u0645\\u062f\\u0631\\u064a \\u0645\\u0646 \\u0641\\u064a\\u0646 \\u062c\\u0627\\u064a\\u0628\\u0629 \\u0647\\u0627\\u0644\\u0643\\u0644\\u0645\\u0629 \\u0642\\u0644\\u062a \\u0627\\u064a\\u0634 \\u0627\\u064a\\u0634 \\ud83d\\ude20 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d \\u0647\\u0647\",\n \"\\u0648\\u0639\\u0642\\u0644\\u0643 \\u0643\\u0627\\u0646 \\u0641\\u064a\\u0646 \\u0645\\u0646 \\u0627\\u0644\\u0623\\u0648\\u0644 \\u064a\\u0627 \\u0633\\u0643\\u0631 \\ud83d\\ude02\\ud83d\\ude02 \\u0648\\u0627\\u0646\\u062a \\u062a\\u0642\\u062f\\u0631 \\u062a\\u0633\\u0639\\u062f\\u0647\\u0627 \\u064a\\u0627\\u0641\\u0627\\u0644\\u062d \\u0628\\u0633 \\u0627\\u0644\\u0628\\u062f\\u0627\\u064a\\u0629 \\u063a\\u0644\\u0637 \\ud83d\\udc4a\\ud83d\\udc4a\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"OFF\",\n \"NOT_OFF\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 16 } ], "source": [ "print(\"Cleaning and parsing the development dataset...\\n\")\n", "\n", "dev_data[\"Text\"] = dev_data[\"Text\"].apply(lambda x: clean_str(x))\n", "\n", "dev_data.head()" ] }, { "cell_type": "code", "source": [ "print(\"Cleaning and parsing the test dataset...\\n\")\n", "\n", "test_data[\"Text\"] = test_data[\"Text\"].apply(lambda x: clean_str(x))\n", "\n", "test_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 241 }, "id": "5Y4-Z0cZjgFz", "outputId": "189d4705-d232-41cc-9500-be7f91c4a6fa" }, "execution_count": 17, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cleaning and parsing the test dataset...\n", "\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " Text label\n", "0 مشفتش العرض بتاعهم لا مش مهتمة لا 😩🐸😂 NOT_OFF\n", "1 عندما تكون لوحدك تحس انك لحالك صح 😊 حتى انا مث... NOT_OFF\n", "2 ماشاء الله الرجال باين عليه محترم البنات متى ت... 
NOT_OFF\n", "3 شسالفة احد يفهمني 😤 NOT_OFF\n", "4 اقوول استريح عاد احتفالاتنا تحط اغانينا 😡😡 NOT_OFF" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlabel
0مشفتش العرض بتاعهم لا مش مهتمة لا 😩🐸😂NOT_OFF
1عندما تكون لوحدك تحس انك لحالك صح 😊 حتى انا مث...NOT_OFF
2ماشاء الله الرجال باين عليه محترم البنات متى ت...NOT_OFF
3شسالفة احد يفهمني 😤NOT_OFF
4اقوول استريح عاد احتفالاتنا تحط اغانينا 😡😡NOT_OFF
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "test_data", "summary": "{\n \"name\": \"test_data\",\n \"rows\": 2540,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2539,\n \"samples\": [\n \"\\u0647\\u0647 \\ud83d\\ude02\\ud83d\\ude02\\ud83d\\udc94\\ud83d\\udd2a\",\n \"\\u0627\\u0648\\u0648\\u0641 \\ud83d\\ude21\",\n \"\\u0628\\u0627\\u064a\\u0639 \\u0627\\u0644\\u0643\\u0644\\u064a\\u062c\\u0627\\u0627\\u0644\\u0644\\u0647 \\u064a\\u0633\\u062a\\u0631 \\u0639\\u0644\\u064a\\u0647 \\u0645\\u0646 \\u0627\\u062e\\u0648\\u0627\\u062a \\u0632\\u0644\\u064a\\u062e\\u0647 \\u062c\\u0645\\u0627\\u0644\\u0647 \\u0639\\u0627\\u064a\\u062f\\u064a \\u0637\\u064a\\u0628 \\u0644\\u0648\\u0634\\u0627\\u0641\\u0646 \\u064a\\u0648\\u0633\\u0641 \\u0639\\u0644\\u064a\\u0647 \\u0627\\u0644\\u0633\\u0644\\u0627\\u0645 \\u0648\\u0634 \\u0628\\u064a\\u0642\\u0637\\u0639\\u0646 \\ud83d\\udd2a\\ud83d\\udd2a\\ud83d\\udc94\\ud83d\\ude02\\ud83d\\ude02\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"OFF\",\n \"NOT_OFF\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "__s92VxN0c5y" }, "outputs": [], "source": [ "label2id = {\"NOT_OFF\": 0,\"OFF\": 1}\n", "id2label = {0: \"NOT_OFF\", 1: \"OFF\"}" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "id": "NpHWFtFk0c8b", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "649be2fe-70c0-4a22-a078-3d7824982b95" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Text label\n", "0 وصارت فطاير البقالات غذاء صحي 👎🏻 0\n", "1 روحي لبريده تلقين اشباه كثير بس ماحد زيكم مشفو... 1\n", "2 مش باين حاجه خالص 😣 مش عارف بقى 😔 0\n", "3 اليوم الاثنين 👏 يقولك من المسلمين عندهم خاله ا... 0\n", "4 حمدلله ماحطها في فمي اساسا 😷🤢 0" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlabel
0وصارت فطاير البقالات غذاء صحي 👎🏻0
1روحي لبريده تلقين اشباه كثير بس ماحد زيكم مشفو...1
2مش باين حاجه خالص 😣 مش عارف بقى 😔0
3اليوم الاثنين 👏 يقولك من المسلمين عندهم خاله ا...0
4حمدلله ماحطها في فمي اساسا 😷🤢0
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "train_data", "summary": "{\n \"name\": \"train_data\",\n \"rows\": 8886,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8870,\n \"samples\": [\n \"\\u0647\\u0647 \\u0645\\u0648 \\u0628\\u0646\\u062a\\u064a \\ud83d\\udc4a\\ud83d\\udc4a\\ud83d\\ude02 \\u0645\\u0627\\u0632\\u0644\\u062a \\u0639\\u0630\\u0628\\u0627\\u0621 \\u0648\\u0627\\u0641\\u062a\\u062e\\u0631 \\ud83d\\ude0c\\ud83d\\ude02\",\n \"\\u064a\\u0627\\u0631\\u0628 \\u0635\\u0628\\u0631\\u0646\\u064a \\u0639\\u0644\\u0649 \\u0644\\u0648\\u0645\\u064a \\ud83d\\udc94\\ud83d\\udc94 \\u064a\\u0635\\u062d\\u064a\\u0646\\u064a \\u0645\\u0646 \\u0639\\u0632 \\u0646\\u0648\\u0645\\u064a \\u0628\\u0627\\u0644\\u0636\\u0631\\u0628 \\u0639\\u0644\\u0649 \\u0631\\u0627\\u0633\\u064a \\ud83d\\ude21\\ud83d\\udc4a\",\n \"\\u0648\\u0627\\u0644\\u0644\\u0647 \\u0627\\u0644\\u0646\\u0627\\u0642\\u0635\\u0647 \\u0627\\u0645\\u0643 \\u0644\\u0644\\u0627\\u0633\\u0641 \\u062a\\u062d\\u062a\\u0627\\u062c \\u0645\\u0646\\u0643\\u0645 \\u0627\\u0646 \\u062a\\u0643\\u0645\\u0644\\u0648\\u0647\\u0627 \\u0627\\u0645\\u0627 \\u0646\\u062d\\u0646 \\u0648\\u0627\\u0644\\u0644\\u0647 \\u0627\\u0644\\u062d\\u0645\\u062f \\u0643\\u0627\\u0645\\u0644\\u064a\\u0646 \\u0639\\u0642\\u0644 \\u064a\\u0627\\u0646\\u0627\\u0642\\u0635 \\u0627\\u0644\\u0639\\u0642\\u0644 \\u0644\\u0627 \\u0648\\u0627\\u0639\\u0644\\u0627\\u0645\\u064a \\ud83e\\udd22\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 19 } ], "source": [ "train_data['label'] = train_data['label'].apply(lambda x: label2id[x])\n", "train_data=train_data[[\"Text\", \"label\"]]\n", "train_data.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "id": "QPAuj5Pz0c--", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "4ae3a361-bff6-48f4-d59f-50ad0e9d5225" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Text label\n", "0 داليا مبارك مادري ليش تقرفت 😷 0\n", "1 ابديت السناب الجديد ❌ حاس الناس حوس أشوف مشاهي... 0\n", "2 هييه والله واايدد 🔪🔪💔💔 0\n", "3 اكيد اخس شي 😤 0\n", "4 مابي شي الحين غير فراشي 😣 0" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlabel
0داليا مبارك مادري ليش تقرفت 😷0
1ابديت السناب الجديد ❌ حاس الناس حوس أشوف مشاهي...0
2هييه والله واايدد 🔪🔪💔💔0
3اكيد اخس شي 😤0
4مابي شي الحين غير فراشي 😣0
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "dev_data", "summary": "{\n \"name\": \"dev_data\",\n \"rows\": 1269,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1268,\n \"samples\": [\n \"\\u0644\\u0648 \\u0643\\u0627\\u0646 \\u0627\\u0644\\u0627\\u0646\\u062a\\u0638\\u0627\\u0631 \\u0631\\u062c\\u0644\\u0627 \\u0644\\u0642\\u062a\\u0644\\u062a\\u0647 \\ud83d\\ude21\",\n \"\\u0639\\u0646\\u062f\\u064a \\u0637\\u0641\\u0644\\u0647 \\u0635\\u063a\\u064a\\u0631\\u0629 \\u0643\\u0644 \\u0645\\u0627 \\u0633\\u0648\\u062a \\u0645\\u0635\\u064a\\u0628\\u0647 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d \\u0631\\u0645\\u062a \\u0627\\u0644\\u0645\\u0641\\u062a\\u0627\\u062d \\u0645\\u0639 \\u0627\\u0644\\u0634\\u0628\\u0627\\u0643 \\u0644\\u064a\\u0634 \\u064a\\u0627\\u0628\\u0627\\u0628\\u0627 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d \\u0631\\u0645\\u062a \\u0645\\u0639\\u062c\\u0648\\u0646 \\u0627\\u0644\\u0627\\u0633\\u0646\\u0627\\u0646 \\u0644\\u064a\\u0634 \\u064a\\u0627 \\u0628\\u0627\\u0628\\u0627 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d \\u0632\\u0639\\u0644\\u062a \\u0639\\u0644\\u064a \\u0642\\u0627\\u0644\\u062a \\u0644\\u064a \\u0643\\u0644 \\u062a\\u0631\\u0627\\u0628 \\u0627\\u0646\\u0635\\u062f\\u0645\\u062a \\u0645\\u062f\\u0631\\u064a \\u0645\\u0646 \\u0641\\u064a\\u0646 \\u062c\\u0627\\u064a\\u0628\\u0629 \\u0647\\u0627\\u0644\\u0643\\u0644\\u0645\\u0629 \\u0642\\u0644\\u062a \\u0627\\u064a\\u0634 \\u0627\\u064a\\u0634 \\ud83d\\ude20 \\u0642\\u0627\\u0644\\u062a \\u0627\\u0645\\u0632\\u062d \\u0647\\u0647\",\n \"\\u0648\\u0639\\u0642\\u0644\\u0643 \\u0643\\u0627\\u0646 \\u0641\\u064a\\u0646 \\u0645\\u0646 \\u0627\\u0644\\u0623\\u0648\\u0644 \\u064a\\u0627 \\u0633\\u0643\\u0631 \\ud83d\\ude02\\ud83d\\ude02 \\u0648\\u0627\\u0646\\u062a \\u062a\\u0642\\u062f\\u0631 \\u062a\\u0633\\u0639\\u062f\\u0647\\u0627 \\u064a\\u0627\\u0641\\u0627\\u0644\\u062d \\u0628\\u0633 \\u0627\\u0644\\u0628\\u062f\\u0627\\u064a\\u0629 \\u063a\\u0644\\u0637 \\ud83d\\udc4a\\ud83d\\udc4a\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 20 } ], "source": [ "dev_data['label'] = dev_data['label'].apply(lambda x: label2id[x])\n", "dev_data=dev_data[[\"Text\", \"label\"]]\n", "dev_data.head()" ] }, { "cell_type": "code", "source": [ "test_data['label'] = test_data['label'].apply(lambda x: label2id[x])\n", "test_data=test_data[[\"Text\", \"label\"]]\n", "test_data" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "9Dkns-XMjuvH", "outputId": "81b93b0a-989c-43c6-b56a-a5192657c375" }, "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Text label\n", "0 مشفتش العرض بتاعهم لا مش مهتمة لا 😩🐸😂 0\n", "1 عندما تكون لوحدك تحس انك لحالك صح 😊 حتى انا مث... 0\n", "2 ماشاء الله الرجال باين عليه محترم البنات متى ت... 0\n", "3 شسالفة احد يفهمني 😤 0\n", "4 اقوول استريح عاد احتفالاتنا تحط اغانينا 😡😡 0\n", "... ... ...\n", "2535 قله حيا وين اهلهم ذولي الله لايبلانا لهالدرجه ... 
1\n", "2536 ثم الطحلبه 🐸🐸 1\n", "2537 يا وجه الله 😷 من اليوم ورايح شاورما انسى 🔪 0\n", "2538 متخلف حتى الحلال حرمتوه 😷 1\n", "2539 حنا خقينا على بنت رئيس مو على بياع كليجا 😝👊🏻 0\n", "\n", "[2540 rows x 2 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlabel
0مشفتش العرض بتاعهم لا مش مهتمة لا 😩🐸😂0
1عندما تكون لوحدك تحس انك لحالك صح 😊 حتى انا مث...0
2ماشاء الله الرجال باين عليه محترم البنات متى ت...0
3شسالفة احد يفهمني 😤0
4اقوول استريح عاد احتفالاتنا تحط اغانينا 😡😡0
.........
2535قله حيا وين اهلهم ذولي الله لايبلانا لهالدرجه ...1
2536ثم الطحلبه 🐸🐸1
2537يا وجه الله 😷 من اليوم ورايح شاورما انسى 🔪0
2538متخلف حتى الحلال حرمتوه 😷1
2539حنا خقينا على بنت رئيس مو على بياع كليجا 😝👊🏻0
\n", "

2540 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "test_data", "summary": "{\n \"name\": \"test_data\",\n \"rows\": 2540,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2539,\n \"samples\": [\n \"\\u0647\\u0647 \\ud83d\\ude02\\ud83d\\ude02\\ud83d\\udc94\\ud83d\\udd2a\",\n \"\\u0627\\u0648\\u0648\\u0641 \\ud83d\\ude21\",\n \"\\u0628\\u0627\\u064a\\u0639 \\u0627\\u0644\\u0643\\u0644\\u064a\\u062c\\u0627\\u0627\\u0644\\u0644\\u0647 \\u064a\\u0633\\u062a\\u0631 \\u0639\\u0644\\u064a\\u0647 \\u0645\\u0646 \\u0627\\u062e\\u0648\\u0627\\u062a \\u0632\\u0644\\u064a\\u062e\\u0647 \\u062c\\u0645\\u0627\\u0644\\u0647 \\u0639\\u0627\\u064a\\u062f\\u064a \\u0637\\u064a\\u0628 \\u0644\\u0648\\u0634\\u0627\\u0641\\u0646 \\u064a\\u0648\\u0633\\u0641 \\u0639\\u0644\\u064a\\u0647 \\u0627\\u0644\\u0633\\u0644\\u0627\\u0645 \\u0648\\u0634 \\u0628\\u064a\\u0642\\u0637\\u0639\\u0646 \\ud83d\\udd2a\\ud83d\\udd2a\\ud83d\\udc94\\ud83d\\ude02\\ud83d\\ude02\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "id": "kB57ziQ83vP3", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "5e827c2e-abb8-4857-cdff-7d9b9c3cf558" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Original class distribution: Counter({0: 5715, 1: 3171})\n", "Resampled class distribution: Counter({0: 5715, 1: 5715})\n" ] } ], "source": [ "import pandas as pd\n", "from imblearn.over_sampling import RandomOverSampler\n", "from collections import Counter\n", "\n", "X = train_data[['Text']]\n", "y = train_data['label']\n", "\n", "print('Original class distribution:', Counter(y))\n", "\n", "ros = RandomOverSampler(random_state=42)\n", "\n", "X_resampled, y_resampled = ros.fit_resample(X, y)\n", "\n", "train_data_resampled = pd.DataFrame(X_resampled, columns=['Text'])\n", "train_data_resampled['label'] = y_resampled\n", "\n", "print('Resampled class distribution:', Counter(y_resampled))" ] }, { "cell_type": "code", "source": [ "y_resampled.value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 178 }, "id": "yN8_AEhEj5IY", "outputId": "70aad1a5-ceee-4484-edb9-36a9150dc4be" }, "execution_count": 23, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "label\n", "0 5715\n", "1 5715\n", "Name: count, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
label
05715
15715
\n", "

" ] }, "metadata": {}, "execution_count": 23 } ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "id": "lsoxSLHk7XsK", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "5b6b3acf-95b4-49ce-ee3d-a99ec3078071" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Text label\n", "0 وصارت فطاير البقالات غذاء صحي 👎🏻 0\n", "1 روحي لبريده تلقين اشباه كثير بس ماحد زيكم مشفو... 1\n", "2 مش باين حاجه خالص 😣 مش عارف بقى 😔 0\n", "3 اليوم الاثنين 👏 يقولك من المسلمين عندهم خاله ا... 0\n", "4 حمدلله ماحطها في فمي اساسا 😷🤢 0" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlabel
0وصارت فطاير البقالات غذاء صحي 👎🏻0
1روحي لبريده تلقين اشباه كثير بس ماحد زيكم مشفو...1
2مش باين حاجه خالص 😣 مش عارف بقى 😔0
3اليوم الاثنين 👏 يقولك من المسلمين عندهم خاله ا...0
4حمدلله ماحطها في فمي اساسا 😷🤢0
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "train_data_resampled", "summary": "{\n \"name\": \"train_data_resampled\",\n \"rows\": 11430,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8870,\n \"samples\": [\n \"\\u0647\\u0647 \\u0645\\u0648 \\u0628\\u0646\\u062a\\u064a \\ud83d\\udc4a\\ud83d\\udc4a\\ud83d\\ude02 \\u0645\\u0627\\u0632\\u0644\\u062a \\u0639\\u0630\\u0628\\u0627\\u0621 \\u0648\\u0627\\u0641\\u062a\\u062e\\u0631 \\ud83d\\ude0c\\ud83d\\ude02\",\n \"\\u064a\\u0627\\u0631\\u0628 \\u0635\\u0628\\u0631\\u0646\\u064a \\u0639\\u0644\\u0649 \\u0644\\u0648\\u0645\\u064a \\ud83d\\udc94\\ud83d\\udc94 \\u064a\\u0635\\u062d\\u064a\\u0646\\u064a \\u0645\\u0646 \\u0639\\u0632 \\u0646\\u0648\\u0645\\u064a \\u0628\\u0627\\u0644\\u0636\\u0631\\u0628 \\u0639\\u0644\\u0649 \\u0631\\u0627\\u0633\\u064a \\ud83d\\ude21\\ud83d\\udc4a\",\n \"\\u0648\\u0627\\u0644\\u0644\\u0647 \\u0627\\u0644\\u0646\\u0627\\u0642\\u0635\\u0647 \\u0627\\u0645\\u0643 \\u0644\\u0644\\u0627\\u0633\\u0641 \\u062a\\u062d\\u062a\\u0627\\u062c \\u0645\\u0646\\u0643\\u0645 \\u0627\\u0646 \\u062a\\u0643\\u0645\\u0644\\u0648\\u0647\\u0627 \\u0627\\u0645\\u0627 \\u0646\\u062d\\u0646 \\u0648\\u0627\\u0644\\u0644\\u0647 \\u0627\\u0644\\u062d\\u0645\\u062f \\u0643\\u0627\\u0645\\u0644\\u064a\\u0646 \\u0639\\u0642\\u0644 \\u064a\\u0627\\u0646\\u0627\\u0642\\u0635 \\u0627\\u0644\\u0639\\u0642\\u0644 \\u0644\\u0627 \\u0648\\u0627\\u0639\\u0644\\u0627\\u0645\\u064a \\ud83e\\udd22\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 24 } ], "source": [ "train_data_resampled.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "id": "9UYWjyD_7Xzs", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "947db852-87e5-4e3c-e79c-afeaa7d9f96a" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Training data shape: (11430,) (11430,)\n", "Validation data shape: (1269,) (1269,)\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train = train_data_resampled['Text'].values\n", "y_train = train_data_resampled['label'].values\n", "\n", "X_val = dev_data['Text'].values\n", "y_val = dev_data['label'].values\n", "\n", "\n", "\n", "print(\"Training data shape:\", X_train.shape, y_train.shape)\n", "print(\"Validation data shape:\", X_val.shape, y_val.shape)\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "id": "C4lzRYvx7X6U", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "895b827b-d407-4c37-f630-5ce6f486025a" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Maximum length of text: 59\n" ] } ], "source": [ "train_text_lengths = [len(text.split()) for text in X_train]\n", "max_length = max(train_text_lengths)\n", "\n", "print(\"Maximum length of text:\", max_length)" ] }, { "cell_type": "markdown", "source": [ "### APPLYING QARIB MODEL" ], "metadata": { "id": "tEoMuIeVkKYc" } }, { "cell_type": "code", "source": [ "! 
pip install transformers[torch]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7ivxSOxUkIzt", "outputId": "c237ed8e-5972-4612-dfd6-69e692d579ab" }, "execution_count": 27, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers[torch] in /usr/local/lib/python3.10/dist-packages (4.42.4)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (3.15.4)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.23.5)\n", "Requirement already satisfied: numpy<2.0,>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (24.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2024.5.15)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2.32.3)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.4.4)\n", "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.19.1)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (4.66.5)\n", "Requirement already satisfied: accelerate>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.32.1)\n", "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2.3.1+cu121)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->transformers[torch]) (5.9.5)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers[torch]) (2024.6.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers[torch]) (4.12.2)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (1.13.1)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (3.1.4)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.105)\n", "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (8.9.2.26)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from 
torch->transformers[torch]) (12.1.3.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (11.0.2.54)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (10.3.2.106)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (11.4.5.107)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.0.106)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (2.20.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.105)\n", "Requirement already satisfied: triton==2.3.1 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (2.3.1)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->transformers[torch]) (12.6.20)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]) (2024.7.4)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->transformers[torch]) (2.1.5)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->transformers[torch]) (1.3.0)\n" ] } ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "\n", "# to prepare dataset and calculate metrics\n", "from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score\n", "\n", "from transformers import AutoConfig, BertForSequenceClassification, AutoTokenizer\n", "from transformers.data.processors import SingleSentenceClassificationProcessor, InputFeatures\n", "from transformers import Trainer , TrainingArguments" ], "metadata": { "id": "cEAZniEwkI25" }, "execution_count": 28, "outputs": [] }, { "cell_type": "code", "source": [ "train_df = pd.DataFrame({\n", " 'label':y_train,\n", " 'text': X_train\n", " })\n", "\n", "dev_df = pd.DataFrame({\n", " 'label':y_val,\n", " 'text': X_val\n", " })\n", "\n", "test_df = pd.DataFrame({\n", " 'label':test_data['label'],\n", " 'text': test_data['Text']\n", " })" ], "metadata": { "id": "QtGA8ndrkI5f" }, "execution_count": 29, "outputs": [] }, { "cell_type": "code", "source": [ "PREFIX_LIST = [\n", " \"ال\",\n", " \"و\",\n", " \"ف\",\n", " \"ب\",\n", " \"ك\",\n", " \"ل\",\n", " \"لل\",\n", " \"\\u0627\\u0644\",\n", " \"\\u0648\",\n", " \"\\u0641\",\n", " \"\\u0628\",\n", " \"\\u0643\",\n", " \"\\u0644\",\n", " \"\\u0644\\u0644\",\n", " \"س\",\n", "]\n", "SUFFIX_LIST = [\n", " \"ه\",\n", " \"ها\",\n", " \"ك\",\n", " \"ي\",\n", " \"هما\",\n", " \"كما\",\n", " 
\"نا\",\n", " \"كم\",\n", " \"هم\",\n", " \"هن\",\n", " \"كن\",\n", " \"ا\",\n", " \"ان\",\n", " \"ين\",\n", " \"ون\",\n", " \"وا\",\n", " \"ات\",\n", " \"ت\",\n", " \"ن\",\n", " \"ة\",\n", " \"\\u0647\",\n", " \"\\u0647\\u0627\",\n", " \"\\u0643\",\n", " \"\\u064a\",\n", " \"\\u0647\\u0645\\u0627\",\n", " \"\\u0643\\u0645\\u0627\",\n", " \"\\u0646\\u0627\",\n", " \"\\u0643\\u0645\",\n", " \"\\u0647\\u0645\",\n", " \"\\u0647\\u0646\",\n", " \"\\u0643\\u0646\",\n", " \"\\u0627\",\n", " \"\\u0627\\u0646\",\n", " \"\\u064a\\u0646\",\n", " \"\\u0648\\u0646\",\n", " \"\\u0648\\u0627\",\n", " \"\\u0627\\u062a\",\n", " \"\\u062a\",\n", " \"\\u0646\",\n", " \"\\u0629\",\n", "]\n", "\n", "\n", "# the never_split list is used with the transformers library\n", "_PREFIX_SYMBOLS = [x + \"+\" for x in PREFIX_LIST]\n", "_SUFFIX_SYMBOLS = [\"+\" + x for x in SUFFIX_LIST]\n", "NEVER_SPLIT_TOKENS = list(set(_PREFIX_SYMBOLS + _SUFFIX_SYMBOLS))" ], "metadata": { "id": "poPrBhk3kI8W" }, "execution_count": 30, "outputs": [] }, { "cell_type": "code", "source": [ "model_name = \"qarib/bert-base-qarib\"\n", "num_labels = 2\n", "config = AutoConfig.from_pretrained(model_name,num_labels=num_labels, output_attentions=True)\n", "tokenizer = AutoTokenizer.from_pretrained(model_name,\n", " do_lower_case=False,\n", " do_basic_tokenize=True,\n", " never_split=NEVER_SPLIT_TOKENS)\n", "tokenizer.max_len = 64\n", "model = BertForSequenceClassification.from_pretrained(model_name, config=config)\n", "\n", "train_dataset = SingleSentenceClassificationProcessor(mode='classification')\n", "dev_dataset = SingleSentenceClassificationProcessor(mode='classification')\n", "\n", "train_dataset.add_examples(texts_or_text_and_labels=train_df['text'],labels=train_df['label'],overwrite_examples = True)\n", "dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'],labels=dev_df['label'],overwrite_examples = True)\n", "print(train_dataset.examples[0])\n", "\n", "train_features = train_dataset.get_features(tokenizer = tokenizer, max_length =64)\n", "dev_features = dev_dataset.get_features(tokenizer = tokenizer, max_length =64)\n", "# print(config)\n", "\n", "print(len(train_features))\n", "print(len(dev_features))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mPueTThMkI-9", "outputId": "2064aba1-e125-4d87-b502-b7ac54dc7b94" }, "execution_count": 31, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n", "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at qarib/bert-base-qarib and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. 
If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "InputExample(guid=None, text_a='وصارت فطاير البقالات غذاء صحي 👎🏻', text_b=None, label=0)\n", "11430\n", "1269\n" ] } ] }, { "cell_type": "code", "source": [ "\n", "def compute_metrics(p): #p should be of type EvalPrediction\n", " print(np.shape(p.predictions[0]))\n", " print(np.shape(p.predictions[1]))\n", " print(len(p.label_ids))\n", " preds = np.argmax(p.predictions[0], axis=1)\n", " assert len(preds) == len(p.label_ids)\n", " print(classification_report(p.label_ids,preds))\n", " print(confusion_matrix(p.label_ids,preds))\n", "\n", " macro_f1 = f1_score(p.label_ids,preds,average='macro')\n", " macro_precision = precision_score(p.label_ids,preds,average='macro')\n", " macro_recall = recall_score(p.label_ids,preds,average='macro')\n", " acc = accuracy_score(p.label_ids,preds)\n", " return {\n", " 'macro_f1' : macro_f1,\n", " 'macro_precision': macro_precision,\n", " 'macro_recall': macro_recall,\n", " 'accuracy': acc\n", " }" ], "metadata": { "id": "bw33WDOvkJBQ" }, "execution_count": 32, "outputs": [] }, { "cell_type": "code", "source": [ "! mkdir train\n", "training_args = TrainingArguments(\"./train\")\n", "training_args.do_train = True\n", "training_args.evaluate_during_training = True\n", "training_args.adam_epsilon = 1e-8\n", "training_args.learning_rate = 2e-5\n", "training_args.warmup_steps = 0\n", "training_args.per_device_train_batch_size = 32 #Increase batch size\n", "training_args.per_device_eval_batch_size = 32 #Increase batch size\n", "training_args.num_train_epochs = 2 #reduce number of epoch\n", "training_args.logging_steps = 200 #Increase logging steps\n", "training_args.save_steps = 1000 #Increase save steps\n", "training_args.seed = 42\n", "print(training_args.logging_steps)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "cv8L1xaEkJEA", "outputId": "cb4048b3-c6d5-41b1-f419-3448cd153c4f" }, "execution_count": 33, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "mkdir: cannot create directory ‘train’: File exists\n", "200\n" ] } ] }, { "cell_type": "code", "source": [ "# instantiate trainer\n", "trainer = Trainer(model=model,\n", " args = training_args,\n", " train_dataset = train_features,\n", " eval_dataset = dev_features,\n", " compute_metrics = compute_metrics)\n", "# start training\n", "trainer.train()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 224 }, "id": "Z5rCtBVskJGY", "outputId": "1e2bfab6-7e6b-4609-f330-309bca070d00" }, "execution_count": 34, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation=\"eager\"` when loading the model.\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "
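One caveat about the `TrainingArguments` cell above: it mutates a default instance after construction. That is fine for plain hyperparameters, but `evaluate_during_training` is not an attribute that recent `transformers` releases act on (periodic evaluation is requested through the evaluation-strategy argument instead), which is why the training log below shows only the training loss. A constructor-based sketch, assuming the transformers 4.42 installed above:

```python
from transformers import TrainingArguments

# Constructor-based equivalent of the attribute assignments above,
# assuming transformers 4.42 as installed earlier in this notebook.
training_args = TrainingArguments(
    output_dir="./train",            # created by the Trainer; no mkdir needed
    learning_rate=2e-5,
    adam_epsilon=1e-8,
    warmup_steps=0,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    logging_steps=200,
    save_steps=1000,
    eval_strategy="steps",           # evaluate on dev_features periodically
    eval_steps=200,
    seed=42,
)
```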
\n", " \n", " \n", " [716/716 04:29, Epoch 2/2]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
2000.409100
4000.302700
6000.173300

" ] }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [ "TrainOutput(global_step=716, training_loss=0.2719311820728153, metrics={'train_runtime': 273.0161, 'train_samples_per_second': 83.731, 'train_steps_per_second': 2.623, 'total_flos': 751839840691200.0, 'train_loss': 0.2719311820728153, 'epoch': 2.0})" ] }, "metadata": {}, "execution_count": 34 } ] }, { "cell_type": "code", "source": [ "trainer.evaluate()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 436 }, "id": "7rs4VqEmkJJP", "outputId": "886bc7fc-4f36-4108-8ae3-176d195b250b" }, "execution_count": 35, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "

\n", " \n", " \n", " [40/40 00:05]\n", "
\n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "(1269, 2)\n", "(12, 1269, 12, 64, 64)\n", "1269\n", " precision recall f1-score support\n", "\n", " 0 0.92 0.89 0.90 865\n", " 1 0.78 0.83 0.80 404\n", "\n", " accuracy 0.87 1269\n", " macro avg 0.85 0.86 0.85 1269\n", "weighted avg 0.87 0.87 0.87 1269\n", "\n", "[[768 97]\n", " [ 69 335]]\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "{'eval_loss': 0.44044920802116394,\n", " 'eval_macro_f1': 0.8519515458874727,\n", " 'eval_macro_precision': 0.84651284348865,\n", " 'eval_macro_recall': 0.8585345962341899,\n", " 'eval_accuracy': 0.8691883372734437,\n", " 'eval_runtime': 10.8551,\n", " 'eval_samples_per_second': 116.904,\n", " 'eval_steps_per_second': 3.685,\n", " 'epoch': 2.0}" ] }, "metadata": {}, "execution_count": 35 } ] }, { "cell_type": "code", "source": [ "!pip install fasttext\n", "import fasttext\n", "import fasttext.util\n", "from huggingface_hub import hf_hub_download\n", "\n", "model_path = hf_hub_download(repo_id=\"facebook/fasttext-ar-vectors\", filename=\"model.bin\")\n", "# model_path = \"./fasttext-ar-vectors-150.bin\"\n", "model_fasttext = fasttext.load_model(model_path)\n", "# model_fasttext = fasttext.util.reduce_model(model_fasttext, 150) # reduce embeddings dimension to 150 from 300; requires a huge memory notebook\n", "# model_fasttext.save_model(\"/content/drive/MyDrive/Colab Notebooks/text-aml/hate-speech-ds/fasttext-ar-vectors-150.bin\")\n", "print(len(model_fasttext.words))\n", "model_fasttext['bread'].shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nRVxt4kUsIrl", "outputId": "390379f6-2a02-40fe-da8f-df00a6869d57" }, "execution_count": 36, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: fasttext in /usr/local/lib/python3.10/dist-packages (0.9.2)\n", "Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.10/dist-packages (from fasttext) (2.13.4)\n", "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from fasttext) (71.0.4)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from fasttext) (1.26.4)\n", "2000000\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "(300,)" ] }, "metadata": {}, "execution_count": 36 } ] }, { "cell_type": "code", "source": [ "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import WordPunctTokenizer\n", "from nltk.stem.isri import ISRIStemmer\n", "import string\n", "import re\n", "from bs4 import BeautifulSoup\n", "nltk.download('stopwords')\n", "\n", "\n", "tok = WordPunctTokenizer()\n", "\n", "def normalize_arabic(text):\n", " text = re.sub(\"[إأآا]\", \"ا\", text)\n", " text = re.sub(\"ى\", \"ي\", text)\n", " text = re.sub(\"ؤ\", \"ء\", text)\n", " text = re.sub(\"ئ\", \"ء\", text)\n", " text = re.sub(\"ة\", \"ه\", text)\n", " text = re.sub(\"گ\", \"ك\", text)\n", " return text\n", "\n", "\n", "def remove_diacritics(text):\n", " arabic_diacritics = re.compile(\"\"\"\n", " ّ | # Tashdid\n", " َ | # Fatha\n", " ً | # Tanwin Fath\n", " ُ | # Damma\n", " ٌ | # Tanwin Damm\n", " ِ | # Kasra\n", " ٍ | # Tanwin Kasr\n", " ْ | # Sukun\n", " ـ # Tatwil/Kashida\n", " \"\"\", re.VERBOSE)\n", " return re.sub(arabic_diacritics, '', text)\n", "\n", "\n", "def remove_punctuations(text):\n", " arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:\"؟.,'{}~¦+|!”…“–ـ'''\n", " english_punctuations = string.punctuation\n", " 
punctuations_list = arabic_punctuations + english_punctuations\n", " translator = str.maketrans('', '', punctuations_list)\n", " return text.translate(translator)\n", "\n", "\n", "def remove_repeating_char(text):\n", " # return re.sub(r'(.)\\1+', r'\\1', text) # keep only 1 repeat\n", " return re.sub(r'(.)\\1+', r'\\1\\1', text) # keep 2 repeat\n", "\n", "def remove_stop_words(text):\n", " #nltk.download('stopwords')\n", " englishStopWords = stopwords.words('english')\n", "\n", " all_stopwords = set(englishStopWords + arabic_stop_words)\n", "\n", " word_list = nltk.tokenize.wordpunct_tokenize(text.lower())\n", " word_list = [ w for w in word_list if not w in all_stopwords ]\n", " return (\" \".join(word_list)).strip()\n", "\n", "def get_root(text):\n", " word_list = nltk.tokenize.wordpunct_tokenize(text.lower())\n", " result = []\n", " arstemmer = ISRIStemmer()\n", " for word in word_list: result.append(arstemmer.stem(word))\n", " return (' '.join(result)).strip()\n", "\n", "def clean_tweet(text):\n", " text = re.sub(r'([@A-Za-z0-9_]+)|#|http\\S+', ' ', text) # removes non arabic letters\n", " text = re.sub(r'ـــــــــــــ', '', text) # removes non arabic letters\n", " return text\n", "\n", "\n", "\n", "\n", "def clean_str(text):\n", " text = clean_tweet(text)\n", " # text = normalize_arabic(text)\n", " text = remove_punctuations(text) ###\n", " text = remove_diacritics(text)\n", " text = remove_repeating_char(text) ###\n", " # text = remove_stop_words(text) ###\n", "\n", "\n", " text = text.replace('وو', 'و') ###\n", " text = text.replace('يي', 'ي') ###\n", " text = text.replace('اا', 'ا') ###\n", "\n", " # text = get_root(text) ###\n", "\n", " soup = BeautifulSoup(text, 'lxml')\n", " souped = soup.get_text()\n", " pat1 = r'@[A-Za-z0-9]+'\n", " pat2 = r'https?://[A-Za-z0-9./]+'\n", " combined_pat = r'|'.join((pat1, pat2))\n", " stripped = re.sub(combined_pat, '', souped)\n", " try:\n", " clean = stripped.decode(\"utf-8-sig\").replace(u\"\\ufffd\", \"?\")\n", " except:\n", " clean = stripped\n", "\n", " words = tok.tokenize(clean)\n", " return (\" \".join(words)).strip()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GjFkTWjivAGM", "outputId": "f791d7bc-7f70-491b-b2f3-60d80a8322f5" }, "execution_count": 37, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] } ] }, { "cell_type": "code", "source": [ "!gdown \"165kzfZDsRTZAAfZKedeZiUlKzMcHNgPd\" # arabic stop words\n", "!gdown \"1WdgbvqDYIa-g5ijjsz5zb-3lVvUXUtmS&confirm=t\" # qarib pretrained model\n", "!gdown \"1foNTGFjhWAxS-_SfF7rga80UmFT7BDJ0&confirm=t\" # fasttext-ar-vectors-150.bin" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9OkLaRcYvOky", "outputId": "9b38be6e-aee6-4e34-ba3e-fa142aa95903" }, "execution_count": 38, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading...\n", "From: https://drive.google.com/uc?id=165kzfZDsRTZAAfZKedeZiUlKzMcHNgPd\n", "To: /content/Arabic_stop_words.txt\n", "\r 0% 0.00/6.48k [00:00=1.14.0 in /usr/local/lib/python3.10/dist-packages (from pyarabic) (1.16.0)\n", "Requirement already satisfied: farasapy in /usr/local/lib/python3.10/dist-packages (0.0.14)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from farasapy) (2.32.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from farasapy) (4.66.5)\n", 
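Two easy-to-miss behaviours of the cleaning utilities above: `remove_repeating_char` collapses any run of a repeated character down to exactly two (the commented-out variant keeps one), and `clean_str` then folds the doubled و / ي / ا back to a single letter. For example:

```python
# Assumes remove_repeating_char as defined above (keeps 2 repeats).
print(remove_repeating_char('ههههههه'))   # -> 'هه'
print(remove_repeating_char('واااايد'))   # -> 'واايد'
# clean_str then applies text.replace('اا', 'ا'), giving 'وايد'.
```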
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->farasapy) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->farasapy) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->farasapy) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->farasapy) (2024.7.4)\n", "Requirement already satisfied: transformers[torch] in /usr/local/lib/python3.10/dist-packages (4.42.4)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (3.15.4)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.23.5)\n", "Requirement already satisfied: numpy<2.0,>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (24.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2024.5.15)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2.32.3)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.4.4)\n", "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.19.1)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (4.66.5)\n", "Requirement already satisfied: accelerate>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (0.32.1)\n", "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from transformers[torch]) (2.3.1+cu121)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->transformers[torch]) (5.9.5)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers[torch]) (2024.6.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers[torch]) (4.12.2)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (1.13.1)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (3.1.4)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.105)\n", 
"Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (8.9.2.26)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.3.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (11.0.2.54)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (10.3.2.106)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (11.4.5.107)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.0.106)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (2.20.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (12.1.105)\n", "Requirement already satisfied: triton==2.3.1 in /usr/local/lib/python3.10/dist-packages (from torch->transformers[torch]) (2.3.1)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->transformers[torch]) (12.6.20)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]) (2024.7.4)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->transformers[torch]) (2.1.5)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->transformers[torch]) (1.3.0)\n", "Requirement already satisfied: Keras-Preprocessing in /usr/local/lib/python3.10/dist-packages (1.1.2)\n", "Requirement already satisfied: numpy>=1.9.1 in /usr/local/lib/python3.10/dist-packages (from Keras-Preprocessing) (1.26.4)\n", "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from Keras-Preprocessing) (1.16.0)\n", "fatal: destination path 'fastText' already exists and is not an empty directory.\n", "Processing /content/fastText\n", " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (2.13.4)\n", "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (71.0.4)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (1.26.4)\n", "Building wheels for collected packages: fasttext\n", " Building wheel for fasttext (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4246493 sha256=885fb048658f5739230aa0f1739c694da4108e0389d8c475331adf658e2d5d91\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-j31t7qci/wheels/8b/05/af/3cfae069d904597d44b309c956601b611bdf8967bcbe968903\n", "Successfully built fasttext\n", "Installing collected packages: fasttext\n", " Attempting uninstall: fasttext\n", " Found existing installation: fasttext 0.9.2\n", " Uninstalling fasttext-0.9.2:\n", " Successfully uninstalled fasttext-0.9.2\n", "Successfully installed fasttext-0.9.2\n" ] } ] }, { "cell_type": "code", "source": [ "from transformers import pipeline\n", "unmasker_MARBERT = pipeline('fill-mask', model='UBC-NLP/MARBERT', top_k=50)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3sXnrmNU7t66", "outputId": "6a70f0b3-a915-45c1-ae05-e846dcd56e88" }, "execution_count": 40, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Some weights of the model checkpoint at UBC-NLP/MARBERT were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. 
Model will be on CPU.\n" ] } ] }, { "cell_type": "code", "source": [ "def light_preprocess(text):\n", " text = clean_tweet(text)\n", " # text = normalize_arabic(text)\n", " text = remove_punctuations(text) ###\n", " text = remove_diacritics(text)\n", " text = remove_repeating_char(text) ###\n", " text = text.replace('وو', 'و') ###\n", " text = text.replace('يي', 'ي') ###\n", " text = text.replace('اا', 'ا') ###\n", " return text\n", "\n", "nltk.download('stopwords')\n", "englishStopWords = stopwords.words('english')\n", "arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:\"؟.,'{}~¦+|!”…“–ـ'''\n", "english_punctuations = string.punctuation\n", "punctuations_list = arabic_punctuations + english_punctuations\n", "\n", "all_stopwords = set(englishStopWords + arabic_stop_words)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "erAHadS9sIww", "outputId": "db064763-15a9-4f1f-a2a3-246cf1cf09ad" }, "execution_count": 41, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] } ] }, { "cell_type": "code", "source": [ "!pip install torch # Install the PyTorch library if you haven't already\n", "\n", "import torch\n", "# Determine if a GPU is available and set the device accordingly\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "def classsify_tweets(tweet):\n", " df = pd.DataFrame({\"tweet\": tweet})\n", " df['clean_tweet'] = df['tweet'].apply(lambda x: clean_str(x))\n", "\n", " dev_df = pd.DataFrame({\n", " 'id':range(len(df)),\n", " 'text': df[\"clean_tweet\"]\n", " })\n", "\n", " test_example = SingleSentenceClassificationProcessor(mode='classification')\n", " test_example.add_examples(texts_or_text_and_labels=dev_df['text'], overwrite_examples = True)\n", "\n", " test_features = test_example.get_features(tokenizer = tokenizer, max_length =64)\n", "\n", " input_ids = [i.input_ids for i in test_features]\n", " attention_masks = [i.attention_mask for i in test_features]\n", "\n", " inputs = torch.tensor(input_ids)\n", " masks = torch.tensor(attention_masks)\n", "\n", " # Put the model in an evaluation state\n", " model.eval()\n", "\n", " # Transfer model to GPU\n", " model.to(device)\n", "\n", " torch.cuda.empty_cache() # empty the gpu memory\n", " # Transfer the batch to gpu\n", " inputs = inputs.to(device)\n", " masks = masks.to(device)\n", "\n", " # Run inference on the example\n", " output = model(inputs, attention_mask=masks)[\"logits\"]\n", " # Transfer the output to CPU again and convert to numpy\n", " output = output.cpu().detach().numpy()\n", "\n", " return output" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "68JNE5IX2o47", "outputId": "750d6a84-dbfb-4cb1-9983-88b8aeb7d547" }, "execution_count": 42, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.3.1+cu121)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.15.4)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch) (4.12.2)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.13.1)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.3)\n", "Requirement already satisfied: jinja2 in 
/usr/local/lib/python3.10/dist-packages (from torch) (3.1.4)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch) (2024.6.1)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n", "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch) (8.9.2.26)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.3.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch) (11.0.2.54)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch) (10.3.2.106)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch) (11.4.5.107)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.0.106)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch) (2.20.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n", "Requirement already satisfied: triton==2.3.1 in /usr/local/lib/python3.10/dist-packages (from torch) (2.3.1)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch) (12.6.20)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.5)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n" ] } ] }, { "cell_type": "code", "source": [ "size = len(test_data)\n", "print(\"size of test set:\", size)\n", "correct_class_tweets = []\n", "correct_class = []\n", "for i in range(0, size):\n", " txt = test_data['Text'].astype('U')[i]\n", " cls = test_data['label'][i]\n", " label = id2label[np.argmax(classsify_tweets([txt]), axis=1)[0]]\n", " if label == cls and label == 1:\n", " correct_class_tweets.append(txt)\n", " correct_class.append(cls)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Qr6Xai9n6-O3", "outputId": "2ab47c28-e19f-4ba6-8f77-ca1be3ecf569" }, "execution_count": 69, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "size of test set: 2540\n" ] } ] }, { "cell_type": "code", "source": [ "from scipy.spatial import distance\n", "from farasa.stemmer import FarasaStemmer\n", "frasa_stemmer = FarasaStemmer(interactive=True)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_Ah-5HTtwxSq", "outputId": "405181a8-6017-4966-9048-9a8e2dd86c06" }, "execution_count": 45, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[2024-08-18 15:12:31,185 - farasapy_logger - WARNING]: Be careful with large lines as they may break on interactive mode. 
You may switch to Standalone mode for such cases.\n" ] } ] }, { "cell_type": "code", "source": [ "!pip install emoji" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "47lHG0uOwxZ9", "outputId": "aa715294-b4bd-47f2-c240-5a360e48c300" }, "execution_count": 46, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: emoji in /usr/local/lib/python3.10/dist-packages (2.12.1)\n", "Requirement already satisfied: typing-extensions>=4.7.0 in /usr/local/lib/python3.10/dist-packages (from emoji) (4.12.2)\n" ] } ] }, { "cell_type": "code", "source": [ "import emoji\n", "\n", "def select_best_replacement(pos, x_cur, verbose=False):\n", " \"\"\" Select the most effective replacement to word at pos (pos) in (x_cur)\"\"\"\n", "\n", " if bool(emoji.emoji_count(x_cur.split()[pos])):\n", " return None\n", "\n", " embedding_masked_word = model_fasttext[x_cur.split()[pos]]\n", "\n", " x_masked = (\" \".join(x_cur.split()[:pos]) + \" [MASK] \" + \" \".join(x_cur.split()[pos + 1:])).strip()\n", " unmasked_seq = unmasker_MARBERT(x_masked)[:20]\n", "\n", " max_sim = -1\n", " best_perturb_dict = {}\n", " for seq in unmasked_seq:\n", " if frasa_stemmer.stem(seq['token_str']) in frasa_stemmer.stem(x_cur.split()[pos]):\n", " continue\n", " if seq['token_str'] in punctuations_list or pos >= len(seq[\"sequence\"].split()):\n", " continue\n", " embedding_masked_word_new = model_fasttext[seq['token_str']]\n", " if np.sum(embedding_masked_word) == 0 or np.sum(embedding_masked_word_new) == 0:\n", " continue\n", " if verbose: print(\"New word: \", seq['token_str'])\n", " sim = 1 - distance.cosine(embedding_masked_word, embedding_masked_word_new)\n", " if sim > max_sim:\n", " max_sim = sim\n", " best_perturb_dict[\"sim\"] = sim\n", " best_perturb_dict[\"Masked word\"] = x_cur.split()[pos]\n", " best_perturb_dict[\"New word\"] = seq['token_str']\n", " best_perturb_dict[\"New seq\"] = x_cur.replace(x_cur.split()[pos], seq['token_str'])\n", "\n", " return best_perturb_dict.get(\"New seq\", None)\n", "\n", "# Process tweets and perturb\n", "perturb_counter = 0\n", "for tweet_ix, tweet in enumerate(correct_class_tweets):\n", " print(\"Tweet index: \", tweet_ix)\n", "\n", " x_adv = light_preprocess(tweet)\n", " x_len = len(x_adv.split())\n", " orig_class = np.argmax(classsify_tweets([x_adv]), axis=1)[0]\n", " orig_label = id2label[orig_class]\n", " print(f\"Original tweet: {x_adv} : Original label: {orig_label}.\")\n", " splits = len(x_adv.split())\n", " perturbed_flag = False\n", " for split_ix in range(splits):\n", " perturbed = select_best_replacement(split_ix, x_adv)\n", " if perturbed:\n", " new_class = np.argmax(classsify_tweets([perturbed]), axis=1)[0]\n", " if orig_class != new_class:\n", " print(f\"Perturbed tweet: {perturbed} : New label: {id2label[new_class]}.\")\n", " print(10 * \"==\")\n", " if not perturbed_flag:\n", " perturb_counter += 1\n", " perturbed_flag = True\n", " if not perturbed_flag:\n", " print(10 * \"==\")\n", "print(f\"Successful perturbation {perturb_counter} out of {len(correct_class_tweets)}.\")\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sy4En-7yGWGN", "outputId": "97f075cb-e545-4606-be7e-e0ad4c08d27d" }, "execution_count": 75, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Successful perturbation 0 out of 0.\n" ] } ] }, { "cell_type": "code", "source": [ "off_tweets_count = sum(test_data['label'] == 1 )\n", "print(f\"Number of offensive tweets in the dataset: 
{off_tweets_count}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ttonOFkbDdI6", "outputId": "0d329caa-4ce7-49ca-d335-e50ece1e88da" }, "execution_count": 74, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Number of offensive tweets in the dataset: 887\n" ] } ] }, { "cell_type": "code", "source": [ "test_data['label'] = test_data['label'].astype(int)\n" ], "metadata": { "id": "M19XRkw7I2mk" }, "execution_count": 73, "outputs": [] } ], "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "nbformat": 4, "nbformat_minor": 0 }
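A final note on the true-positive selection loop above: `label` is a string produced by `id2label` ("OFF" / "NOT_OFF") while `cls` is an integer id, so `label == cls and label == 1` can never be true. `correct_class_tweets` therefore stays empty, and the perturbation run reports "Successful perturbation 0 out of 0" even though the test set contains 887 offensive tweets. A corrected sketch that compares integer ids on both sides:

```python
import numpy as np

# Collect test tweets that are truly offensive (id 1) AND predicted
# offensive, comparing integer class ids on both sides of the test.
correct_class_tweets = []
correct_class = []
for i in range(len(test_data)):
    txt = test_data['Text'].astype('U')[i]
    cls = test_data['label'][i]                            # 0 or 1
    pred_id = np.argmax(classsify_tweets([txt]), axis=1)[0]
    if pred_id == cls == 1:                                # true positive on OFF
        correct_class_tweets.append(txt)
        correct_class.append(cls)
```

With this fix the perturbation loop iterates over the model's true-positive offensive tweets instead of an empty list.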