File size: 58,796 Bytes
0734f8e
1
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.8.16","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install evaluate seaborn datasets transformers[sentencepiece] huggingface -q","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:10:38.588160Z","iopub.execute_input":"2023-05-12T08:10:38.588801Z","iopub.status.idle":"2023-05-12T08:11:05.038848Z","shell.execute_reply.started":"2023-05-12T08:10:38.588769Z","shell.execute_reply":"2023-05-12T08:11:05.037913Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\ntensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\ntensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.1.2 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"code","source":"import warnings\nwarnings.filterwarnings('ignore')\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n#import seaborn as sns\nimport os\nimport tensorflow as tf\nfrom tensorflow.keras.layers import Input, Dense\nfrom tensorflow.keras.models import Model\nfrom tensorflow.data import Dataset\n\nimport transformers\nfrom transformers import AutoTokenizer, TFAutoModel","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:11:23.828906Z","iopub.execute_input":"2023-05-12T08:11:23.829355Z","iopub.status.idle":"2023-05-12T08:12:04.965072Z","shell.execute_reply.started":"2023-05-12T08:11:23.829319Z","shell.execute_reply":"2023-05-12T08:12:04.964003Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"D0512 08:11:57.679357494      14 config.cc:119]                        gRPC EXPERIMENT tcp_frame_size_tuning               OFF (default:OFF)\nD0512 08:11:57.679397945      14 config.cc:119]                        gRPC EXPERIMENT tcp_rcv_lowat                       OFF (default:OFF)\nD0512 08:11:57.679401929      14 config.cc:119]                        gRPC EXPERIMENT peer_state_based_framing            OFF (default:OFF)\nD0512 08:11:57.679404612      14 config.cc:119]                        gRPC EXPERIMENT flow_control_fixes                  ON  (default:ON)\nD0512 08:11:57.679406720      14 config.cc:119]                        gRPC EXPERIMENT memory_pressure_controller          OFF (default:OFF)\nD0512 08:11:57.679409002      14 config.cc:119]                        gRPC EXPERIMENT unconstrained_max_quota_buffer_size OFF (default:OFF)\nD0512 08:11:57.679411626      14 config.cc:119]                        gRPC EXPERIMENT new_hpack_huffman_decoder           ON  (default:ON)\nD0512 08:11:57.679414267      14 config.cc:119]                        gRPC EXPERIMENT event_engine_client                 OFF (default:OFF)\nD0512 08:11:57.679416463      14 config.cc:119]                        gRPC EXPERIMENT monitoring_experiment               ON  (default:ON)\nD0512 08:11:57.679418575      14 config.cc:119]                        gRPC EXPERIMENT promise_based_client_call           OFF (default:OFF)\nD0512 08:11:57.679420670      14 config.cc:119]                        gRPC EXPERIMENT free_large_allocator                OFF (default:OFF)\nD0512 08:11:57.679422786      14 config.cc:119]                        gRPC EXPERIMENT promise_based_server_call           OFF (default:OFF)\nD0512 08:11:57.679424925      14 config.cc:119]                        gRPC EXPERIMENT transport_supplies_client_latency   OFF (default:OFF)\nD0512 08:11:57.679427123      14 config.cc:119]                        gRPC EXPERIMENT event_engine_listener               OFF (default:OFF)\nI0512 08:11:57.679611260      14 ev_epoll1_linux.cc:122]               grpc epoll fd: 62\nD0512 08:11:57.685069810      14 ev_posix.cc:144]                      Using polling engine: epoll1\nD0512 08:11:57.685091110      14 dns_resolver_ares.cc:822]             Using ares dns resolver\nD0512 08:11:57.685503222      14 lb_policy_registry.cc:46]             registering LB policy factory for \"priority_experimental\"\nD0512 08:11:57.685513372      14 lb_policy_registry.cc:46]             registering LB policy factory for \"outlier_detection_experimental\"\nD0512 08:11:57.685516328      14 lb_policy_registry.cc:46]             registering LB policy factory for \"weighted_target_experimental\"\nD0512 08:11:57.685518925      14 lb_policy_registry.cc:46]             registering LB policy factory for \"pick_first\"\nD0512 08:11:57.685521601      14 lb_policy_registry.cc:46]             registering LB policy factory for \"round_robin\"\nD0512 08:11:57.685524245      14 lb_policy_registry.cc:46]             registering LB policy factory for \"weighted_round_robin_experimental\"\nD0512 08:11:57.685530262      14 lb_policy_registry.cc:46]             registering LB policy factory for \"ring_hash_experimental\"\nD0512 08:11:57.685544918      14 lb_policy_registry.cc:46]             registering LB policy factory for \"grpclb\"\nD0512 08:11:57.685567780      14 lb_policy_registry.cc:46]             registering LB policy factory for \"rls_experimental\"\nD0512 08:11:57.685580119      14 lb_policy_registry.cc:46]             registering LB policy factory for \"xds_cluster_manager_experimental\"\nD0512 08:11:57.685583175      14 lb_policy_registry.cc:46]             registering LB policy factory for \"xds_cluster_impl_experimental\"\nD0512 08:11:57.685586100      14 lb_policy_registry.cc:46]             registering LB policy factory for \"cds_experimental\"\nD0512 08:11:57.685591323      14 lb_policy_registry.cc:46]             registering LB policy factory for \"xds_cluster_resolver_experimental\"\nD0512 08:11:57.685594369      14 lb_policy_registry.cc:46]             registering LB policy factory for \"xds_override_host_experimental\"\nD0512 08:11:57.685597356      14 lb_policy_registry.cc:46]             registering LB policy factory for \"xds_wrr_locality_experimental\"\nD0512 08:11:57.685601004      14 certificate_provider_registry.cc:35]  registering certificate provider factory for \"file_watcher\"\nI0512 08:11:57.687522778      14 socket_utils_common_posix.cc:408]     Disabling AF_INET6 sockets because ::1 is not available.\nI0512 08:11:57.713894001     315 socket_utils_common_posix.cc:337]     TCP_USER_TIMEOUT is available. TCP_USER_TIMEOUT will be used thereafter\nE0512 08:11:57.732068033     315 oauth2_credentials.cc:236]            oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. is_balancer=0: Domain name not found {grpc_status:2, created_time:\"2023-05-12T08:11:57.73203804+00:00\"}\n","output_type":"stream"}]},{"cell_type":"code","source":"## Setting up TPUs\ntpu = tf.distribute.cluster_resolver.TPUClusterResolver()\nprint('Running on TPU ', tpu.master())\ntf.config.experimental_connect_to_cluster(tpu)\ntf.tpu.experimental.initialize_tpu_system(tpu)\ntpu_strategy = tf.distribute.TPUStrategy(tpu)\nprint(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:04.967007Z","iopub.execute_input":"2023-05-12T08:12:04.967631Z","iopub.status.idle":"2023-05-12T08:12:15.185956Z","shell.execute_reply.started":"2023-05-12T08:12:04.967600Z","shell.execute_reply":"2023-05-12T08:12:15.184914Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Running on TPU  \nINFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\nINFO:tensorflow:Initializing the TPU system: local\nINFO:tensorflow:Finished initializing TPU system.\nINFO:tensorflow:Found TPU system:\nINFO:tensorflow:*** Num TPU Cores: 8\nINFO:tensorflow:*** Num TPU Workers: 1\nINFO:tensorflow:*** Num TPU Cores Per Worker: 8\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\nREPLICAS:  8\n","output_type":"stream"}]},{"cell_type":"code","source":"class Config:\n    EPOCHS = 3  #2\n    MODEL = \"xlm-roberta-large\"\n    BUFFER_SIZE = 2048\n    BATCH_SIZE = 16*tpu_strategy.num_replicas_in_sync\n    MAX_LEN = 192\n    LEARNING_RATE = 1e-5\n    WEIGHT_DECAY = 1e-6\n    RANDOM_STATE = 42","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:15.187164Z","iopub.execute_input":"2023-05-12T08:12:15.187478Z","iopub.status.idle":"2023-05-12T08:12:15.193012Z","shell.execute_reply.started":"2023-05-12T08:12:15.187450Z","shell.execute_reply":"2023-05-12T08:12:15.192060Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"input_dir = \"/kaggle/input/jigsaw-multilingual-toxic-comment-classification\"\ntrain1 = pd.read_csv(os.path.join(input_dir, \"jigsaw-toxic-comment-train.csv\"))\ntrain2 = pd.read_csv(os.path.join(input_dir, \"jigsaw-unintended-bias-train.csv\"))\nval = pd.read_csv(os.path.join(input_dir,\"validation.csv\"))\ntest = pd.read_csv(os.path.join(input_dir,\"test.csv\"))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:15.195496Z","iopub.execute_input":"2023-05-12T08:12:15.195907Z","iopub.status.idle":"2023-05-12T08:12:42.439698Z","shell.execute_reply.started":"2023-05-12T08:12:15.195884Z","shell.execute_reply":"2023-05-12T08:12:42.438494Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"train1.head()","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2023-05-12T08:12:42.440974Z","iopub.execute_input":"2023-05-12T08:12:42.441315Z","iopub.status.idle":"2023-05-12T08:12:42.461414Z","shell.execute_reply.started":"2023-05-12T08:12:42.441285Z","shell.execute_reply":"2023-05-12T08:12:42.460195Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":"                 id                                       comment_text  toxic   \n0  0000997932d777bf  Explanation\\nWhy the edits made under my usern...      0  \\\n1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   \n2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   \n3  0001b41b1c6bb37e  \"\\nMore\\nI can't make any real suggestions on ...      0   \n4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   \n\n   severe_toxic  obscene  threat  insult  identity_hate  \n0             0        0       0       0              0  \n1             0        0       0       0              0  \n2             0        0       0       0              0  \n3             0        0       0       0              0  \n4             0        0       0       0              0  ","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>comment_text</th>\n      <th>toxic</th>\n      <th>severe_toxic</th>\n      <th>obscene</th>\n      <th>threat</th>\n      <th>insult</th>\n      <th>identity_hate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0000997932d777bf</td>\n      <td>Explanation\\nWhy the edits made under my usern...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>000103f0d9cfb60f</td>\n      <td>D'aww! He matches this background colour I'm s...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>000113f07ec002fd</td>\n      <td>Hey man, I'm really not trying to edit war. It...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0001b41b1c6bb37e</td>\n      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0001d958c54c6e35</td>\n      <td>You, sir, are my hero. Any chance you remember...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train2.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.462658Z","iopub.execute_input":"2023-05-12T08:12:42.462965Z","iopub.status.idle":"2023-05-12T08:12:42.487874Z","shell.execute_reply.started":"2023-05-12T08:12:42.462921Z","shell.execute_reply":"2023-05-12T08:12:42.487081Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"      id                                       comment_text     toxic   \n0  59848  This is so cool. It's like, 'would you want yo...  0.000000  \\\n1  59849  Thank you!! This would make my life a lot less...  0.000000   \n2  59852  This is such an urgent design problem; kudos t...  0.000000   \n3  59855  Is this something I'll be able to install on m...  0.000000   \n4  59856               haha you guys are a bunch of losers.  0.893617   \n\n   severe_toxicity  obscene  identity_attack   insult  threat  asian  atheist   \n0         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN  \\\n1         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   \n2         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   \n3         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   \n4         0.021277      0.0         0.021277  0.87234     0.0    0.0      0.0   \n\n   ...  article_id    rating  funny  wow  sad  likes  disagree   \n0  ...        2006  rejected      0    0    0      0         0  \\\n1  ...        2006  rejected      0    0    0      0         0   \n2  ...        2006  rejected      0    0    0      0         0   \n3  ...        2006  rejected      0    0    0      0         0   \n4  ...        2006  rejected      0    0    0      1         0   \n\n   sexual_explicit  identity_annotator_count  toxicity_annotator_count  \n0              0.0                         0                         4  \n1              0.0                         0                         4  \n2              0.0                         0                         4  \n3              0.0                         0                         4  \n4              0.0                         4                        47  \n\n[5 rows x 45 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>comment_text</th>\n      <th>toxic</th>\n      <th>severe_toxicity</th>\n      <th>obscene</th>\n      <th>identity_attack</th>\n      <th>insult</th>\n      <th>threat</th>\n      <th>asian</th>\n      <th>atheist</th>\n      <th>...</th>\n      <th>article_id</th>\n      <th>rating</th>\n      <th>funny</th>\n      <th>wow</th>\n      <th>sad</th>\n      <th>likes</th>\n      <th>disagree</th>\n      <th>sexual_explicit</th>\n      <th>identity_annotator_count</th>\n      <th>toxicity_annotator_count</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>59848</td>\n      <td>This is so cool. It's like, 'would you want yo...</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.00000</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>2006</td>\n      <td>rejected</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>0</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>59849</td>\n      <td>Thank you!! This would make my life a lot less...</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.00000</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>2006</td>\n      <td>rejected</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>0</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>59852</td>\n      <td>This is such an urgent design problem; kudos t...</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.00000</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>2006</td>\n      <td>rejected</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>0</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>59855</td>\n      <td>Is this something I'll be able to install on m...</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.0</td>\n      <td>0.000000</td>\n      <td>0.00000</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>2006</td>\n      <td>rejected</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>0</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>59856</td>\n      <td>haha you guys are a bunch of losers.</td>\n      <td>0.893617</td>\n      <td>0.021277</td>\n      <td>0.0</td>\n      <td>0.021277</td>\n      <td>0.87234</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>2006</td>\n      <td>rejected</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>1</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>4</td>\n      <td>47</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 45 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.488844Z","iopub.execute_input":"2023-05-12T08:12:42.489110Z","iopub.status.idle":"2023-05-12T08:12:42.504161Z","shell.execute_reply.started":"2023-05-12T08:12:42.489087Z","shell.execute_reply":"2023-05-12T08:12:42.503316Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"   id                                       comment_text lang  toxic\n0   0  Este usuario ni siquiera llega al rango de    ...   es      0\n1   1  Il testo di questa voce pare esser scopiazzato...   it      0\n2   2  Vale. Sólo expongo mi pasado. Todo tiempo pasa...   es      1\n3   3  Bu maddenin alt başlığı olarak  uluslararası i...   tr      0\n4   4  Belçika nın şehirlerinin yanında ilçe ve belde...   tr      0","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>comment_text</th>\n      <th>lang</th>\n      <th>toxic</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>Este usuario ni siquiera llega al rango de    ...</td>\n      <td>es</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>Il testo di questa voce pare esser scopiazzato...</td>\n      <td>it</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>Vale. Sólo expongo mi pasado. Todo tiempo pasa...</td>\n      <td>es</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>Bu maddenin alt başlığı olarak  uluslararası i...</td>\n      <td>tr</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>Belçika nın şehirlerinin yanında ilçe ve belde...</td>\n      <td>tr</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.505217Z","iopub.execute_input":"2023-05-12T08:12:42.505504Z","iopub.status.idle":"2023-05-12T08:12:42.518947Z","shell.execute_reply.started":"2023-05-12T08:12:42.505480Z","shell.execute_reply":"2023-05-12T08:12:42.518159Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"   id                                            content lang\n0   0  Doctor Who adlı viki başlığına 12. doctor olar...   tr\n1   1   Вполне возможно, но я пока не вижу необходимо...   ru\n2   2  Quindi tu sei uno di quelli   conservativi  , ...   it\n3   3  Malesef gerçekleştirilmedi ancak şöyle bir şey...   tr\n4   4  :Resim:Seldabagcan.jpg resminde kaynak sorunu ...   tr","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>content</th>\n      <th>lang</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>Doctor Who adlı viki başlığına 12. doctor olar...</td>\n      <td>tr</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>Вполне возможно, но я пока не вижу необходимо...</td>\n      <td>ru</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>Quindi tu sei uno di quelli   conservativi  , ...</td>\n      <td>it</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>Malesef gerçekleştirilmedi ancak şöyle bir şey...</td>\n      <td>tr</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>:Resim:Seldabagcan.jpg resminde kaynak sorunu ...</td>\n      <td>tr</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train1[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.519956Z","iopub.execute_input":"2023-05-12T08:12:42.520259Z","iopub.status.idle":"2023-05-12T08:12:42.534176Z","shell.execute_reply.started":"2023-05-12T08:12:42.520234Z","shell.execute_reply":"2023-05-12T08:12:42.533484Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"toxic\n0    202165\n1     21384\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train2[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.537691Z","iopub.execute_input":"2023-05-12T08:12:42.537946Z","iopub.status.idle":"2023-05-12T08:12:42.574451Z","shell.execute_reply.started":"2023-05-12T08:12:42.537925Z","shell.execute_reply":"2023-05-12T08:12:42.573541Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"toxic\n0.000000    1333035\n0.166667     138501\n0.200000     113271\n0.300000      62195\n0.400000      52703\n             ...   \n0.037609          1\n0.971193          1\n0.988430          1\n0.008309          1\n0.967316          1\nName: count, Length: 3853, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.575526Z","iopub.execute_input":"2023-05-12T08:12:42.575805Z","iopub.status.idle":"2023-05-12T08:12:42.584242Z","shell.execute_reply.started":"2023-05-12T08:12:42.575781Z","shell.execute_reply":"2023-05-12T08:12:42.583468Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"toxic\n0    6770\n1    1230\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.585256Z","iopub.execute_input":"2023-05-12T08:12:42.585532Z","iopub.status.idle":"2023-05-12T08:12:42.596996Z","shell.execute_reply.started":"2023-05-12T08:12:42.585510Z","shell.execute_reply":"2023-05-12T08:12:42.596246Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"lang\ntr    3000\nes    2500\nit    2500\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"test[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.597893Z","iopub.execute_input":"2023-05-12T08:12:42.598151Z","iopub.status.idle":"2023-05-12T08:12:42.612575Z","shell.execute_reply.started":"2023-05-12T08:12:42.598129Z","shell.execute_reply":"2023-05-12T08:12:42.611766Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"lang\ntr    14000\npt    11012\nru    10948\nfr    10920\nit     8494\nes     8438\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train1 = train1.iloc[:,1:3]\ntrain2 = train2.iloc[:,1:3]\nval = val.loc[:,[\"comment_text\",\"toxic\"]]\ntest.rename(columns={\"content\":\"comment_text\"}, inplace=True)\nsub = test[['id']]\ntrain2.toxic = (train2.toxic>0.5).astype(int)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.613596Z","iopub.execute_input":"2023-05-12T08:12:42.613863Z","iopub.status.idle":"2023-05-12T08:12:42.689129Z","shell.execute_reply.started":"2023-05-12T08:12:42.613841Z","shell.execute_reply":"2023-05-12T08:12:42.687961Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"train2.toxic.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.690307Z","iopub.execute_input":"2023-05-12T08:12:42.690632Z","iopub.status.idle":"2023-05-12T08:12:42.714449Z","shell.execute_reply.started":"2023-05-12T08:12:42.690603Z","shell.execute_reply":"2023-05-12T08:12:42.712954Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"toxic\n0    1789968\n1     112226\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train = pd.concat([train1,\n                   train2.query(\"toxic==1\"),\n                   train2.query(\"toxic==0\").sample(n=200000, random_state=Config.RANDOM_STATE)])\ntrain.dropna(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.717790Z","iopub.execute_input":"2023-05-12T08:12:42.719209Z","iopub.status.idle":"2023-05-12T08:12:43.083471Z","shell.execute_reply.started":"2023-05-12T08:12:42.719178Z","shell.execute_reply":"2023-05-12T08:12:43.082364Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"train.shape","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.084697Z","iopub.execute_input":"2023-05-12T08:12:43.084992Z","iopub.status.idle":"2023-05-12T08:12:43.091149Z","shell.execute_reply.started":"2023-05-12T08:12:43.084966Z","shell.execute_reply":"2023-05-12T08:12:43.090173Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"(535775, 2)"},"metadata":{}}]},{"cell_type":"code","source":"train.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.092288Z","iopub.execute_input":"2023-05-12T08:12:43.092591Z","iopub.status.idle":"2023-05-12T08:12:43.108428Z","shell.execute_reply.started":"2023-05-12T08:12:43.092565Z","shell.execute_reply":"2023-05-12T08:12:43.107388Z"},"trusted":true},"execution_count":19,"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":"                                        comment_text  toxic\n0  Explanation\\nWhy the edits made under my usern...      0\n1  D'aww! He matches this background colour I'm s...      0\n2  Hey man, I'm really not trying to edit war. It...      0\n3  \"\\nMore\\nI can't make any real suggestions on ...      0\n4  You, sir, are my hero. Any chance you remember...      0","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>comment_text</th>\n      <th>toxic</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Explanation\\nWhy the edits made under my usern...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>D'aww! He matches this background colour I'm s...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Hey man, I'm really not trying to edit war. It...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>You, sir, are my hero. Any chance you remember...</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.109560Z","iopub.execute_input":"2023-05-12T08:12:43.109882Z","iopub.status.idle":"2023-05-12T08:12:43.124398Z","shell.execute_reply.started":"2023-05-12T08:12:43.109856Z","shell.execute_reply":"2023-05-12T08:12:43.123602Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":"                                        comment_text  toxic\n0  Este usuario ni siquiera llega al rango de    ...      0\n1  Il testo di questa voce pare esser scopiazzato...      0\n2  Vale. Sólo expongo mi pasado. Todo tiempo pasa...      1\n3  Bu maddenin alt başlığı olarak  uluslararası i...      0\n4  Belçika nın şehirlerinin yanında ilçe ve belde...      0","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>comment_text</th>\n      <th>toxic</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Este usuario ni siquiera llega al rango de    ...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Il testo di questa voce pare esser scopiazzato...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Vale. Sólo expongo mi pasado. Todo tiempo pasa...</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Bu maddenin alt başlığı olarak  uluslararası i...</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Belçika nın şehirlerinin yanında ilçe ve belde...</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.125574Z","iopub.execute_input":"2023-05-12T08:12:43.125871Z","iopub.status.idle":"2023-05-12T08:12:43.140740Z","shell.execute_reply.started":"2023-05-12T08:12:43.125845Z","shell.execute_reply":"2023-05-12T08:12:43.139834Z"},"trusted":true},"execution_count":21,"outputs":[{"execution_count":21,"output_type":"execute_result","data":{"text/plain":"   id                                       comment_text lang\n0   0  Doctor Who adlı viki başlığına 12. doctor olar...   tr\n1   1   Вполне возможно, но я пока не вижу необходимо...   ru\n2   2  Quindi tu sei uno di quelli   conservativi  , ...   it\n3   3  Malesef gerçekleştirilmedi ancak şöyle bir şey...   tr\n4   4  :Resim:Seldabagcan.jpg resminde kaynak sorunu ...   tr","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>comment_text</th>\n      <th>lang</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>Doctor Who adlı viki başlığına 12. doctor olar...</td>\n      <td>tr</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>Вполне возможно, но я пока не вижу необходимо...</td>\n      <td>ru</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>Quindi tu sei uno di quelli   conservativi  , ...</td>\n      <td>it</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>Malesef gerçekleştirilmedi ancak şöyle bir şey...</td>\n      <td>tr</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>:Resim:Seldabagcan.jpg resminde kaynak sorunu ...</td>\n      <td>tr</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.rename(columns={\"content\":\"comment_text\"}, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.141938Z","iopub.execute_input":"2023-05-12T08:12:43.142286Z","iopub.status.idle":"2023-05-12T08:12:43.152137Z","shell.execute_reply.started":"2023-05-12T08:12:43.142257Z","shell.execute_reply":"2023-05-12T08:12:43.151267Z"},"trusted":true},"execution_count":22,"outputs":[]},{"cell_type":"code","source":"import re\ntrain['comment_text'] = train['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\nval['comment_text'] = val['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())\ntest['comment_text'] = test['comment_text'].apply(lambda x: re.sub('\\n',' ',x).strip())","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:43.153204Z","iopub.execute_input":"2023-05-12T08:12:43.153504Z","iopub.status.idle":"2023-05-12T08:12:44.735211Z","shell.execute_reply.started":"2023-05-12T08:12:43.153479Z","shell.execute_reply":"2023-05-12T08:12:44.734019Z"},"trusted":true},"execution_count":23,"outputs":[]},{"cell_type":"code","source":"seq_len = [len(i.split()) for i in train.comment_text]\n\npd.Series(seq_len).hist(bins = 30)\nprint(np.mean(seq_len))\nprint(max(seq_len))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Tokenization","metadata":{}},{"cell_type":"code","source":"tokenizer = AutoTokenizer.from_pretrained(Config.MODEL)","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-05-12T08:12:44.736464Z","iopub.execute_input":"2023-05-12T08:12:44.736759Z","iopub.status.idle":"2023-05-12T08:12:46.680516Z","shell.execute_reply.started":"2023-05-12T08:12:44.736733Z","shell.execute_reply":"2023-05-12T08:12:46.679299Z"},"trusted":true},"execution_count":24,"outputs":[{"name":"stderr","text":"Downloading (…)lve/main/config.json: 100%|██████████| 616/616 [00:00<00:00, 133kB/s]\nDownloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 61.2MB/s]\nDownloading (…)/main/tokenizer.json: 100%|██████████| 9.10M/9.10M [00:00<00:00, 38.3MB/s]\n","output_type":"stream"}]},{"cell_type":"code","source":"def encoder(text_data, tokenizer=tokenizer, max_len=Config.MAX_LEN):\n    return tokenizer(text_data.comment_text.values.tolist(), \n                     max_length=max_len, \n                     truncation=True, \n                     padding=\"max_length\",\n                     add_special_tokens=True,\n                     return_tensors=\"tf\",\n                     return_token_type_ids = False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:46.681935Z","iopub.execute_input":"2023-05-12T08:12:46.682277Z","iopub.status.idle":"2023-05-12T08:12:46.688026Z","shell.execute_reply.started":"2023-05-12T08:12:46.682252Z","shell.execute_reply":"2023-05-12T08:12:46.687060Z"},"trusted":true},"execution_count":25,"outputs":[]},{"cell_type":"code","source":"encoded_train = encoder(text_data = train)\nencoded_val = encoder(text_data = val)\nencoded_test = encoder(text_data = test)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:46.689142Z","iopub.execute_input":"2023-05-12T08:12:46.689525Z","iopub.status.idle":"2023-05-12T08:13:40.477757Z","shell.execute_reply.started":"2023-05-12T08:12:46.689501Z","shell.execute_reply":"2023-05-12T08:13:40.476350Z"},"trusted":true},"execution_count":26,"outputs":[]},{"cell_type":"code","source":"train_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_train), train[\"toxic\"]))\n                .repeat()\n                .shuffle(Config.BUFFER_SIZE)\n                .batch(Config.BATCH_SIZE)\n                .prefetch(tf.data.AUTOTUNE))\n\nval_dataset = (tf.data.Dataset.from_tensor_slices((dict(encoded_val), val[\"toxic\"]))\n                .batch(Config.BATCH_SIZE)\n                .prefetch(tf.data.AUTOTUNE))\n\ntest_dataset = tf.data.Dataset.from_tensor_slices(dict(encoded_test)).batch(Config.BATCH_SIZE)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:13:40.479182Z","iopub.execute_input":"2023-05-12T08:13:40.479716Z","iopub.status.idle":"2023-05-12T08:13:40.514773Z","shell.execute_reply.started":"2023-05-12T08:13:40.479687Z","shell.execute_reply":"2023-05-12T08:13:40.513645Z"},"trusted":true},"execution_count":27,"outputs":[]},{"cell_type":"code","source":"def model_builder(transformers_layers, max_len=Config.MAX_LEN):\n    input_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_ids\")\n    masks = Input(shape=(max_len,), dtype=tf.int32, name=\"attention_mask\")\n    \n    roberta_layers = transformers_layers.roberta(input_ids, attention_mask=masks)[1]\n    intermediate = Dense(1024, activation='relu')(roberta_layers)\n    output = Dense(1, activation=\"sigmoid\", name=\"output_layer\")(intermediate)\n    model = Model(inputs=[input_ids, masks], outputs=output)\n    model.layers[2].trainable = True\n    \n    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=Config.LEARNING_RATE, weight_decay=Config.WEIGHT_DECAY),\n                  loss=tf.keras.losses.BinaryCrossentropy(),\n                  metrics=tf.keras.metrics.AUC())\n    return model","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:19:18.861686Z","iopub.execute_input":"2023-05-12T08:19:18.862636Z","iopub.status.idle":"2023-05-12T08:19:18.872779Z","shell.execute_reply.started":"2023-05-12T08:19:18.862595Z","shell.execute_reply":"2023-05-12T08:19:18.871516Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"with tpu_strategy.scope():\n    transformers_layers = TFAutoModel.from_pretrained(Config.MODEL)\n    model = model_builder(transformers_layers=transformers_layers)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:19:23.540792Z","iopub.execute_input":"2023-05-12T08:19:23.541710Z","iopub.status.idle":"2023-05-12T08:19:58.819514Z","shell.execute_reply.started":"2023-05-12T08:19:23.541670Z","shell.execute_reply":"2023-05-12T08:19:58.818311Z"},"trusted":true},"execution_count":37,"outputs":[{"name":"stderr","text":"All model checkpoint layers were used when initializing TFXLMRobertaModel.\n\nAll the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-large.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.\n","output_type":"stream"}]},{"cell_type":"code","source":"model.summary()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:19:58.821255Z","iopub.execute_input":"2023-05-12T08:19:58.821564Z","iopub.status.idle":"2023-05-12T08:19:58.877105Z","shell.execute_reply.started":"2023-05-12T08:19:58.821537Z","shell.execute_reply":"2023-05-12T08:19:58.876009Z"},"trusted":true},"execution_count":38,"outputs":[{"name":"stdout","text":"Model: \"model_2\"\n__________________________________________________________________________________________________\n Layer (type)                   Output Shape         Param #     Connected to                     \n==================================================================================================\n input_ids (InputLayer)         [(None, 192)]        0           []                               \n                                                                                                  \n attention_mask (InputLayer)    [(None, 192)]        0           []                               \n                                                                                                  \n roberta (TFXLMRobertaMainLayer  TFBaseModelOutputWi  559890432  ['input_ids[0][0]',              \n )                              thPoolingAndCrossAt               'attention_mask[0][0]']         \n                                tentions(last_hidde                                               \n                                n_state=(None, 192,                                               \n                                 1024),                                                           \n                                 pooler_output=(Non                                               \n                                e, 1024),                                                         \n                                 past_key_values=No                                               \n                                ne, hidden_states=N                                               \n                                one, attentions=Non                                               \n                                e, cross_attentions                                               \n                                =None)                                                            \n                                                                                                  \n dense_4 (Dense)                (None, 1024)         1049600     ['roberta[0][1]']                \n                                                                                                  \n output_layer (Dense)           (None, 1)            1025        ['dense_4[0][0]']                \n                                                                                                  \n==================================================================================================\nTotal params: 560,941,057\nTrainable params: 560,941,057\nNon-trainable params: 0\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"train_steps_per_epoch = train.shape[0]//Config.BATCH_SIZE\n\nhistory=model.fit(train_dataset,\n                  validation_data=val_dataset,\n                  steps_per_epoch=train_steps_per_epoch,\n                  epochs=Config.EPOCHS)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:20:30.201166Z","iopub.execute_input":"2023-05-12T08:20:30.201570Z","iopub.status.idle":"2023-05-12T09:40:52.828332Z","shell.execute_reply.started":"2023-05-12T08:20:30.201539Z","shell.execute_reply":"2023-05-12T09:40:52.826896Z"},"trusted":true},"execution_count":39,"outputs":[{"name":"stdout","text":"Epoch 1/3\n","output_type":"stream"},{"name":"stderr","text":"2023-05-12 08:21:52.144761: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_790/ReadVariableOp.\n2023-05-12 08:21:54.569388: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_790/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - ETA: 0s - loss: 0.0501 - auc_2: 0.9972","output_type":"stream"},{"name":"stderr","text":"2023-05-12 08:49:07.665397: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n2023-05-12 08:49:08.172000: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"4185/4185 [==============================] - 1747s 375ms/step - loss: 0.0501 - auc_2: 0.9972 - val_loss: 0.3338 - val_auc_2: 0.9137\nEpoch 2/3\n4185/4185 [==============================] - 1538s 367ms/step - loss: 0.0420 - auc_2: 0.9981 - val_loss: 0.2931 - val_auc_2: 0.9114\nEpoch 3/3\n4185/4185 [==============================] - 1537s 367ms/step - loss: 0.0369 - auc_2: 0.9985 - val_loss: 0.3070 - val_auc_2: 0.9039\n","output_type":"stream"}]},{"cell_type":"code","source":"model.evaluate(val_dataset)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"val_steps_per_epoch = val.shape[0]//Config.BATCH_SIZE\nval_history=model.fit(val_dataset.repeat(),\n                  steps_per_epoch=val_steps_per_epoch,\n                  epochs=2)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:43:06.760317Z","iopub.execute_input":"2023-05-12T09:43:06.760739Z","iopub.status.idle":"2023-05-12T09:43:52.591536Z","shell.execute_reply.started":"2023-05-12T09:43:06.760702Z","shell.execute_reply":"2023-05-12T09:43:52.590324Z"},"trusted":true},"execution_count":41,"outputs":[{"name":"stdout","text":"Epoch 1/2\n62/62 [==============================] - 23s 365ms/step - loss: 0.0899 - auc_2: 0.9893\nEpoch 2/2\n62/62 [==============================] - 23s 365ms/step - loss: 0.0800 - auc_2: 0.9916\n","output_type":"stream"}]},{"cell_type":"code","source":"preds = model.predict(test_dataset)\nsub['toxic'] = preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:47:56.071510Z","iopub.execute_input":"2023-05-12T09:47:56.072708Z","iopub.status.idle":"2023-05-12T09:49:15.802261Z","shell.execute_reply.started":"2023-05-12T09:47:56.072664Z","shell.execute_reply":"2023-05-12T09:49:15.800711Z"},"trusted":true},"execution_count":42,"outputs":[{"name":"stderr","text":"2023-05-12 09:48:05.583905: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.\n2023-05-12 09:48:05.992232: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.\n","output_type":"stream"},{"name":"stdout","text":"499/499 [==============================] - 79s 118ms/step\n","output_type":"stream"}]},{"cell_type":"code","source":"model.save(\"roberta-fine-tuned-2\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:49:24.580208Z","iopub.execute_input":"2023-05-12T09:49:24.580625Z","iopub.status.idle":"2023-05-12T09:50:44.681561Z","shell.execute_reply.started":"2023-05-12T09:49:24.580595Z","shell.execute_reply":"2023-05-12T09:50:44.680112Z"},"trusted":true},"execution_count":43,"outputs":[{"name":"stderr","text":"WARNING:absl:Found untraced functions such as _update_step_xla, encoder_layer_call_fn, encoder_layer_call_and_return_conditional_losses, pooler_layer_call_fn, pooler_layer_call_and_return_conditional_losses while saving (showing 5 of 829). These functions will not be directly callable after loading.\n","output_type":"stream"},{"name":"stdout","text":"INFO:tensorflow:Assets written to: roberta-fine-tuned-2/assets\n","output_type":"stream"},{"name":"stderr","text":"INFO:tensorflow:Assets written to: roberta-fine-tuned-2/assets\n","output_type":"stream"}]},{"cell_type":"code","source":"import shutil\nshutil.make_archive(\"roberta-fine-tuned-2\",\"zip\",'/kaggle/working/roberta-fine-tuned-2')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T09:53:15.505782Z","iopub.execute_input":"2023-05-12T09:53:15.506262Z","iopub.status.idle":"2023-05-12T10:00:10.288432Z","shell.execute_reply.started":"2023-05-12T09:53:15.506226Z","shell.execute_reply":"2023-05-12T10:00:10.287215Z"},"trusted":true},"execution_count":44,"outputs":[{"execution_count":44,"output_type":"execute_result","data":{"text/plain":"'/kaggle/working/roberta-fine-tuned-2.zip'"},"metadata":{}}]},{"cell_type":"code","source":"model.save(\"roberta-fine-tuned-2-best\", save_format='h5')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:06:24.426264Z","iopub.execute_input":"2023-05-12T10:06:24.426727Z","iopub.status.idle":"2023-05-12T10:06:40.506795Z","shell.execute_reply.started":"2023-05-12T10:06:24.426692Z","shell.execute_reply":"2023-05-12T10:06:40.505341Z"},"trusted":true},"execution_count":47,"outputs":[]},{"cell_type":"markdown","source":"### Pushing Model to Hugging Face","metadata":{}},{"cell_type":"code","source":"model = tf.keras.models.load_model('/kaggle/working/roberta-fine-tuned-2-best')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:07:36.737706Z","iopub.execute_input":"2023-05-12T10:07:36.738837Z","iopub.status.idle":"2023-05-12T10:07:59.902966Z","shell.execute_reply.started":"2023-05-12T10:07:36.738795Z","shell.execute_reply":"2023-05-12T10:07:59.901400Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"code","source":"\"\"\"%%capture\n!pip install git+https://github.com/huggingface/huggingface_hub.git@main\n!sudo apt -qq install git-lfs\n!git config --global credential.helper store\"\"\"","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"!huggingface-cli login --token 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","metadata":{"execution":{"iopub.status.busy":"2023-05-12T10:12:13.025974Z","iopub.execute_input":"2023-05-12T10:12:13.026917Z","iopub.status.idle":"2023-05-12T10:12:15.351277Z","shell.execute_reply.started":"2023-05-12T10:12:13.026877Z","shell.execute_reply":"2023-05-12T10:12:15.349659Z"},"trusted":true},"execution_count":55,"outputs":[{"name":"stdout","text":"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nToken will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.\nToken is valid.\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n","output_type":"stream"}]},{"cell_type":"code","source":"from huggingface_hub import push_to_hub_keras\npush_to_hub_keras(model, 'Multilingual-Toxic-Comment-Roberta-best')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from huggingface_hub import HfApi\napi = HfApi()\napi.upload_folder(\n    folder_path=\"/kaggle/working/\",\n    repo_id=\"shivansh-ka/Toxic-Comment-Classifier-Multi\",\n    repo_type=\"space\",\n)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Loading model from Hub","metadata":{}},{"cell_type":"code","source":"from huggingface_hub import from_pretrained_keras\nm = from_pretrained_keras('shivansh-ka/Multilingual-Toxic-Comment-Roberta')","metadata":{"execution":{"iopub.status.busy":"2023-05-12T06:59:23.928089Z","iopub.execute_input":"2023-05-12T06:59:23.928495Z","iopub.status.idle":"2023-05-12T06:59:56.375479Z","shell.execute_reply.started":"2023-05-12T06:59:23.928466Z","shell.execute_reply":"2023-05-12T06:59:56.374295Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n  warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\nconfig.json not found in HuggingFace Hub.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"84f3f3229b3e42668708162e27df3168"}},"metadata":{}}]},{"cell_type":"code","source":"preds = m.predict(test_dataset)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:06:50.246933Z","iopub.execute_input":"2023-05-12T07:06:50.247789Z","iopub.status.idle":"2023-05-12T07:29:11.940923Z","shell.execute_reply.started":"2023-05-12T07:06:50.247752Z","shell.execute_reply":"2023-05-12T07:29:11.939745Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"499/499 [==============================] - 1341s 3s/step\n","output_type":"stream"}]},{"cell_type":"code","source":"m.summary()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:31:58.337639Z","iopub.execute_input":"2023-05-12T07:31:58.338344Z","iopub.status.idle":"2023-05-12T07:31:58.425154Z","shell.execute_reply.started":"2023-05-12T07:31:58.338300Z","shell.execute_reply":"2023-05-12T07:31:58.424117Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type)                   Output Shape         Param #     Connected to                     \n==================================================================================================\n input_ids (InputLayer)         [(None, 192)]        0           []                               \n                                                                                                  \n attention_mask (InputLayer)    [(None, 192)]        0           []                               \n                                                                                                  \n roberta (Custom>TFXLMRobertaMa  {'pooler_output': (  559890432  ['input_ids[0][0]',              \n inLayer)                       None, 1024),                      'attention_mask[0][0]']         \n                                 'last_hidden_state                                               \n                                ': (None, 192, 1024                                               \n                                )}                                                                \n                                                                                                  \n dense (Dense)                  (None, 1024)         1049600     ['roberta[0][1]']                \n                                                                                                  \n output_layer (Dense)           (None, 1)            1025        ['dense[0][0]']                  \n                                                                                                  \n==================================================================================================\nTotal params: 560,941,057\nTrainable params: 560,941,057\nNon-trainable params: 0\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":"sub['toxic'] = preds\nsub.to_csv(\"submission.csv\",index=False)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T07:32:36.768119Z","iopub.execute_input":"2023-05-12T07:32:36.768542Z","iopub.status.idle":"2023-05-12T07:32:36.963761Z","shell.execute_reply.started":"2023-05-12T07:32:36.768512Z","shell.execute_reply":"2023-05-12T07:32:36.962584Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}