{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.8.16","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# %pip (not !pip) installs into the environment of the running kernel; the extras spec is quoted for the shell.\n# NOTE(review): `huggingface_hub` replaces the bare `huggingface` PyPI name, which is presumably not the intended Hub client - confirm.\n%pip install -q \"transformers[sentencepiece]\" huggingface_hub","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:47:51.413800Z","iopub.execute_input":"2023-05-12T11:47:51.414070Z","iopub.status.idle":"2023-05-12T11:48:18.602918Z","shell.execute_reply.started":"2023-05-12T11:47:51.414046Z","shell.execute_reply":"2023-05-12T11:48:18.601877Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\ntensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\ntensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.1.2 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0m","output_type":"stream"}]},{"cell_type":"code","source":"import warnings\nwarnings.filterwarnings('ignore')\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n#import seaborn as sns\nimport os\nimport tensorflow as tf\nfrom tensorflow.keras.layers import Input, Dense\nfrom tensorflow.keras.models import Model\nfrom tensorflow.data import Dataset\n\nimport transformers\nfrom transformers import AutoTokenizer, TFAutoModel","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:49:44.573340Z","iopub.execute_input":"2023-05-12T11:49:44.574079Z","iopub.status.idle":"2023-05-12T11:50:25.814142Z","shell.execute_reply.started":"2023-05-12T11:49:44.574046Z","shell.execute_reply":"2023-05-12T11:50:25.812749Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stderr","text":"D0512 11:50:18.442119879 14 config.cc:119] gRPC EXPERIMENT tcp_frame_size_tuning OFF (default:OFF)\nD0512 11:50:18.442154551 14 config.cc:119] gRPC EXPERIMENT tcp_rcv_lowat OFF (default:OFF)\nD0512 11:50:18.442158337 14 config.cc:119] gRPC EXPERIMENT peer_state_based_framing OFF (default:OFF)\nD0512 11:50:18.442160973 14 config.cc:119] gRPC EXPERIMENT flow_control_fixes ON (default:ON)\nD0512 11:50:18.442163321 14 config.cc:119] gRPC EXPERIMENT memory_pressure_controller OFF (default:OFF)\nD0512 11:50:18.442165969 14 config.cc:119] gRPC EXPERIMENT unconstrained_max_quota_buffer_size OFF (default:OFF)\nD0512 11:50:18.442168796 14 config.cc:119] gRPC EXPERIMENT new_hpack_huffman_decoder ON (default:ON)\nD0512 11:50:18.442171109 14 config.cc:119] gRPC EXPERIMENT event_engine_client OFF (default:OFF)\nD0512 11:50:18.442173402 14 
config.cc:119] gRPC EXPERIMENT monitoring_experiment ON (default:ON)\nD0512 11:50:18.442175638 14 config.cc:119] gRPC EXPERIMENT promise_based_client_call OFF (default:OFF)\nD0512 11:50:18.442177867 14 config.cc:119] gRPC EXPERIMENT free_large_allocator OFF (default:OFF)\nD0512 11:50:18.442181062 14 config.cc:119] gRPC EXPERIMENT promise_based_server_call OFF (default:OFF)\nD0512 11:50:18.442183630 14 config.cc:119] gRPC EXPERIMENT transport_supplies_client_latency OFF (default:OFF)\nD0512 11:50:18.442185959 14 config.cc:119] gRPC EXPERIMENT event_engine_listener OFF (default:OFF)\nI0512 11:50:18.442394344 14 ev_epoll1_linux.cc:122] grpc epoll fd: 62\nD0512 11:50:18.453257763 14 ev_posix.cc:144] Using polling engine: epoll1\nD0512 11:50:18.453301358 14 dns_resolver_ares.cc:822] Using ares dns resolver\nD0512 11:50:18.453762003 14 lb_policy_registry.cc:46] registering LB policy factory for \"priority_experimental\"\nD0512 11:50:18.453774538 14 lb_policy_registry.cc:46] registering LB policy factory for \"outlier_detection_experimental\"\nD0512 11:50:18.453779385 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_target_experimental\"\nD0512 11:50:18.453782660 14 lb_policy_registry.cc:46] registering LB policy factory for \"pick_first\"\nD0512 11:50:18.453786243 14 lb_policy_registry.cc:46] registering LB policy factory for \"round_robin\"\nD0512 11:50:18.453789942 14 lb_policy_registry.cc:46] registering LB policy factory for \"weighted_round_robin_experimental\"\nD0512 11:50:18.453797356 14 lb_policy_registry.cc:46] registering LB policy factory for \"ring_hash_experimental\"\nD0512 11:50:18.453818829 14 lb_policy_registry.cc:46] registering LB policy factory for \"grpclb\"\nD0512 11:50:18.453851056 14 lb_policy_registry.cc:46] registering LB policy factory for \"rls_experimental\"\nD0512 11:50:18.453873781 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_manager_experimental\"\nD0512 11:50:18.453877823 14 
lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_impl_experimental\"\nD0512 11:50:18.453881490 14 lb_policy_registry.cc:46] registering LB policy factory for \"cds_experimental\"\nD0512 11:50:18.453888362 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_cluster_resolver_experimental\"\nD0512 11:50:18.453892163 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_override_host_experimental\"\nD0512 11:50:18.453896027 14 lb_policy_registry.cc:46] registering LB policy factory for \"xds_wrr_locality_experimental\"\nD0512 11:50:18.453901564 14 certificate_provider_registry.cc:35] registering certificate provider factory for \"file_watcher\"\nI0512 11:50:18.456269287 14 socket_utils_common_posix.cc:408] Disabling AF_INET6 sockets because ::1 is not available.\nI0512 11:50:18.476859295 376 socket_utils_common_posix.cc:337] TCP_USER_TIMEOUT is available. TCP_USER_TIMEOUT will be used thereafter\nE0512 11:50:18.484409363 376 oauth2_credentials.cc:236] oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. 
is_balancer=0: Domain name not found {created_time:\"2023-05-12T11:50:18.484390999+00:00\", grpc_status:2}\n","output_type":"stream"}]},{"cell_type":"code","source":"## Setting up TPUs\n# Fall back to TensorFlow's default strategy when no TPU is attached, so the\n# notebook still runs end-to-end; `tpu_strategy` keeps its name for later cells.\ntry:\n    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\nexcept ValueError:  # no TPU could be resolved in this environment\n    tpu = None\n\nif tpu is not None:\n    print('Running on TPU ', tpu.master())\n    tf.config.experimental_connect_to_cluster(tpu)\n    tf.tpu.experimental.initialize_tpu_system(tpu)\n    tpu_strategy = tf.distribute.TPUStrategy(tpu)\nelse:\n    tpu_strategy = tf.distribute.get_strategy()\nprint(\"REPLICAS: \", tpu_strategy.num_replicas_in_sync)","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:50:25.816399Z","iopub.execute_input":"2023-05-12T11:50:25.817031Z","iopub.status.idle":"2023-05-12T11:50:35.943243Z","shell.execute_reply.started":"2023-05-12T11:50:25.816998Z","shell.execute_reply":"2023-05-12T11:50:35.942201Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Running on TPU \nINFO:tensorflow:Deallocate tpu buffers before initializing tpu system.\nINFO:tensorflow:Initializing the TPU system: local\nINFO:tensorflow:Finished initializing TPU system.\nINFO:tensorflow:Found TPU system:\nINFO:tensorflow:*** Num TPU Cores: 8\nINFO:tensorflow:*** Num TPU Workers: 1\nINFO:tensorflow:*** Num TPU Cores Per Worker: 8\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 
0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)\nINFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\nREPLICAS: 8\n","output_type":"stream"}]},{"cell_type":"code","source":"class Config:\n    \"\"\"Run-wide settings, read as `Config.<NAME>` by other cells.\"\"\"\n\n    MODEL = \"bert-base-multilingual-uncased\"\n    MAX_LEN = 192\n\n    BUFFER_SIZE = 2048\n    # 16 examples per replica, scaled by however many replicas the strategy reports.\n    BATCH_SIZE = tpu_strategy.num_replicas_in_sync * 16\n\n    EPOCHS = 3  # presumably 2 in an earlier run (original note: #2)\n    LEARNING_RATE = 1e-5\n    WEIGHT_DECAY = 1e-6\n\n    RANDOM_STATE = 42","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:50:35.944622Z","iopub.execute_input":"2023-05-12T11:50:35.944945Z","iopub.status.idle":"2023-05-12T11:50:35.950932Z","shell.execute_reply.started":"2023-05-12T11:50:35.944916Z","shell.execute_reply":"2023-05-12T11:50:35.949929Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"input_dir = \"/kaggle/input/jigsaw-multilingual-toxic-comment-classification\"\n\n\ndef _read_input_csv(filename):\n    \"\"\"Read one CSV file located directly under `input_dir`.\"\"\"\n    return pd.read_csv(os.path.join(input_dir, filename))\n\n\ntrain1 = _read_input_csv(\"jigsaw-toxic-comment-train.csv\")\ntrain2 = _read_input_csv(\"jigsaw-unintended-bias-train.csv\")\nval = _read_input_csv(\"validation.csv\")\ntest = 
_read_input_csv(\"test.csv\")","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:50:35.953167Z","iopub.execute_input":"2023-05-12T11:50:35.953494Z","iopub.status.idle":"2023-05-12T11:51:03.310955Z","shell.execute_reply.started":"2023-05-12T11:50:35.953467Z","shell.execute_reply":"2023-05-12T11:51:03.309809Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"train1.head()","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2023-05-12T08:12:42.440974Z","iopub.execute_input":"2023-05-12T08:12:42.441315Z","iopub.status.idle":"2023-05-12T08:12:42.461414Z","shell.execute_reply.started":"2023-05-12T08:12:42.441285Z","shell.execute_reply":"2023-05-12T08:12:42.460195Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \n0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \\\n1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 ","text/html":"
\n\n
\n \n
\n
\n
id
\n
comment_text
\n
toxic
\n
severe_toxic
\n
obscene
\n
threat
\n
insult
\n
identity_hate
\n
\n \n \n
\n
0
\n
0000997932d777bf
\n
Explanation\\nWhy the edits made under my usern...
\n
0
\n
0
\n
0
\n
0
\n
0
\n
0
\n
\n
\n
1
\n
000103f0d9cfb60f
\n
D'aww! He matches this background colour I'm s...
\n
0
\n
0
\n
0
\n
0
\n
0
\n
0
\n
\n
\n
2
\n
000113f07ec002fd
\n
Hey man, I'm really not trying to edit war. It...
\n
0
\n
0
\n
0
\n
0
\n
0
\n
0
\n
\n
\n
3
\n
0001b41b1c6bb37e
\n
\"\\nMore\\nI can't make any real suggestions on ...
\n
0
\n
0
\n
0
\n
0
\n
0
\n
0
\n
\n
\n
4
\n
0001d958c54c6e35
\n
You, sir, are my hero. Any chance you remember...
\n
0
\n
0
\n
0
\n
0
\n
0
\n
0
\n
\n \n
\n
"},"metadata":{}}]},{"cell_type":"code","source":"train2.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.462658Z","iopub.execute_input":"2023-05-12T08:12:42.462965Z","iopub.status.idle":"2023-05-12T08:12:42.487874Z","shell.execute_reply.started":"2023-05-12T08:12:42.462921Z","shell.execute_reply":"2023-05-12T08:12:42.487081Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \n0 59848 This is so cool. It's like, 'would you want yo... 0.000000 \\\n1 59849 Thank you!! This would make my life a lot less... 0.000000 \n2 59852 This is such an urgent design problem; kudos t... 0.000000 \n3 59855 Is this something I'll be able to install on m... 0.000000 \n4 59856 haha you guys are a bunch of losers. 0.893617 \n\n severe_toxicity obscene identity_attack insult threat asian atheist \n0 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \\\n1 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n2 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n3 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n4 0.021277 0.0 0.021277 0.87234 0.0 0.0 0.0 \n\n ... article_id rating funny wow sad likes disagree \n0 ... 2006 rejected 0 0 0 0 0 \\\n1 ... 2006 rejected 0 0 0 0 0 \n2 ... 2006 rejected 0 0 0 0 0 \n3 ... 2006 rejected 0 0 0 0 0 \n4 ... 2006 rejected 0 0 0 1 0 \n\n sexual_explicit identity_annotator_count toxicity_annotator_count \n0 0.0 0 4 \n1 0.0 0 4 \n2 0.0 0 4 \n3 0.0 0 4 \n4 0.0 4 47 \n\n[5 rows x 45 columns]","text/html":"
\n\n
\n \n
\n
\n
id
\n
comment_text
\n
toxic
\n
severe_toxicity
\n
obscene
\n
identity_attack
\n
insult
\n
threat
\n
asian
\n
atheist
\n
...
\n
article_id
\n
rating
\n
funny
\n
wow
\n
sad
\n
likes
\n
disagree
\n
sexual_explicit
\n
identity_annotator_count
\n
toxicity_annotator_count
\n
\n \n \n
\n
0
\n
59848
\n
This is so cool. It's like, 'would you want yo...
\n
0.000000
\n
0.000000
\n
0.0
\n
0.000000
\n
0.00000
\n
0.0
\n
NaN
\n
NaN
\n
...
\n
2006
\n
rejected
\n
0
\n
0
\n
0
\n
0
\n
0
\n
0.0
\n
0
\n
4
\n
\n
\n
1
\n
59849
\n
Thank you!! This would make my life a lot less...
\n
0.000000
\n
0.000000
\n
0.0
\n
0.000000
\n
0.00000
\n
0.0
\n
NaN
\n
NaN
\n
...
\n
2006
\n
rejected
\n
0
\n
0
\n
0
\n
0
\n
0
\n
0.0
\n
0
\n
4
\n
\n
\n
2
\n
59852
\n
This is such an urgent design problem; kudos t...
\n
0.000000
\n
0.000000
\n
0.0
\n
0.000000
\n
0.00000
\n
0.0
\n
NaN
\n
NaN
\n
...
\n
2006
\n
rejected
\n
0
\n
0
\n
0
\n
0
\n
0
\n
0.0
\n
0
\n
4
\n
\n
\n
3
\n
59855
\n
Is this something I'll be able to install on m...
\n
0.000000
\n
0.000000
\n
0.0
\n
0.000000
\n
0.00000
\n
0.0
\n
NaN
\n
NaN
\n
...
\n
2006
\n
rejected
\n
0
\n
0
\n
0
\n
0
\n
0
\n
0.0
\n
0
\n
4
\n
\n
\n
4
\n
59856
\n
haha you guys are a bunch of losers.
\n
0.893617
\n
0.021277
\n
0.0
\n
0.021277
\n
0.87234
\n
0.0
\n
0.0
\n
0.0
\n
...
\n
2006
\n
rejected
\n
0
\n
0
\n
0
\n
1
\n
0
\n
0.0
\n
4
\n
47
\n
\n \n
\n
5 rows × 45 columns
\n
"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.488844Z","iopub.execute_input":"2023-05-12T08:12:42.489110Z","iopub.status.idle":"2023-05-12T08:12:42.504161Z","shell.execute_reply.started":"2023-05-12T08:12:42.489087Z","shell.execute_reply":"2023-05-12T08:12:42.503316Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":" id comment_text lang toxic\n0 0 Este usuario ni siquiera llega al rango de ... es 0\n1 1 Il testo di questa voce pare esser scopiazzato... it 0\n2 2 Vale. Sólo expongo mi pasado. Todo tiempo pasa... es 1\n3 3 Bu maddenin alt başlığı olarak uluslararası i... tr 0\n4 4 Belçika nın şehirlerinin yanında ilçe ve belde... tr 0","text/html":"
\n\n
\n \n
\n
\n
id
\n
comment_text
\n
lang
\n
toxic
\n
\n \n \n
\n
0
\n
0
\n
Este usuario ni siquiera llega al rango de ...
\n
es
\n
0
\n
\n
\n
1
\n
1
\n
Il testo di questa voce pare esser scopiazzato...
\n
it
\n
0
\n
\n
\n
2
\n
2
\n
Vale. Sólo expongo mi pasado. Todo tiempo pasa...
\n
es
\n
1
\n
\n
\n
3
\n
3
\n
Bu maddenin alt başlığı olarak uluslararası i...
\n
tr
\n
0
\n
\n
\n
4
\n
4
\n
Belçika nın şehirlerinin yanında ilçe ve belde...
\n
tr
\n
0
\n
\n \n
\n
"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.505217Z","iopub.execute_input":"2023-05-12T08:12:42.505504Z","iopub.status.idle":"2023-05-12T08:12:42.518947Z","shell.execute_reply.started":"2023-05-12T08:12:42.505480Z","shell.execute_reply":"2023-05-12T08:12:42.518159Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":" id content lang\n0 0 Doctor Who adlı viki başlığına 12. doctor olar... tr\n1 1 Вполне возможно, но я пока не вижу необходимо... ru\n2 2 Quindi tu sei uno di quelli conservativi , ... it\n3 3 Malesef gerçekleştirilmedi ancak şöyle bir şey... tr\n4 4 :Resim:Seldabagcan.jpg resminde kaynak sorunu ... tr","text/html":"
\n\n
\n \n
\n
\n
id
\n
content
\n
lang
\n
\n \n \n
\n
0
\n
0
\n
Doctor Who adlı viki başlığına 12. doctor olar...
\n
tr
\n
\n
\n
1
\n
1
\n
Вполне возможно, но я пока не вижу необходимо...
\n
ru
\n
\n
\n
2
\n
2
\n
Quindi tu sei uno di quelli conservativi , ...
\n
it
\n
\n
\n
3
\n
3
\n
Malesef gerçekleştirilmedi ancak şöyle bir şey...
\n
tr
\n
\n
\n
4
\n
4
\n
:Resim:Seldabagcan.jpg resminde kaynak sorunu ...
\n
tr
\n
\n \n
\n
"},"metadata":{}}]},{"cell_type":"code","source":"train1[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.519956Z","iopub.execute_input":"2023-05-12T08:12:42.520259Z","iopub.status.idle":"2023-05-12T08:12:42.534176Z","shell.execute_reply.started":"2023-05-12T08:12:42.520234Z","shell.execute_reply":"2023-05-12T08:12:42.533484Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"toxic\n0 202165\n1 21384\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"train2[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.537691Z","iopub.execute_input":"2023-05-12T08:12:42.537946Z","iopub.status.idle":"2023-05-12T08:12:42.574451Z","shell.execute_reply.started":"2023-05-12T08:12:42.537925Z","shell.execute_reply":"2023-05-12T08:12:42.573541Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"toxic\n0.000000 1333035\n0.166667 138501\n0.200000 113271\n0.300000 62195\n0.400000 52703\n ... 
\n0.037609 1\n0.971193 1\n0.988430 1\n0.008309 1\n0.967316 1\nName: count, Length: 3853, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"toxic\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.575526Z","iopub.execute_input":"2023-05-12T08:12:42.575805Z","iopub.status.idle":"2023-05-12T08:12:42.584242Z","shell.execute_reply.started":"2023-05-12T08:12:42.575781Z","shell.execute_reply":"2023-05-12T08:12:42.583468Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"toxic\n0 6770\n1 1230\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"val[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.585256Z","iopub.execute_input":"2023-05-12T08:12:42.585532Z","iopub.status.idle":"2023-05-12T08:12:42.596996Z","shell.execute_reply.started":"2023-05-12T08:12:42.585510Z","shell.execute_reply":"2023-05-12T08:12:42.596246Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"lang\ntr 3000\nes 2500\nit 2500\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"test[\"lang\"].value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T08:12:42.597893Z","iopub.execute_input":"2023-05-12T08:12:42.598151Z","iopub.status.idle":"2023-05-12T08:12:42.612575Z","shell.execute_reply.started":"2023-05-12T08:12:42.598129Z","shell.execute_reply":"2023-05-12T08:12:42.611766Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"lang\ntr 14000\npt 11012\nru 10948\nfr 10920\nit 8494\nes 8438\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"# Select columns by name, not position: re-running `iloc[:, 1:3]` on the already\n# trimmed two-column frames would silently drop `comment_text`.\ntext_label_cols = [\"comment_text\", \"toxic\"]\ntrain1 = train1.loc[:, text_label_cols]\ntrain2 = train2.loc[:, text_label_cols]\nval = val.loc[:, text_label_cols]\n# Non-mutating rename; columns that are already renamed are left untouched on a re-run.\ntest = 
test.rename(columns={\"content\": \"comment_text\"})\nsub = test[['id']]\n# Binarise the fractional toxicity score; a no-op when the labels are already 0/1.\ntrain2 = train2.assign(toxic=(train2[\"toxic\"] > 0.5).astype(int))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.312161Z","iopub.execute_input":"2023-05-12T11:51:03.312475Z","iopub.status.idle":"2023-05-12T11:51:03.453706Z","shell.execute_reply.started":"2023-05-12T11:51:03.312450Z","shell.execute_reply":"2023-05-12T11:51:03.452741Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"train2.toxic.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.454767Z","iopub.execute_input":"2023-05-12T11:51:03.455331Z","iopub.status.idle":"2023-05-12T11:51:03.481303Z","shell.execute_reply.started":"2023-05-12T11:51:03.455304Z","shell.execute_reply":"2023-05-12T11:51:03.480425Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"toxic\n0 1789968\n1 112226\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"# All of train1, every toxic row of train2, and a seeded sample of 200000\n# non-toxic train2 rows; rows with missing values are dropped in the same chain.\ntrain = pd.concat([\n    train1,\n    train2.query(\"toxic==1\"),\n    train2.query(\"toxic==0\").sample(n=200000, random_state=Config.RANDOM_STATE),\n]).dropna()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.482311Z","iopub.execute_input":"2023-05-12T11:51:03.482966Z","iopub.status.idle":"2023-05-12T11:51:03.827807Z","shell.execute_reply.started":"2023-05-12T11:51:03.482940Z","shell.execute_reply":"2023-05-12T11:51:03.826717Z"},"trusted":true},"execution_count":8,"outputs":[]},{"cell_type":"code","source":"train.shape","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.829068Z","iopub.execute_input":"2023-05-12T11:51:03.829375Z","iopub.status.idle":"2023-05-12T11:51:03.834997Z","shell.execute_reply.started":"2023-05-12T11:51:03.829350Z","shell.execute_reply":"2023-05-12T11:51:03.834118Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"(535775, 
2)"},"metadata":{}}]},{"cell_type":"code","source":"train.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.835982Z","iopub.execute_input":"2023-05-12T11:51:03.836269Z","iopub.status.idle":"2023-05-12T11:51:03.854775Z","shell.execute_reply.started":"2023-05-12T11:51:03.836227Z","shell.execute_reply":"2023-05-12T11:51:03.853871Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":" comment_text toxic\n0 Explanation\\nWhy the edits made under my usern... 0\n1 D'aww! He matches this background colour I'm s... 0\n2 Hey man, I'm really not trying to edit war. It... 0\n3 \"\\nMore\\nI can't make any real suggestions on ... 0\n4 You, sir, are my hero. Any chance you remember... 0","text/html":"
\n\n
\n \n
\n
\n
comment_text
\n
toxic
\n
\n \n \n
\n
0
\n
Explanation\\nWhy the edits made under my usern...
\n
0
\n
\n
\n
1
\n
D'aww! He matches this background colour I'm s...
\n
0
\n
\n
\n
2
\n
Hey man, I'm really not trying to edit war. It...
\n
0
\n
\n
\n
3
\n
\"\\nMore\\nI can't make any real suggestions on ...
\n
0
\n
\n
\n
4
\n
You, sir, are my hero. Any chance you remember...
\n
0
\n
\n \n
\n
"},"metadata":{}}]},{"cell_type":"code","source":"val.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.858708Z","iopub.execute_input":"2023-05-12T11:51:03.859118Z","iopub.status.idle":"2023-05-12T11:51:03.866689Z","shell.execute_reply.started":"2023-05-12T11:51:03.859092Z","shell.execute_reply":"2023-05-12T11:51:03.865871Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":" comment_text toxic\n0 Este usuario ni siquiera llega al rango de ... 0\n1 Il testo di questa voce pare esser scopiazzato... 0\n2 Vale. Sólo expongo mi pasado. Todo tiempo pasa... 1\n3 Bu maddenin alt başlığı olarak uluslararası i... 0\n4 Belçika nın şehirlerinin yanında ilçe ve belde... 0","text/html":"
\n\n
\n \n
\n
\n
comment_text
\n
toxic
\n
\n \n \n
\n
0
\n
Este usuario ni siquiera llega al rango de ...
\n
0
\n
\n
\n
1
\n
Il testo di questa voce pare esser scopiazzato...
\n
0
\n
\n
\n
2
\n
Vale. Sólo expongo mi pasado. Todo tiempo pasa...
\n
1
\n
\n
\n
3
\n
Bu maddenin alt başlığı olarak uluslararası i...
\n
0
\n
\n
\n
4
\n
Belçika nın şehirlerinin yanında ilçe ve belde...
\n
0
\n
\n \n
\n
"},"metadata":{}}]},{"cell_type":"code","source":"test.head()","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.867828Z","iopub.execute_input":"2023-05-12T11:51:03.868255Z","iopub.status.idle":"2023-05-12T11:51:03.881894Z","shell.execute_reply.started":"2023-05-12T11:51:03.868213Z","shell.execute_reply":"2023-05-12T11:51:03.881141Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":" id comment_text lang\n0 0 Doctor Who adlı viki başlığına 12. doctor olar... tr\n1 1 Вполне возможно, но я пока не вижу необходимо... ru\n2 2 Quindi tu sei uno di quelli conservativi , ... it\n3 3 Malesef gerçekleştirilmedi ancak şöyle bir şey... tr\n4 4 :Resim:Seldabagcan.jpg resminde kaynak sorunu ... tr","text/html":"
\n\n
\n \n
\n
\n
id
\n
comment_text
\n
lang
\n
\n \n \n
\n
0
\n
0
\n
Doctor Who adlı viki başlığına 12. doctor olar...
\n
tr
\n
\n
\n
1
\n
1
\n
Вполне возможно, но я пока не вижу необходимо...
\n
ru
\n
\n
\n
2
\n
2
\n
Quindi tu sei uno di quelli conservativi , ...
\n
it
\n
\n
\n
3
\n
3
\n
Malesef gerçekleştirilmedi ancak şöyle bir şey...
\n
tr
\n
\n
\n
4
\n
4
\n
:Resim:Seldabagcan.jpg resminde kaynak sorunu ...
\n
tr
\n
\n \n
\n
"},"metadata":{}}]},{"cell_type":"code","source":"# Redundant with the earlier preparation cell; kept non-mutating so a re-run is a harmless no-op.\ntest = test.rename(columns={\"content\":\"comment_text\"})","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.882947Z","iopub.execute_input":"2023-05-12T11:51:03.883338Z","iopub.status.idle":"2023-05-12T11:51:03.892723Z","shell.execute_reply.started":"2023-05-12T11:51:03.883311Z","shell.execute_reply":"2023-05-12T11:51:03.891955Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"code","source":"import re  # NOTE(review): no longer used in this cell; kept in case a later cell relies on it\n\n\ndef flatten_newlines(text: pd.Series) -> pd.Series:\n    \"\"\"Return `text` with every newline replaced by a space and outer whitespace stripped.\n\n    Uses the vectorised `.str` accessor, so missing values pass through as NaN\n    instead of raising inside `re.sub`.\n    \"\"\"\n    return text.str.replace(\"\\n\", \" \", regex=False).str.strip()\n\n\ntrain['comment_text'] = flatten_newlines(train['comment_text'])\nval['comment_text'] = flatten_newlines(val['comment_text'])\ntest['comment_text'] = flatten_newlines(test['comment_text'])","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:03.893692Z","iopub.execute_input":"2023-05-12T11:51:03.894038Z","iopub.status.idle":"2023-05-12T11:51:05.368808Z","shell.execute_reply.started":"2023-05-12T11:51:03.894014Z","shell.execute_reply":"2023-05-12T11:51:05.367736Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"seq_len = [len(i.split()) for i in train.comment_text]\n\npd.Series(seq_len).hist(bins = 30)\nprint(np.mean(seq_len))\nprint(max(seq_len))","metadata":{"execution":{"iopub.status.busy":"2023-05-12T11:51:05.369914Z","iopub.execute_input":"2023-05-12T11:51:05.370196Z","iopub.status.idle":"2023-05-12T11:51:08.102915Z","shell.execute_reply.started":"2023-05-12T11:51:05.370173Z","shell.execute_reply":"2023-05-12T11:51:08.101871Z"},"trusted":true},"execution_count":15,"outputs":[{"name":"stdout","text":"56.28243572395129\n2321\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"