mario committed on
Commit c68f5ab
1 Parent(s): d81e7c2

Added deps

notebooks/HMM and CRF.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/Results LSTM.ipynb ADDED
@@ -0,0 +1,461 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Toxic Spans Detection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "\n",
+ "import spacy\n",
+ "import ast\n",
+ "from termcolor import colored\n",
+ "\n",
+ "from tqdm import tqdm\n",
+ "import gdown\n",
+ "\n",
+ "from utils.processing import get_index_toxic_words, color_toxic_words, f1\n",
+ "from utils.lstm import spacy_tokenizer, get_vocab\n",
+ "\n",
+ "sns.set_style('darkgrid')\n",
+ "dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plotting with LaTeX sometimes causes trouble; in that case, comment out these two lines\n",
+ "plt.rc('text', usetex=True)\n",
+ "plt.rc('font', family='serif')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Results\n",
+ "\n",
+ "Of the different preprocessing strategies, the one with the highest F1 score on `test` is marking posts with [ ] as completely toxic (**best-model-try2.pt**, *train*=0.6498, *test*=0.6526), so we will use that one."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train = pd.read_csv('../data/tsd_train.csv', converters={'spans':ast.literal_eval})\n",
+ "test = pd.read_csv('../data/tsd_trial.csv', converters={'spans':ast.literal_eval})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Our embeddings\n",
+ "vocab = get_vocab(train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class LSTMTagger(nn.Module):\n",
+ "\n",
+ "    def __init__(self, embedding_dim, stacked_layers, dropout_p, weight, hidden_dim, vocab_size):\n",
+ "        super(LSTMTagger, self).__init__()\n",
+ "        self.hidden_dim = hidden_dim  # Hidden state dimension in each direction of the LSTM\n",
+ "        self.stacked_layers = stacked_layers  # How many layers in the LSTM\n",
+ "\n",
+ "        self.word_embeddings = nn.Embedding.from_pretrained(weight)\n",
+ "        self.lstm = nn.LSTM(embedding_dim,\n",
+ "                            hidden_dim,\n",
+ "                            num_layers=stacked_layers,\n",
+ "                            dropout=dropout_p,\n",
+ "                            bidirectional=True)\n",
+ "\n",
+ "        # Linear layers\n",
+ "        self.fc1 = nn.Linear(hidden_dim*2, 1)  # Twice the size of hidden_dim because the LSTM is bidirectional\n",
+ "\n",
+ "    def forward(self, sentence):\n",
+ "        embeds = self.word_embeddings(sentence)\n",
+ "        output, _ = self.lstm(embeds.view(len(sentence), 1, -1))\n",
+ "        x = torch.sigmoid(self.fc1(output.view(len(sentence), -1)))\n",
+ "        return x"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def prepare_sequence(seq):\n",
+ "    idxs = vocab.lookup_indices(seq)  # If a token is not in the vocab it is set to 0\n",
+ "    return torch.tensor(idxs, dtype=torch.long, device=dev)\n",
+ "\n",
+ "def prepare_sequence_tags(seq):\n",
+ "    tag_to_ix = {\"non_toxic\": 0, \"toxic\": 1}\n",
+ "    idxs = [tag_to_ix[s] for s in seq]\n",
+ "    return torch.tensor(idxs, dtype=torch.long, device=dev)\n",
+ "\n",
+ "def tagger_LSTM(text, threshold=0.5):\n",
+ "    \"\"\"\n",
+ "    Tags the text with the model we trained.\n",
+ "    \"\"\"\n",
+ "    ix_to_tag = {0: 'non_toxic', 1: 'toxic'}\n",
+ "    words = spacy_tokenizer(text.lower())  # Lowercasing seems to work better\n",
+ "\n",
+ "    with torch.no_grad():\n",
+ "        inputs = prepare_sequence(words)\n",
+ "        tag_scores = model(inputs)\n",
+ "\n",
+ "    tags = [1 if x > threshold else 0 for x in tag_scores]\n",
+ "    tagged_sentence = list(zip(words, tags))\n",
+ "\n",
+ "    return tagged_sentence"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The model file is somewhat heavy for GitHub, so we host it on a Drive and download it from there."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = 'https://drive.google.com/uc?id=1KO-QXUBfwzjauWLhiVi9StD3y0GtiBbj'\n",
+ "output = 'models/best-model.pt'\n",
+ "\n",
+ "gdown.download(url, output, quiet=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LSTMTagger(\n",
+ "  (word_embeddings): Embedding(19611, 200)\n",
+ "  (lstm): LSTM(200, 600, num_layers=6, dropout=0.2, bidirectional=True)\n",
+ "  (fc1): Linear(in_features=1200, out_features=1, bias=True)\n",
+ ")"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model = torch.load(output)\n",
+ "model.to(torch.device(dev))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\u001b[1m\u001b[36m Pred: \u001b[0mB\u001b[0me\u001b[0mc\u001b[0ma\u001b[0mu\u001b[0ms\u001b[0me\u001b[0m \u001b[0mh\u001b[0me\u001b[0m'\u001b[0ms\u001b[0m \u001b[0ma\u001b[0m \u001b[0m\u001b[41mm\u001b[0m\u001b[41mo\u001b[0m\u001b[41mr\u001b[0m\u001b[41mo\u001b[0m\u001b[41mn\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0ma\u001b[0m \u001b[0m\u001b[41mb\u001b[0m\u001b[41mi\u001b[0m\u001b[41mg\u001b[0m\u001b[41mo\u001b[0m\u001b[41mt\u001b[0m.\u001b[0m \u001b[0mI\u001b[0mt\u001b[0m'\u001b[0ms\u001b[0m \u001b[0mn\u001b[0mo\u001b[0mt\u001b[0m \u001b[0ma\u001b[0mn\u001b[0my\u001b[0m \u001b[0mm\u001b[0mo\u001b[0mr\u001b[0me\u001b[0m \u001b[0mc\u001b[0mo\u001b[0mm\u001b[0mp\u001b[0ml\u001b[0mi\u001b[0mc\u001b[0ma\u001b[0mt\u001b[0me\u001b[0md\u001b[0m \u001b[0mt\u001b[0mh\u001b[0ma\u001b[0mn\u001b[0m \u001b[0mt\u001b[0mh\u001b[0ma\u001b[0mt\u001b[0m.\u001b[0m\n",
+ "0\u001b[1m\u001b[33m Gold: \u001b[0mB\u001b[0me\u001b[0mc\u001b[0ma\u001b[0mu\u001b[0ms\u001b[0me\u001b[0m \u001b[0mh\u001b[0me\u001b[0m'\u001b[0ms\u001b[0m \u001b[0ma\u001b[0m \u001b[0m\u001b[41mm\u001b[0m\u001b[41mo\u001b[0m\u001b[41mr\u001b[0m\u001b[41mo\u001b[0m\u001b[41mn\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0ma\u001b[0m \u001b[0m\u001b[41mb\u001b[0m\u001b[41mi\u001b[0m\u001b[41mg\u001b[0m\u001b[41mo\u001b[0m\u001b[41mt\u001b[0m.\u001b[0m \u001b[0mI\u001b[0mt\u001b[0m'\u001b[0ms\u001b[0m \u001b[0mn\u001b[0mo\u001b[0mt\u001b[0m \u001b[0ma\u001b[0mn\u001b[0my\u001b[0m \u001b[0mm\u001b[0mo\u001b[0mr\u001b[0me\u001b[0m \u001b[0mc\u001b[0mo\u001b[0mm\u001b[0mp\u001b[0ml\u001b[0mi\u001b[0mc\u001b[0ma\u001b[0mt\u001b[0me\u001b[0md\u001b[0m \u001b[0mt\u001b[0mh\u001b[0ma\u001b[0mn\u001b[0m \u001b[0mt\u001b[0mh\u001b[0ma\u001b[0mt\u001b[0m.\u001b[0m\n",
+ "\n",
+ "\n",
+ "1\u001b[1m\u001b[36m Pred: \u001b[0mH\u001b[0mo\u001b[0mw\u001b[0m \u001b[0ma\u001b[0mb\u001b[0mo\u001b[0mu\u001b[0mt\u001b[0m \u001b[0mw\u001b[0me\u001b[0m \u001b[0ms\u001b[0mt\u001b[0mo\u001b[0mp\u001b[0m \u001b[0mp\u001b[0mr\u001b[0mo\u001b[0mt\u001b[0me\u001b[0mc\u001b[0mt\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0m\u001b[41mi\u001b[0m\u001b[41md\u001b[0m\u001b[41mi\u001b[0m\u001b[41mo\u001b[0m\u001b[41mt\u001b[0m\u001b[41ms\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0ml\u001b[0me\u001b[0mt\u001b[0m \u001b[0mn\u001b[0ma\u001b[0mt\u001b[0mu\u001b[0mr\u001b[0me\u001b[0m \u001b[0ma\u001b[0md\u001b[0md\u001b[0m \u001b[0ms\u001b[0mo\u001b[0mm\u001b[0me\u001b[0m \u001b[0mb\u001b[0ml\u001b[0me\u001b[0ma\u001b[0mc\u001b[0mh\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mg\u001b[0me\u001b[0mn\u001b[0me\u001b[0m \u001b[0mp\u001b[0mo\u001b[0mo\u001b[0ml\u001b[0m.\u001b[0m \u001b[0mW\u001b[0me\u001b[0m \u001b[0mc\u001b[0ma\u001b[0mn\u001b[0m \u001b[0ma\u001b[0ml\u001b[0mw\u001b[0ma\u001b[0my\u001b[0ms\u001b[0m \u001b[0ms\u001b[0mu\u001b[0mb\u001b[0mm\u001b[0mi\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0mi\u001b[0mr\u001b[0m \u001b[0mn\u001b[0ma\u001b[0mm\u001b[0me\u001b[0ms\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mD\u001b[0ma\u001b[0mr\u001b[0mw\u001b[0mi\u001b[0mn\u001b[0m \u001b[0ma\u001b[0mw\u001b[0ma\u001b[0mr\u001b[0md\u001b[0ms\u001b[0m.\u001b[0m\n",
+ "1\u001b[1m\u001b[33m Gold: \u001b[0mH\u001b[0mo\u001b[0mw\u001b[0m \u001b[0ma\u001b[0mb\u001b[0mo\u001b[0mu\u001b[0mt\u001b[0m \u001b[0mw\u001b[0me\u001b[0m \u001b[0ms\u001b[0mt\u001b[0mo\u001b[0mp\u001b[0m \u001b[0mp\u001b[0mr\u001b[0mo\u001b[0mt\u001b[0me\u001b[0mc\u001b[0mt\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0m\u001b[41mi\u001b[0m\u001b[41md\u001b[0m\u001b[41mi\u001b[0m\u001b[41mo\u001b[0m\u001b[41mt\u001b[0m\u001b[41ms\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0ml\u001b[0me\u001b[0mt\u001b[0m \u001b[0mn\u001b[0ma\u001b[0mt\u001b[0mu\u001b[0mr\u001b[0me\u001b[0m \u001b[0ma\u001b[0md\u001b[0md\u001b[0m \u001b[0ms\u001b[0mo\u001b[0mm\u001b[0me\u001b[0m \u001b[0mb\u001b[0ml\u001b[0me\u001b[0ma\u001b[0mc\u001b[0mh\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mg\u001b[0me\u001b[0mn\u001b[0me\u001b[0m \u001b[0mp\u001b[0mo\u001b[0mo\u001b[0ml\u001b[0m.\u001b[0m \u001b[0mW\u001b[0me\u001b[0m \u001b[0mc\u001b[0ma\u001b[0mn\u001b[0m \u001b[0ma\u001b[0ml\u001b[0mw\u001b[0ma\u001b[0my\u001b[0ms\u001b[0m \u001b[0ms\u001b[0mu\u001b[0mb\u001b[0mm\u001b[0mi\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0mi\u001b[0mr\u001b[0m \u001b[0mn\u001b[0ma\u001b[0mm\u001b[0me\u001b[0ms\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mD\u001b[0ma\u001b[0mr\u001b[0mw\u001b[0mi\u001b[0mn\u001b[0m \u001b[0ma\u001b[0mw\u001b[0ma\u001b[0mr\u001b[0md\u001b[0ms\u001b[0m.\u001b[0m\n",
+ "\n",
+ "\n",
+ "2\u001b[1m\u001b[36m Pred: \u001b[0mI\u001b[0mf\u001b[0m \u001b[0mp\u001b[0me\u001b[0mo\u001b[0mp\u001b[0ml\u001b[0me\u001b[0m \u001b[0m \u001b[0mw\u001b[0me\u001b[0mr\u001b[0me\u001b[0m \u001b[0m \u001b[0ms\u001b[0mm\u001b[0ma\u001b[0mr\u001b[0mt\u001b[0m,\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0my\u001b[0m \u001b[0mw\u001b[0mo\u001b[0mu\u001b[0ml\u001b[0md\u001b[0m \u001b[0m \u001b[0mB\u001b[0mo\u001b[0my\u001b[0mc\u001b[0mo\u001b[0mt\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mh\u001b[0mi\u001b[0ms\u001b[0m \u001b[0m \u001b[0mi\u001b[0mn\u001b[0me\u001b[0mp\u001b[0mt\u001b[0m \u001b[0m \u001b[0ma\u001b[0mi\u001b[0mr\u001b[0ml\u001b[0mi\u001b[0mn\u001b[0me\u001b[0m,\u001b[0m \u001b[0m \u001b[0mb\u001b[0mu\u001b[0mt\u001b[0m \u001b[0m \u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0my\u001b[0m \u001b[0m \u001b[0ma\u001b[0mr\u001b[0me\u001b[0m \u001b[0m \u001b[0mn\u001b[0mo\u001b[0mt\u001b[0m \u001b[0m \u001b[0ms\u001b[0mm\u001b[0ma\u001b[0mr\u001b[0mt\u001b[0m,\u001b[0m \u001b[0m \u001b[0ms\u001b[0mo\u001b[0m \u001b[0m \u001b[0m \u001b[0mr\u001b[0mo\u001b[0mg\u001b[0mu\u001b[0me\u001b[0m \u001b[0m \u001b[0mb\u001b[0mu\u001b[0ms\u001b[0mi\u001b[0mn\u001b[0me\u001b[0ms\u001b[0ms\u001b[0me\u001b[0ms\u001b[0m \u001b[0m \u001b[0ml\u001b[0mi\u001b[0mk\u001b[0me\u001b[0m \u001b[0m \u001b[0mt\u001b[0mh\u001b[0mi\u001b[0ms\u001b[0m \u001b[0m \u001b[0mo\u001b[0mn\u001b[0me\u001b[0m,\u001b[0m \u001b[0m \u001b[0m \u001b[0ms\u001b[0mt\u001b[0mi\u001b[0ml\u001b[0ml\u001b[0m \u001b[0mt\u001b[0mh\u001b[0mr\u001b[0mi\u001b[0mv\u001b[0me\u001b[0m \u001b[0m \u001b[0m \u001b[0mt\u001b[0ma\u001b[0mk\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0m\u001b[41mi\u001b[0m\u001b[41md\u001b[0m\u001b[41mi\u001b[0m\u001b[41mo\u001b[0m\u001b[41mt\u001b[0m\u001b[41ms\u001b[0m \u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0m \u001b[0ma\u001b[0m \u001b[0mr\u001b[0mi\u001b[0md\u001b[0me\u001b[0m.\u001b[0m.\u001b[0m.\u001b[0m\n",
+ "2\u001b[1m\u001b[33m Gold: \u001b[0mI\u001b[0mf\u001b[0m \u001b[0mp\u001b[0me\u001b[0mo\u001b[0mp\u001b[0ml\u001b[0me\u001b[0m \u001b[0m \u001b[0mw\u001b[0me\u001b[0mr\u001b[0me\u001b[0m \u001b[0m \u001b[0ms\u001b[0mm\u001b[0ma\u001b[0mr\u001b[0mt\u001b[0m,\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0my\u001b[0m \u001b[0mw\u001b[0mo\u001b[0mu\u001b[0ml\u001b[0md\u001b[0m \u001b[0m \u001b[0mB\u001b[0mo\u001b[0my\u001b[0mc\u001b[0mo\u001b[0mt\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mh\u001b[0mi\u001b[0ms\u001b[0m \u001b[0m \u001b[0mi\u001b[0mn\u001b[0me\u001b[0mp\u001b[0mt\u001b[0m \u001b[0m \u001b[0ma\u001b[0mi\u001b[0mr\u001b[0ml\u001b[0mi\u001b[0mn\u001b[0me\u001b[0m,\u001b[0m \u001b[0m \u001b[0mb\u001b[0mu\u001b[0mt\u001b[0m \u001b[0m \u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0my\u001b[0m \u001b[0m \u001b[0ma\u001b[0mr\u001b[0me\u001b[0m \u001b[0m \u001b[0mn\u001b[0mo\u001b[0mt\u001b[0m \u001b[0m \u001b[0ms\u001b[0mm\u001b[0ma\u001b[0mr\u001b[0mt\u001b[0m,\u001b[0m \u001b[0m \u001b[0ms\u001b[0mo\u001b[0m \u001b[0m \u001b[0m \u001b[0mr\u001b[0mo\u001b[0mg\u001b[0mu\u001b[0me\u001b[0m \u001b[0m \u001b[0mb\u001b[0mu\u001b[0ms\u001b[0mi\u001b[0mn\u001b[0me\u001b[0ms\u001b[0ms\u001b[0me\u001b[0ms\u001b[0m \u001b[0m \u001b[0ml\u001b[0mi\u001b[0mk\u001b[0me\u001b[0m \u001b[0m \u001b[0mt\u001b[0mh\u001b[0mi\u001b[0ms\u001b[0m \u001b[0m \u001b[0mo\u001b[0mn\u001b[0me\u001b[0m,\u001b[0m \u001b[0m \u001b[0m \u001b[0ms\u001b[0mt\u001b[0mi\u001b[0ml\u001b[0ml\u001b[0m \u001b[0mt\u001b[0mh\u001b[0mr\u001b[0mi\u001b[0mv\u001b[0me\u001b[0m \u001b[0m \u001b[0m \u001b[0mt\u001b[0ma\u001b[0mk\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0m\u001b[41mi\u001b[0m\u001b[41md\u001b[0m\u001b[41mi\u001b[0m\u001b[41mo\u001b[0m\u001b[41mt\u001b[0m\u001b[41ms\u001b[0m \u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0m \u001b[0ma\u001b[0m \u001b[0mr\u001b[0mi\u001b[0md\u001b[0me\u001b[0m.\u001b[0m.\u001b[0m.\u001b[0m\n",
+ "\n",
+ "\n",
+ "3\u001b[1m\u001b[36m Pred: \u001b[0mT\u001b[0mr\u001b[0mu\u001b[0mm\u001b[0mp\u001b[0m \u001b[0mC\u001b[0ml\u001b[0ma\u001b[0mi\u001b[0mm\u001b[0me\u001b[0md\u001b[0m \u001b[0mt\u001b[0mh\u001b[0ma\u001b[0mt\u001b[0m \u001b[0mR\u001b[0mu\u001b[0ms\u001b[0ms\u001b[0mi\u001b[0ma\u001b[0m \u001b[0mw\u001b[0mi\u001b[0ml\u001b[0ml\u001b[0m \u001b[0mn\u001b[0me\u001b[0mv\u001b[0me\u001b[0mr\u001b[0m \u001b[0mi\u001b[0mn\u001b[0mv\u001b[0ma\u001b[0md\u001b[0me\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mU\u001b[0mk\u001b[0mr\u001b[0ma\u001b[0mi\u001b[0mn\u001b[0me\u001b[0m,\u001b[0m \u001b[0mw\u001b[0mh\u001b[0me\u001b[0mn\u001b[0m \u001b[0mR\u001b[0mu\u001b[0ms\u001b[0ms\u001b[0mi\u001b[0ma\u001b[0m \u001b[0ma\u001b[0ml\u001b[0mr\u001b[0me\u001b[0ma\u001b[0md\u001b[0my\u001b[0m \u001b[0mh\u001b[0ma\u001b[0ms\u001b[0m \u001b[0m-\u001b[0m \u001b[0mh\u001b[0mo\u001b[0mw\u001b[0m \u001b[0m\u001b[41ms\u001b[0m\u001b[41mt\u001b[0m\u001b[41mu\u001b[0m\u001b[41mp\u001b[0m\u001b[41mi\u001b[0m\u001b[41md\u001b[0m \u001b[0mc\u001b[0ma\u001b[0mn\u001b[0m \u001b[0mp\u001b[0me\u001b[0mo\u001b[0mp\u001b[0ml\u001b[0me\u001b[0m \u001b[0mb\u001b[0me\u001b[0m?\u001b[0m\n",
+ "3\u001b[1m\u001b[33m Gold: \u001b[0mT\u001b[0mr\u001b[0mu\u001b[0mm\u001b[0mp\u001b[0m \u001b[0mC\u001b[0ml\u001b[0ma\u001b[0mi\u001b[0mm\u001b[0me\u001b[0md\u001b[0m \u001b[0mt\u001b[0mh\u001b[0ma\u001b[0mt\u001b[0m \u001b[0mR\u001b[0mu\u001b[0ms\u001b[0ms\u001b[0mi\u001b[0ma\u001b[0m \u001b[0mw\u001b[0mi\u001b[0ml\u001b[0ml\u001b[0m \u001b[0mn\u001b[0me\u001b[0mv\u001b[0me\u001b[0mr\u001b[0m \u001b[0mi\u001b[0mn\u001b[0mv\u001b[0ma\u001b[0md\u001b[0me\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mU\u001b[0mk\u001b[0mr\u001b[0ma\u001b[0mi\u001b[0mn\u001b[0me\u001b[0m,\u001b[0m \u001b[0mw\u001b[0mh\u001b[0me\u001b[0mn\u001b[0m \u001b[0mR\u001b[0mu\u001b[0ms\u001b[0ms\u001b[0mi\u001b[0ma\u001b[0m \u001b[0ma\u001b[0ml\u001b[0mr\u001b[0me\u001b[0ma\u001b[0md\u001b[0my\u001b[0m \u001b[0mh\u001b[0ma\u001b[0ms\u001b[0m \u001b[0m-\u001b[0m \u001b[0mh\u001b[0mo\u001b[0mw\u001b[0m \u001b[0m\u001b[41ms\u001b[0m\u001b[41mt\u001b[0m\u001b[41mu\u001b[0m\u001b[41mp\u001b[0m\u001b[41mi\u001b[0m\u001b[41md\u001b[0m \u001b[0mc\u001b[0ma\u001b[0mn\u001b[0m \u001b[0mp\u001b[0me\u001b[0mo\u001b[0mp\u001b[0ml\u001b[0me\u001b[0m \u001b[0mb\u001b[0me\u001b[0m?\u001b[0m\n",
+ "\n",
+ "\n",
+ "4\u001b[1m\u001b[36m Pred: \u001b[0mA\u001b[0ms\u001b[0m \u001b[0ml\u001b[0mo\u001b[0mn\u001b[0mg\u001b[0m \u001b[0ma\u001b[0ms\u001b[0m \u001b[0my\u001b[0mo\u001b[0mu\u001b[0mr\u001b[0m \u001b[0mw\u001b[0mi\u001b[0ml\u001b[0ml\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mp\u001b[0ma\u001b[0my\u001b[0m \u001b[0ma\u001b[0m \u001b[0ml\u001b[0mo\u001b[0mt\u001b[0m \u001b[0mm\u001b[0mo\u001b[0mr\u001b[0me\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0mp\u001b[0mr\u001b[0mo\u001b[0md\u001b[0mu\u001b[0mc\u001b[0mt\u001b[0ms\u001b[0m \u001b[0my\u001b[0mo\u001b[0mu\u001b[0m \u001b[0mb\u001b[0mu\u001b[0my\u001b[0m,\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0mn\u001b[0m \u001b[0mf\u001b[0mi\u001b[0mn\u001b[0me\u001b[0m.\u001b[0m\n",
+ "\u001b[0mB\u001b[0mu\u001b[0mt\u001b[0m \u001b[0my\u001b[0mo\u001b[0mu\u001b[0m \u001b[0mb\u001b[0me\u001b[0mt\u001b[0mt\u001b[0me\u001b[0mr\u001b[0m \u001b[0mn\u001b[0mo\u001b[0mt\u001b[0m \u001b[0mb\u001b[0me\u001b[0m \u001b[0mg\u001b[0mo\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mC\u001b[0mo\u001b[0ms\u001b[0mt\u001b[0mc\u001b[0mo\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0mW\u001b[0ma\u001b[0ml\u001b[0mm\u001b[0ma\u001b[0mr\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mb\u001b[0mu\u001b[0my\u001b[0m \u001b[0ms\u001b[0mt\u001b[0mu\u001b[0mf\u001b[0mf\u001b[0m \u001b[0mb\u001b[0me\u001b[0mc\u001b[0ma\u001b[0mu\u001b[0ms\u001b[0me\u001b[0m \u001b[0mi\u001b[0mt\u001b[0m'\u001b[0ms\u001b[0m \u001b[0mc\u001b[0mh\u001b[0me\u001b[0ma\u001b[0mp\u001b[0me\u001b[0mr\u001b[0m.\u001b[0m\n",
+ "\u001b[0mI\u001b[0mf\u001b[0m \u001b[0ms\u001b[0mo\u001b[0m,\u001b[0m \u001b[0mw\u001b[0me\u001b[0m \u001b[0mg\u001b[0me\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mc\u001b[0ma\u001b[0ml\u001b[0ml\u001b[0m \u001b[0my\u001b[0mo\u001b[0mu\u001b[0m \u001b[0ma\u001b[0m \u001b[0m\u001b[41mh\u001b[0m\u001b[41my\u001b[0m\u001b[41mp\u001b[0m\u001b[41mo\u001b[0m\u001b[41mc\u001b[0m\u001b[41mr\u001b[0m\u001b[41mi\u001b[0m\u001b[41mt\u001b[0m\u001b[41mi\u001b[0m\u001b[41mc\u001b[0m\u001b[41ma\u001b[0m\u001b[41ml\u001b[0m \u001b[0mw\u001b[0ma\u001b[0mn\u001b[0mk\u001b[0me\u001b[0mr\u001b[0m.\u001b[0m\n",
+ "4\u001b[1m\u001b[33m Gold: \u001b[0mA\u001b[0ms\u001b[0m \u001b[0ml\u001b[0mo\u001b[0mn\u001b[0mg\u001b[0m \u001b[0ma\u001b[0ms\u001b[0m \u001b[0my\u001b[0mo\u001b[0mu\u001b[0mr\u001b[0m \u001b[0mw\u001b[0mi\u001b[0ml\u001b[0ml\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mp\u001b[0ma\u001b[0my\u001b[0m \u001b[0ma\u001b[0m \u001b[0ml\u001b[0mo\u001b[0mt\u001b[0m \u001b[0mm\u001b[0mo\u001b[0mr\u001b[0me\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0mp\u001b[0mr\u001b[0mo\u001b[0md\u001b[0mu\u001b[0mc\u001b[0mt\u001b[0ms\u001b[0m \u001b[0my\u001b[0mo\u001b[0mu\u001b[0m \u001b[0mb\u001b[0mu\u001b[0my\u001b[0m,\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0mn\u001b[0m \u001b[0mf\u001b[0mi\u001b[0mn\u001b[0me\u001b[0m.\u001b[0m\n",
+ "\u001b[0mB\u001b[0mu\u001b[0mt\u001b[0m \u001b[0my\u001b[0mo\u001b[0mu\u001b[0m \u001b[0mb\u001b[0me\u001b[0mt\u001b[0mt\u001b[0me\u001b[0mr\u001b[0m \u001b[0mn\u001b[0mo\u001b[0mt\u001b[0m \u001b[0mb\u001b[0me\u001b[0m \u001b[0mg\u001b[0mo\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mC\u001b[0mo\u001b[0ms\u001b[0mt\u001b[0mc\u001b[0mo\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0mW\u001b[0ma\u001b[0ml\u001b[0mm\u001b[0ma\u001b[0mr\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mb\u001b[0mu\u001b[0my\u001b[0m \u001b[0ms\u001b[0mt\u001b[0mu\u001b[0mf\u001b[0mf\u001b[0m \u001b[0mb\u001b[0me\u001b[0mc\u001b[0ma\u001b[0mu\u001b[0ms\u001b[0me\u001b[0m \u001b[0mi\u001b[0mt\u001b[0m'\u001b[0ms\u001b[0m \u001b[0mc\u001b[0mh\u001b[0me\u001b[0ma\u001b[0mp\u001b[0me\u001b[0mr\u001b[0m.\u001b[0m\n",
+ "\u001b[0mI\u001b[0mf\u001b[0m \u001b[0ms\u001b[0mo\u001b[0m,\u001b[0m \u001b[0mw\u001b[0me\u001b[0m \u001b[0mg\u001b[0me\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mc\u001b[0ma\u001b[0ml\u001b[0ml\u001b[0m \u001b[0my\u001b[0mo\u001b[0mu\u001b[0m \u001b[0ma\u001b[0m \u001b[0mh\u001b[0my\u001b[0mp\u001b[0mo\u001b[0mc\u001b[0mr\u001b[0mi\u001b[0mt\u001b[0mi\u001b[0mc\u001b[0ma\u001b[0ml\u001b[0m \u001b[0mw\u001b[0ma\u001b[0mn\u001b[0mk\u001b[0me\u001b[0mr\u001b[0m.\u001b[0m\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "indices_test = []\n",
+ "for i, (gold_index, text) in enumerate(zip(test['spans'], test['text'])):\n",
+ "    tagged_sentence = tagger_LSTM(text)\n",
+ "    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)\n",
+ "    indices_test.append(prediction_index)\n",
+ "\n",
+ "    if i < 5:\n",
+ "        print(str(i) + colored(' Pred: ', color='cyan', attrs=['bold']) +\n",
+ "              color_toxic_words(prediction_index, text))\n",
+ "        print(str(i) + colored(' Gold: ', color='yellow', attrs=['bold']) +\n",
+ "              color_toxic_words(gold_index, text) + '\\n'*2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "F1 in test: 0.648722\n"
+ ]
+ }
+ ],
+ "source": [
+ "score_test = [f1(pred, gold) for pred, gold in zip(indices_test, test['spans'])]\n",
+ "print('F1 in test: {:.6f}'.format(np.mean(score_test)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prediction on the *evaluation* dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "evaluation = pd.read_csv('../data/tsd_test.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\u001b[1m\u001b[36m Pred: \u001b[0mT\u001b[0mh\u001b[0ma\u001b[0mt\u001b[0m'\u001b[0ms\u001b[0m \u001b[0mr\u001b[0mi\u001b[0mg\u001b[0mh\u001b[0mt\u001b[0m.\u001b[0m \u001b[0mT\u001b[0mh\u001b[0me\u001b[0my\u001b[0m \u001b[0ma\u001b[0mr\u001b[0me\u001b[0m \u001b[0mn\u001b[0mo\u001b[0mt\u001b[0m \u001b[0mn\u001b[0mo\u001b[0mr\u001b[0mm\u001b[0ma\u001b[0ml\u001b[0m.\u001b[0m \u001b[0mA\u001b[0mn\u001b[0md\u001b[0m \u001b[0mI\u001b[0m \u001b[0ma\u001b[0mm\u001b[0m \u001b[0ms\u001b[0mt\u001b[0ma\u001b[0mr\u001b[0mt\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mf\u001b[0mr\u001b[0mo\u001b[0mm\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mp\u001b[0mr\u001b[0me\u001b[0mm\u001b[0mi\u001b[0ms\u001b[0me\u001b[0m \u001b[0mt\u001b[0mh\u001b[0ma\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0my\u001b[0m \u001b[0ma\u001b[0mr\u001b[0me\u001b[0m \u001b[0mA\u001b[0mB\u001b[0mN\u001b[0mO\u001b[0mR\u001b[0mM\u001b[0mA\u001b[0mL\u001b[0m.\u001b[0m \u001b[0mP\u001b[0mr\u001b[0mo\u001b[0mc\u001b[0me\u001b[0me\u001b[0md\u001b[0m \u001b[0mw\u001b[0mt\u001b[0mh\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mt\u001b[0my\u001b[0mp\u001b[0mi\u001b[0mc\u001b[0ma\u001b[0ml\u001b[0m \u001b[0m\u001b[41mr\u001b[0m\u001b[41ma\u001b[0m\u001b[41mc\u001b[0m\u001b[41mi\u001b[0m\u001b[41ms\u001b[0m\u001b[41mt\u001b[0m\u001b[41m,\u001b[0m\u001b[41m \u001b[0m\u001b[41mb\u001b[0m\u001b[41mi\u001b[0m\u001b[41mg\u001b[0m\u001b[41mo\u001b[0m\u001b[41mt\u001b[0m\u001b[41m,\u001b[0m\u001b[41m \u001b[0m\u001b[41ms\u001b[0m\u001b[41me\u001b[0m\u001b[41mx\u001b[0m\u001b[41mi\u001b[0m\u001b[41ms\u001b[0m\u001b[41mt\u001b[0m\u001b[41m \u001b[0m\u001b[41mr\u001b[0m\u001b[41mu\u001b[0m\u001b[41mb\u001b[0m\u001b[41mb\u001b[0m\u001b[41mi\u001b[0m\u001b[41ms\u001b[0m\u001b[41mh\u001b[0m.\u001b[0m\n",
+ "\u001b[0mT\u001b[0mh\u001b[0ma\u001b[0mn\u001b[0mk\u001b[0ms\u001b[0m!\u001b[0m\n",
+ "\n",
+ "1\u001b[1m\u001b[36m Pred: \u001b[0m\"\u001b[0mW\u001b[0ma\u001b[0mt\u001b[0mc\u001b[0mh\u001b[0m \u001b[0mp\u001b[0me\u001b[0mo\u001b[0mp\u001b[0ml\u001b[0me\u001b[0m \u001b[0md\u001b[0mi\u001b[0me\u001b[0m \u001b[0mf\u001b[0mr\u001b[0mo\u001b[0mm\u001b[0m \u001b[0mt\u001b[0ma\u001b[0mk\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0ma\u001b[0mw\u001b[0ma\u001b[0my\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0mi\u001b[0mr\u001b[0m \u001b[0mh\u001b[0me\u001b[0ma\u001b[0ml\u001b[0mt\u001b[0mh\u001b[0mc\u001b[0ma\u001b[0mr\u001b[0me\u001b[0m\"\u001b[0m\n",
+ "\u001b[0m\n",
+ "\u001b[0mD\u001b[0mI\u001b[0mN\u001b[0mG\u001b[0m \u001b[0mD\u001b[0mI\u001b[0mN\u001b[0mG\u001b[0m \u001b[0mD\u001b[0mI\u001b[0mN\u001b[0mG\u001b[0m!\u001b[0m \u001b[0m \u001b[0mW\u001b[0mi\u001b[0mn\u001b[0mn\u001b[0me\u001b[0mr\u001b[0m \u001b[0mo\u001b[0mf\u001b[0m \u001b[0m\u001b[41ms\u001b[0m\u001b[41mt\u001b[0m\u001b[41mu\u001b[0m\u001b[41mp\u001b[0m\u001b[41mi\u001b[0m\u001b[41md\u001b[0m \u001b[0mp\u001b[0mo\u001b[0ms\u001b[0mt\u001b[0m \u001b[0mo\u001b[0mf\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0md\u001b[0ma\u001b[0my\u001b[0m \u001b[0ma\u001b[0mw\u001b[0ma\u001b[0mr\u001b[0md\u001b[0m!\u001b[0m\n",
+ "\n",
+ "2\u001b[1m\u001b[36m Pred: \u001b[0mt\u001b[0me\u001b[0mn\u001b[0ms\u001b[0m \u001b[0my\u001b[0me\u001b[0ma\u001b[0mr\u001b[0ms\u001b[0m \u001b[0ma\u001b[0mg\u001b[0mo\u001b[0m \u001b[0mi\u001b[0m \u001b[0mc\u001b[0mo\u001b[0mn\u001b[0mt\u001b[0ma\u001b[0mc\u001b[0mt\u001b[0me\u001b[0md\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mP\u001b[0mD\u001b[0mR\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0ms\u001b[0mu\u001b[0mg\u001b[0mg\u001b[0me\u001b[0ms\u001b[0mt\u001b[0me\u001b[0md\u001b[0m \u001b[0mt\u001b[0mh\u001b[0ma\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mt\u001b[0mi\u001b[0mm\u001b[0me\u001b[0m \u001b[0mm\u001b[0mi\u001b[0mg\u001b[0mh\u001b[0mt\u001b[0m \u001b[0mb\u001b[0me\u001b[0m \u001b[0mg\u001b[0mo\u001b[0mo\u001b[0md\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mw\u001b[0mo\u001b[0mr\u001b[0mk\u001b[0m \u001b[0mw\u001b[0mi\u001b[0mt\u001b[0mh\u001b[0m \u001b[0ma\u001b[0ml\u001b[0ma\u001b[0ms\u001b[0mk\u001b[0ma\u001b[0m \u001b[0mo\u001b[0mn\u001b[0m \u001b[0mb\u001b[0mu\u001b[0mi\u001b[0ml\u001b[0md\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0ma\u001b[0m \u001b[0mg\u001b[0ma\u001b[0ms\u001b[0m \u001b[0ml\u001b[0mi\u001b[0mn\u001b[0me\u001b[0m.\u001b[0m.\u001b[0m \u001b[0ma\u001b[0ml\u001b[0ma\u001b[0ms\u001b[0mk\u001b[0ma\u001b[0m \u001b[0mr\u001b[0me\u001b[0mj\u001b[0me\u001b[0mc\u001b[0mt\u001b[0me\u001b[0md\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0mm\u001b[0m \u001b[0mw\u001b[0mi\u001b[0mt\u001b[0mh\u001b[0mo\u001b[0mu\u001b[0mt\u001b[0m \u001b[0me\u001b[0mv\u001b[0me\u001b[0mn\u001b[0m \u001b[0mc\u001b[0mo\u001b[0mn\u001b[0ms\u001b[0mi\u001b[0md\u001b[0me\u001b[0mr\u001b[0ma\u001b[0mt\u001b[0mi\u001b[0mo\u001b[0mn\u001b[0m \u001b[0md\u001b[0me\u001b[0ms\u001b[0mp\u001b[0mi\u001b[0mt\u001b[0me\u001b[0m \u001b[0mc\u001b[0mh\u001b[0mi\u001b[0mn\u001b[0ma\u001b[0m \u001b[0mb\u001b[0me\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mf\u001b[0ml\u001b[0mu\u001b[0ms\u001b[0mh\u001b[0m \u001b[0mw\u001b[0mi\u001b[0mt\u001b[0mh\u001b[0m \u001b[0mc\u001b[0ma\u001b[0ms\u001b[0mh\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0mh\u001b[0mu\u001b[0mn\u001b[0mg\u001b[0mr\u001b[0my\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0mg\u001b[0ma\u001b[0ms\u001b[0m.\u001b[0m.\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0ms\u001b[0me\u001b[0mt\u001b[0m \u001b[0mu\u001b[0mp\u001b[0m \u001b[0ma\u001b[0mn\u001b[0mo\u001b[0mt\u001b[0mh\u001b[0me\u001b[0mr\u001b[0m \u001b[0mi\u001b[0mn\u001b[0mf\u001b[0ma\u001b[0mm\u001b[0mo\u001b[0mu\u001b[0ms\u001b[0m \u001b[0mb\u001b[0mo\u001b[0mo\u001b[0mn\u001b[0md\u001b[0mo\u001b[0mg\u001b[0mg\u001b[0ml\u001b[0me\u001b[0m.\u001b[0m.\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mt\u001b[0mr\u001b[0ma\u001b[0mn\u001b[0ms\u001b[0mc\u001b[0ma\u001b[0mn\u001b[0ma\u001b[0md\u001b[0ma\u001b[0m-\u001b[0me\u001b[0mx\u001b[0mx\u001b[0mo\u001b[0mn\u001b[0m \u001b[0mr\u001b[0mi\u001b[0mp\u001b[0m \u001b[0mo\u001b[0mf\u001b[0mf\u001b[0m \u001b[0mt\u001b[0mh\u001b[0ma\u001b[0mt\u001b[0m \u001b[0mw\u001b[0me\u001b[0m \u001b[0ma\u001b[0mr\u001b[0me\u001b[0m \u001b[0ms\u001b[0mt\u001b[0mi\u001b[0ml\u001b[0ml\u001b[0m \u001b[0mp\u001b[0ma\u001b[0my\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0mh\u001b[0ma\u001b[0mv\u001b[0me\u001b[0m \u001b[0my\u001b[0me\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mr\u001b[0me\u001b[0mc\u001b[0me\u001b[0mi\u001b[0mv\u001b[0me\u001b[0m 
\u001b[0ma\u001b[0mn\u001b[0my\u001b[0mt\u001b[0mh\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mo\u001b[0mf\u001b[0m \u001b[0mv\u001b[0ma\u001b[0ml\u001b[0mu\u001b[0me\u001b[0m.\u001b[0m.\u001b[0m \u001b[0mh\u001b[0mu\u001b[0mn\u001b[0md\u001b[0mr\u001b[0me\u001b[0md\u001b[0ms\u001b[0m \u001b[0mo\u001b[0mf\u001b[0m \u001b[0mm\u001b[0mi\u001b[0ml\u001b[0ml\u001b[0mi\u001b[0mo\u001b[0mn\u001b[0ms\u001b[0m \u001b[0mo\u001b[0mf\u001b[0m \u001b[0md\u001b[0mo\u001b[0ml\u001b[0ml\u001b[0ma\u001b[0mr\u001b[0ms\u001b[0m \u001b[0mo\u001b[0mn\u001b[0m \u001b[0ms\u001b[0mt\u001b[0mu\u001b[0md\u001b[0mi\u001b[0me\u001b[0ms\u001b[0m.\u001b[0m.\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0mb\u001b[0mu\u001b[0my\u001b[0mo\u001b[0mu\u001b[0mt\u001b[0ms\u001b[0m.\u001b[0m.\u001b[0m \u001b[0mi\u001b[0m \u001b[0mh\u001b[0mo\u001b[0mp\u001b[0me\u001b[0m \u001b[0mc\u001b[0mh\u001b[0mi\u001b[0mn\u001b[0ma\u001b[0m \u001b[0ms\u001b[0ma\u001b[0my\u001b[0ms\u001b[0m \u001b[0mf\u001b[0m \u001b[0my\u001b[0mo\u001b[0mu\u001b[0m \u001b[0ma\u001b[0ml\u001b[0ma\u001b[0ms\u001b[0mk\u001b[0ma\u001b[0m.\u001b[0m.\u001b[0m \u001b[0my\u001b[0mo\u001b[0mu\u001b[0m \u001b[0ma\u001b[0mr\u001b[0me\u001b[0m \u001b[0mn\u001b[0mo\u001b[0mt\u001b[0mh\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mb\u001b[0mu\u001b[0mt\u001b[0m \u001b[0m\u001b[41mi\u001b[0m\u001b[41mg\u001b[0m\u001b[41mn\u001b[0m\u001b[41mo\u001b[0m\u001b[41mr\u001b[0m\u001b[41ma\u001b[0m\u001b[41mn\u001b[0m\u001b[41mt\u001b[0m \u001b[0mp\u001b[0me\u001b[0mo\u001b[0mp\u001b[0ml\u001b[0me\u001b[0m.\u001b[0m.\u001b[0m\n",
+ "\n",
+ "3\u001b[1m\u001b[36m Pred: \u001b[0mT\u001b[0mh\u001b[0me\u001b[0m \u001b[0mp\u001b[0ma\u001b[0mr\u001b[0ma\u001b[0ml\u001b[0ml\u001b[0me\u001b[0ml\u001b[0ms\u001b[0m \u001b[0mb\u001b[0me\u001b[0mt\u001b[0mw\u001b[0me\u001b[0me\u001b[0mn\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mA\u001b[0mN\u001b[0mC\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mS\u001b[0mi\u001b[0mc\u001b[0mi\u001b[0ml\u001b[0mi\u001b[0ma\u001b[0mn\u001b[0m \u001b[0mM\u001b[0ma\u001b[0mf\u001b[0mi\u001b[0ma\u001b[0m \u001b[0ma\u001b[0mr\u001b[0me\u001b[0m \u001b[0mg\u001b[0ml\u001b[0ma\u001b[0mr\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m.\u001b[0m \u001b[0mT\u001b[0mh\u001b[0me\u001b[0m \u001b[0mA\u001b[0mN\u001b[0mC\u001b[0m \u001b[0mh\u001b[0ma\u001b[0ms\u001b[0m \u001b[0ma\u001b[0ml\u001b[0mw\u001b[0ma\u001b[0my\u001b[0ms\u001b[0m \u001b[0mb\u001b[0me\u001b[0me\u001b[0mn\u001b[0m \u001b[0mr\u001b[0mu\u001b[0mn\u001b[0m \u001b[0mb\u001b[0my\u001b[0m \u001b[0ma\u001b[0m \u001b[0mf\u001b[0me\u001b[0mw\u001b[0m \u001b[0m\"\u001b[0mf\u001b[0ma\u001b[0mm\u001b[0mi\u001b[0ml\u001b[0mi\u001b[0me\u001b[0ms\u001b[0m\"\u001b[0m \u001b[0mw\u001b[0mh\u001b[0mo\u001b[0m \u001b[0mt\u001b[0mr\u001b[0me\u001b[0ma\u001b[0mt\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0ms\u001b[0mt\u001b[0ma\u001b[0mt\u001b[0me\u001b[0m \u001b[0ma\u001b[0ms\u001b[0m \u001b[0m'\u001b[0mt\u001b[0mu\u001b[0mr\u001b[0mf\u001b[0m'\u001b[0m \u001b[0m;\u001b[0m \u001b[0ma\u001b[0ms\u001b[0m \u001b[0mj\u001b[0mu\u001b[0ms\u001b[0mt\u001b[0m \u001b[0mo\u001b[0mn\u001b[0me\u001b[0m \u001b[0mb\u001b[0mi\u001b[0mg\u001b[0m \u001b[0mp\u001b[0mi\u001b[0mg\u001b[0mg\u001b[0my\u001b[0m \u001b[0mb\u001b[0ma\u001b[0mn\u001b[0mk\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0mi\u001b[0mr\u001b[0m \u001b[0ms\u001b[0me\u001b[0ml\u001b[0mf\u001b[0m-\u001b[0me\u001b[0mn\u001b[0mr\u001b[0mi\u001b[0mc\u001b[0mh\u001b[0mm\u001b[0me\u001b[0mn\u001b[0mt\u001b[0m.\u001b[0m \u001b[0mT\u001b[0mh\u001b[0me\u001b[0m \u001b[0mg\u001b[0mo\u001b[0mv\u001b[0me\u001b[0mr\u001b[0mn\u001b[0mm\u001b[0me\u001b[0mn\u001b[0mt\u001b[0m \u001b[0mb\u001b[0ma\u001b[0ms\u001b[0mi\u001b[0mc\u001b[0ma\u001b[0ml\u001b[0ml\u001b[0my\u001b[0m \u001b[0mb\u001b[0me\u001b[0ml\u001b[0mi\u001b[0me\u001b[0mv\u001b[0me\u001b[0ms\u001b[0m \u001b[0mw\u001b[0me\u001b[0m \u001b[0ma\u001b[0ml\u001b[0ml\u001b[0m \u001b[0mj\u001b[0mu\u001b[0ms\u001b[0mt\u001b[0m \u001b[0mw\u001b[0mo\u001b[0mr\u001b[0mk\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0mm\u001b[0m.\u001b[0m \u001b[0mT\u001b[0mh\u001b[0me\u001b[0my\u001b[0m \u001b[0ma\u001b[0mr\u001b[0me\u001b[0mn\u001b[0m'\u001b[0mt\u001b[0m \u001b[0ma\u001b[0m \u001b[0md\u001b[0me\u001b[0mm\u001b[0mo\u001b[0mc\u001b[0mr\u001b[0ma\u001b[0mt\u001b[0mi\u001b[0mc\u001b[0m \u001b[0mg\u001b[0mo\u001b[0mv\u001b[0me\u001b[0mr\u001b[0mn\u001b[0mm\u001b[0me\u001b[0mn\u001b[0mt\u001b[0m \u001b[0ma\u001b[0mt\u001b[0m \u001b[0ma\u001b[0ml\u001b[0ml\u001b[0m,\u001b[0m \u001b[0mb\u001b[0mu\u001b[0mt\u001b[0m \u001b[0mu\u001b[0ms\u001b[0me\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0ma\u001b[0mp\u001b[0mp\u001b[0me\u001b[0ma\u001b[0mr\u001b[0ma\u001b[0mn\u001b[0mc\u001b[0me\u001b[0ms\u001b[0m \u001b[0mo\u001b[0mf\u001b[0m \u001b[0md\u001b[0me\u001b[0mm\u001b[0mo\u001b[0mc\u001b[0mr\u001b[0ma\u001b[0mc\u001b[0my\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mg\u001b[0mi\u001b[0mv\u001b[0me\u001b[0m 
\u001b[0mt\u001b[0mh\u001b[0me\u001b[0mi\u001b[0mr\u001b[0m \u001b[0me\u001b[0mn\u001b[0mt\u001b[0mi\u001b[0mt\u001b[0ml\u001b[0me\u001b[0mm\u001b[0me\u001b[0mn\u001b[0mt\u001b[0m \u001b[0mp\u001b[0mr\u001b[0ma\u001b[0mc\u001b[0mt\u001b[0mi\u001b[0mc\u001b[0me\u001b[0ms\u001b[0m \u001b[0ma\u001b[0m \u001b[0mm\u001b[0ma\u001b[0ms\u001b[0mk\u001b[0m \u001b[0mo\u001b[0mf\u001b[0m \u001b[0ml\u001b[0me\u001b[0mg\u001b[0mi\u001b[0mt\u001b[0mi\u001b[0mm\u001b[0ma\u001b[0mc\u001b[0my\u001b[0m.\u001b[0m \u001b[0mT\u001b[0mh\u001b[0me\u001b[0m \u001b[0mp\u001b[0mo\u001b[0mo\u001b[0mr\u001b[0m \u001b[0ma\u001b[0mn\u001b[0md\u001b[0m \u001b[0m\u001b[41mi\u001b[0m\u001b[41mg\u001b[0m\u001b[41mn\u001b[0m\u001b[41mo\u001b[0m\u001b[41mr\u001b[0m\u001b[41ma\u001b[0m\u001b[41mn\u001b[0m\u001b[41mt\u001b[0m \u001b[0mh\u001b[0ma\u001b[0mv\u001b[0me\u001b[0m \u001b[0mb\u001b[0me\u001b[0me\u001b[0mn\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mo\u001b[0ml\u001b[0me\u001b[0md\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0ms\u001b[0mo\u001b[0m \u001b[0ml\u001b[0mo\u001b[0mn\u001b[0mg\u001b[0m,\u001b[0m \u001b[0mb\u001b[0mu\u001b[0mt\u001b[0m \u001b[0mp\u001b[0me\u001b[0mo\u001b[0mp\u001b[0ml\u001b[0me\u001b[0m \u001b[0ma\u001b[0mr\u001b[0me\u001b[0m \u001b[0ms\u001b[0ml\u001b[0mo\u001b[0mw\u001b[0ml\u001b[0my\u001b[0m \u001b[0mc\u001b[0mo\u001b[0mm\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mA\u001b[0mN\u001b[0mC\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0mw\u001b[0mh\u001b[0ma\u001b[0mt\u001b[0m \u001b[0mi\u001b[0mt\u001b[0m \u001b[0mi\u001b[0ms\u001b[0m:\u001b[0m \u001b[0ma\u001b[0m \u001b[0ms\u001b[0me\u001b[0ml\u001b[0mf\u001b[0m-\u001b[0ms\u001b[0me\u001b[0mr\u001b[0mv\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mM\u001b[0ma\u001b[0mf\u001b[0mi\u001b[0ma\u001b[0m!\u001b[0m\n",
+ "\n",
+ "4\u001b[1m\u001b[36m Pred: \u001b[0mI\u001b[0mn\u001b[0mt\u001b[0me\u001b[0ml\u001b[0m \u001b[0mC\u001b[0mo\u001b[0mm\u001b[0mm\u001b[0mu\u001b[0mn\u001b[0mi\u001b[0mt\u001b[0my\u001b[0m:\u001b[0m \u001b[0m‘\u001b[0mH\u001b[0mo\u001b[0mw\u001b[0m \u001b[0mc\u001b[0ma\u001b[0mn\u001b[0m \u001b[0mw\u001b[0me\u001b[0m \u001b[0mw\u001b[0mo\u001b[0mr\u001b[0mk\u001b[0m \u001b[0mf\u001b[0mo\u001b[0mr\u001b[0m \u001b[0ma\u001b[0m \u001b[0mP\u001b[0mr\u001b[0me\u001b[0ms\u001b[0mi\u001b[0md\u001b[0me\u001b[0mn\u001b[0mt\u001b[0m \u001b[0mw\u001b[0mh\u001b[0mo\u001b[0m \u001b[0mU\u001b[0mn\u001b[0md\u001b[0me\u001b[0mr\u001b[0mm\u001b[0mi\u001b[0mn\u001b[0me\u001b[0ms\u001b[0m \u001b[0mo\u001b[0mu\u001b[0mr\u001b[0m \u001b[0mw\u001b[0mo\u001b[0mr\u001b[0mk\u001b[0m?\u001b[0m’\u001b[0m\n",
+ "\u001b[0m\n",
+ "\u001b[0mD\u001b[0ma\u001b[0my\u001b[0ms\u001b[0m \u001b[0mb\u001b[0me\u001b[0mf\u001b[0mo\u001b[0mr\u001b[0me\u001b[0m \u001b[0mt\u001b[0ma\u001b[0mk\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mo\u001b[0mf\u001b[0mf\u001b[0mi\u001b[0mc\u001b[0me\u001b[0m,\u001b[0m \u001b[0mh\u001b[0me\u001b[0m \u001b[0mc\u001b[0mo\u001b[0mm\u001b[0mp\u001b[0ma\u001b[0mr\u001b[0me\u001b[0md\u001b[0m \u001b[0mA\u001b[0mm\u001b[0me\u001b[0mr\u001b[0mi\u001b[0mc\u001b[0ma\u001b[0m’\u001b[0ms\u001b[0m \u001b[0ms\u001b[0mp\u001b[0mi\u001b[0me\u001b[0ms\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mA\u001b[0md\u001b[0mo\u001b[0ml\u001b[0mf\u001b[0m \u001b[0mH\u001b[0mi\u001b[0mt\u001b[0ml\u001b[0me\u001b[0mr\u001b[0m’\u001b[0ms\u001b[0m \u001b[0m\u001b[41mG\u001b[0m\u001b[41me\u001b[0m\u001b[41ms\u001b[0m\u001b[41mt\u001b[0m\u001b[41ma\u001b[0m\u001b[41mp\u001b[0m\u001b[41mo\u001b[0m.\u001b[0m\n",
+ "\u001b[0m\n",
+ "\u001b[0mA\u001b[0mm\u001b[0me\u001b[0mr\u001b[0mi\u001b[0mc\u001b[0ma\u001b[0mn\u001b[0m \u001b[0ms\u001b[0mp\u001b[0mi\u001b[0me\u001b[0ms\u001b[0m \u001b[0md\u001b[0mo\u001b[0m \u001b[0mn\u001b[0mo\u001b[0mt\u001b[0m \u001b[0ml\u001b[0mi\u001b[0mk\u001b[0me\u001b[0m \u001b[0mh\u001b[0ma\u001b[0mv\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0mi\u001b[0mr\u001b[0m \u001b[0mw\u001b[0mo\u001b[0mr\u001b[0mk\u001b[0m \u001b[0md\u001b[0mi\u001b[0ms\u001b[0mm\u001b[0mi\u001b[0ms\u001b[0ms\u001b[0me\u001b[0md\u001b[0m \u001b[0mb\u001b[0my\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mp\u001b[0mr\u001b[0me\u001b[0ms\u001b[0mi\u001b[0md\u001b[0me\u001b[0mn\u001b[0mt\u001b[0m.\u001b[0m \u001b[0mN\u001b[0mo\u001b[0mr\u001b[0m \u001b[0md\u001b[0mo\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0my\u001b[0m \u001b[0ma\u001b[0mp\u001b[0mp\u001b[0mr\u001b[0me\u001b[0mc\u001b[0mi\u001b[0ma\u001b[0mt\u001b[0me\u001b[0m \u001b[0mc\u001b[0mo\u001b[0mm\u001b[0mp\u001b[0ma\u001b[0mr\u001b[0mi\u001b[0ms\u001b[0mo\u001b[0mn\u001b[0ms\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0m\u001b[41mN\u001b[0m\u001b[41ma\u001b[0m\u001b[41mz\u001b[0m\u001b[41mi\u001b[0m \u001b[0mG\u001b[0me\u001b[0mr\u001b[0mm\u001b[0ma\u001b[0mn\u001b[0my\u001b[0m.\u001b[0m\n",
+ "\u001b[0m\n",
+ "\u001b[0mF\u001b[0mo\u001b[0mr\u001b[0mm\u001b[0me\u001b[0mr\u001b[0m \u001b[0mC\u001b[0mI\u001b[0mA\u001b[0m \u001b[0mD\u001b[0mi\u001b[0mr\u001b[0me\u001b[0mc\u001b[0mt\u001b[0mo\u001b[0mr\u001b[0m \u001b[0mJ\u001b[0mo\u001b[0mh\u001b[0mn\u001b[0m \u001b[0mB\u001b[0mr\u001b[0me\u001b[0mn\u001b[0mn\u001b[0ma\u001b[0mn\u001b[0m \u001b[0mm\u001b[0ma\u001b[0md\u001b[0me\u001b[0m \u001b[0mi\u001b[0mt\u001b[0m \u001b[0mc\u001b[0ml\u001b[0me\u001b[0ma\u001b[0mr\u001b[0m:\u001b[0m \u001b[0m“\u001b[0mT\u001b[0mh\u001b[0me\u001b[0m \u001b[0mp\u001b[0me\u001b[0mr\u001b[0ms\u001b[0mo\u001b[0mn\u001b[0m \u001b[0mw\u001b[0mh\u001b[0mo\u001b[0m \u001b[0ms\u001b[0ma\u001b[0mi\u001b[0md\u001b[0m \u001b[0mt\u001b[0mh\u001b[0ma\u001b[0mt\u001b[0m \u001b[0ms\u001b[0mh\u001b[0mo\u001b[0mu\u001b[0ml\u001b[0md\u001b[0m \u001b[0mb\u001b[0me\u001b[0m \u001b[0ma\u001b[0ms\u001b[0mh\u001b[0ma\u001b[0mm\u001b[0me\u001b[0md\u001b[0m \u001b[0mo\u001b[0mf\u001b[0m \u001b[0mh\u001b[0mi\u001b[0mm\u001b[0ms\u001b[0me\u001b[0ml\u001b[0mf\u001b[0m,\u001b[0m”\u001b[0m \u001b[0mB\u001b[0mr\u001b[0me\u001b[0mn\u001b[0mn\u001b[0ma\u001b[0mn\u001b[0m \u001b[0ms\u001b[0ma\u001b[0mi\u001b[0md\u001b[0m.\u001b[0m\n",
+ "\u001b[0m\n",
+ "\u001b[0mC\u001b[0mo\u001b[0ma\u001b[0mt\u001b[0ms\u001b[0m’\u001b[0m \u001b[0mp\u001b[0mr\u001b[0me\u001b[0md\u001b[0me\u001b[0mc\u001b[0me\u001b[0ms\u001b[0ms\u001b[0mo\u001b[0mr\u001b[0m,\u001b[0m \u001b[0mG\u001b[0me\u001b[0mn\u001b[0m.\u001b[0m \u001b[0mJ\u001b[0ma\u001b[0mm\u001b[0me\u001b[0ms\u001b[0m \u001b[0mC\u001b[0ml\u001b[0ma\u001b[0mp\u001b[0mp\u001b[0me\u001b[0mr\u001b[0m,\u001b[0m \u001b[0mc\u001b[0ma\u001b[0ml\u001b[0ml\u001b[0me\u001b[0md\u001b[0m \u001b[0mT\u001b[0mr\u001b[0mu\u001b[0mm\u001b[0mp\u001b[0m’\u001b[0ms\u001b[0m \u001b[0mc\u001b[0mo\u001b[0mm\u001b[0mm\u001b[0me\u001b[0mn\u001b[0mt\u001b[0ms\u001b[0m \u001b[0m“\u001b[0ma\u001b[0m \u001b[0mt\u001b[0me\u001b[0mr\u001b[0mr\u001b[0mi\u001b[0mb\u001b[0ml\u001b[0me\u001b[0m,\u001b[0m \u001b[0mi\u001b[0mn\u001b[0ms\u001b[0mu\u001b[0ml\u001b[0mt\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0ma\u001b[0mf\u001b[0mf\u001b[0mr\u001b[0mo\u001b[0mn\u001b[0mt\u001b[0m \u001b[0m…\u001b[0m \u001b[0mc\u001b[0mo\u001b[0mm\u001b[0mp\u001b[0ml\u001b[0me\u001b[0mt\u001b[0me\u001b[0ml\u001b[0my\u001b[0m \u001b[0mi\u001b[0mn\u001b[0ma\u001b[0mp\u001b[0mp\u001b[0mr\u001b[0mo\u001b[0mp\u001b[0mr\u001b[0mi\u001b[0ma\u001b[0mt\u001b[0me\u001b[0m.\u001b[0m”\u001b[0m\n",
+ "\u001b[0m\n",
+ "\u001b[0mT\u001b[0mr\u001b[0mu\u001b[0mm\u001b[0mp\u001b[0m’\u001b[0ms\u001b[0m \u001b[0mo\u001b[0mp\u001b[0mi\u001b[0mn\u001b[0mi\u001b[0mo\u001b[0mn\u001b[0ms\u001b[0m \u001b[0ma\u001b[0mr\u001b[0me\u001b[0m \u001b[0mo\u001b[0mu\u001b[0mt\u001b[0m \u001b[0mo\u001b[0mf\u001b[0m \u001b[0ms\u001b[0mt\u001b[0me\u001b[0mp\u001b[0m \u001b[0mw\u001b[0mi\u001b[0mt\u001b[0mh\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0mc\u001b[0mo\u001b[0mn\u001b[0mc\u001b[0ml\u001b[0mu\u001b[0ms\u001b[0mi\u001b[0mo\u001b[0mn\u001b[0ms\u001b[0m \u001b[0mo\u001b[0mf\u001b[0m \u001b[0mt\u001b[0mh\u001b[0me\u001b[0m \u001b[0ma\u001b[0mg\u001b[0me\u001b[0mn\u001b[0mc\u001b[0mi\u001b[0me\u001b[0ms\u001b[0m \u001b[0mh\u001b[0me\u001b[0m \u001b[0mi\u001b[0ms\u001b[0m \u001b[0ms\u001b[0mu\u001b[0mp\u001b[0mp\u001b[0mo\u001b[0ms\u001b[0me\u001b[0md\u001b[0m \u001b[0mt\u001b[0mo\u001b[0m \u001b[0mb\u001b[0me\u001b[0m \u001b[0ml\u001b[0me\u001b[0ma\u001b[0md\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m.\u001b[0m\n",
+ "\u001b[0m\n",
+ "\u001b[0mN\u001b[0mo\u001b[0mt\u001b[0m \u001b[0ms\u001b[0mu\u001b[0mr\u001b[0mp\u001b[0mr\u001b[0mi\u001b[0ms\u001b[0mi\u001b[0mn\u001b[0mg\u001b[0m \u001b[0mi\u001b[0mn\u001b[0m \u001b[0ma\u001b[0m \u001b[0mt\u001b[0mr\u001b[0me\u001b[0ma\u001b[0ms\u001b[0mo\u001b[0mn\u001b[0mo\u001b[0mu\u001b[0ms\u001b[0m \u001b[0mt\u001b[0mr\u001b[0ma\u001b[0mi\u001b[0mt\u001b[0mo\u001b[0mr\u001b[0m.\u001b[0m\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "indices_evaluation = []\n",
+ "for i, text in enumerate(evaluation['text']):\n",
+ "    tagged_sentence = tagger_LSTM(text)\n",
+ "    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)\n",
+ "    indices_evaluation.append(prediction_index)\n",
+ "\n",
+ "    if i < 5:\n",
+ "        print(str(i) + colored(' Pred: ', color='cyan', attrs=['bold']) +\n",
+ "              color_toxic_words(prediction_index, text) + '\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>spans</th>\n",
+ "      <th>text</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>[118, 119, 120, 121, 122, 123, 124, 125, 126, ...</td>\n",
+ "      <td>That's right. They are not normal. And I am st...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>[81, 82, 83, 84, 85, 86]</td>\n",
+ "      <td>\"Watch people die from taking away their healt...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>[483, 484, 485, 486, 487, 488, 489, 490]</td>\n",
+ "      <td>tens years ago i contacted the PDR and suggest...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>[413, 414, 415, 416, 417, 418, 419, 420]</td>\n",
+ "      <td>The parallels between the ANC and the Sicilian...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>4</th>\n",
+ "      <td>[150, 151, 152, 153, 154, 155, 156, 271, 272, ...</td>\n",
+ "      <td>Intel Community: ‘How can we work for a Presid...</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " spans \\\n",
+ "0 [118, 119, 120, 121, 122, 123, 124, 125, 126, ... \n",
+ "1 [81, 82, 83, 84, 85, 86] \n",
+ "2 [483, 484, 485, 486, 487, 488, 489, 490] \n",
+ "3 [413, 414, 415, 416, 417, 418, 419, 420] \n",
+ "4 [150, 151, 152, 153, 154, 155, 156, 271, 272, ... \n",
+ "\n",
+ " text \n",
+ "0 That's right. They are not normal. And I am st... \n",
+ "1 \"Watch people die from taking away their healt... \n",
+ "2 tens years ago i contacted the PDR and suggest... \n",
+ "3 The parallels between the ANC and the Sicilian... \n",
+ "4 Intel Community: ‘How can we work for a Presid... \n",
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "evaluation['spans'] = indices_evaluation\n",
+ "evaluation = evaluation[['spans', 'text']]\n",
+ "evaluation.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For the evaluation, a zip containing a txt file must be submitted, built as follows (at the end, upload the resulting `spans-pred.zip` file):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "  adding: spans-pred.txt (deflated 84%)\n"
+ ]
+ }
+ ],
+ "source": [
+ "predictions = evaluation['spans'].tolist()\n",
+ "ids = evaluation.index.tolist()\n",
+ "\n",
+ "with open(\"spans-pred.txt\", \"w\") as out:\n",
+ "    for uid, text_scores in zip(ids, predictions):\n",
+ "        out.write(f\"{str(uid)}\\t{str(text_scores)}\\n\")\n",
+ "\n",
+ "# Zip the predictions\n",
+ "! zip -r spans-pred.zip ./spans-pred.*\n",
+ "! rm spans-pred.txt\n",
+ "! mv spans-pred.zip ../spans-pred.zip"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ },
+ "toc-autonumbering": false
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
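
The `f1` helper imported from `utils.processing` scores each post over character offsets. A minimal sketch of such a metric, assuming the SemEval-2021 Task 5 definition of span-level F1 (the actual helper may handle edge cases differently; `span_f1` is a hypothetical name):

def span_f1(pred, gold):
    # Character-offset F1: overlap between predicted and gold toxic offsets
    if len(gold) == 0:
        return 1.0 if len(pred) == 0 else 0.0  # assumed empty-span convention
    if len(pred) == 0:
        return 0.0
    common = len(set(pred) & set(gold))
    precision = common / len(set(pred))
    recall = common / len(set(gold))
    return 0.0 if common == 0 else 2 * precision * recall / (precision + recall)

# Example: pred [4, 5, 6] vs. gold [5, 6, 7] gives P = R = 2/3, so F1 = 2/3.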
notebooks/Training LSTM-bidirectional.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/models/toxic_speech.crfsuite ADDED
Binary file (440 kB). View file
 
notebooks/utils/__pycache__/lstm.cpython-39.pyc ADDED
Binary file (6.14 kB). View file
 
notebooks/utils/__pycache__/processing.cpython-39.pyc ADDED
Binary file (3.92 kB). View file
 
notebooks/utils/basic_models.py ADDED
@@ -0,0 +1,58 @@
+ from nltk import pos_tag
+ from nltk.tokenize import word_tokenize
+
+ # The following is for the CRF, which we no longer use; we focused on the LSTM instead
+
+ def word2features(sent, i):
+     word = sent[i][0]
+     postag = sent[i][1]
+     features = [
+         'bias',
+         'word.lower=' + word.lower(),
+         'word[-3:]=' + word[-3:],
+         'word[-2:]=' + word[-2:],
+         'word.isupper=%s' % word.isupper(),
+         'word.istitle=%s' % word.istitle(),
+         'word.isdigit=%s' % word.isdigit(),
+         'postag=' + postag,
+         'postag[:2]=' + postag[:2],
+     ]
+     if i > 0:
+         word1 = sent[i-1][0]
+         postag1 = sent[i-1][1]
+         features.extend([
+             '-1:word.lower=' + word1.lower(),
+             '-1:word.istitle=%s' % word1.istitle(),
+             '-1:word.isupper=%s' % word1.isupper(),
+             '-1:postag=' + postag1,
+             '-1:postag[:2]=' + postag1[:2],
+         ])
+     else:
+         features.append('BOS')
+
+     if i < len(sent)-1:
+         word1 = sent[i+1][0]
+         postag1 = sent[i+1][1]
+         features.extend([
+             '+1:word.lower=' + word1.lower(),
+             '+1:word.istitle=%s' % word1.istitle(),
+             '+1:word.isupper=%s' % word1.isupper(),
+             '+1:postag=' + postag1,
+             '+1:postag[:2]=' + postag1[:2],
+         ])
+     else:
+         features.append('EOS')
+
+     return features
+
+ def sent2features(sent):
+     return [word2features(sent, i) for i in range(len(sent))]
+
+ def sent2labels(sent):
+     return [label for token, postag, label in sent]
+
+ def sent2tokens(sent):
+     return [token for token, postag, label in sent]
+
+ def token_postag_label(sentence):
+     return pos_tag(word_tokenize(sentence))
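
These feature helpers follow the classic python-crfsuite recipe. A hypothetical usage sketch (the original CRF training code is not part of this commit; `train_crf` and `tag_sentence` are illustrative names, and the model path mirrors the committed `notebooks/models/toxic_speech.crfsuite`):

import pycrfsuite

def train_crf(labeled_sentences, model_path='models/toxic_speech.crfsuite'):
    # labeled_sentences: lists of (token, postag, label) triples
    trainer = pycrfsuite.Trainer(verbose=False)
    for sent in labeled_sentences:
        trainer.append(sent2features(sent), sent2labels(sent))
    trainer.train(model_path)

def tag_sentence(sentence, model_path='models/toxic_speech.crfsuite'):
    # POS-tag the raw sentence, featurize it, and predict a label per token
    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)
    sent = token_postag_label(sentence)  # [(token, postag), ...]
    labels = tagger.tag(sent2features(sent))
    return list(zip([token for token, _ in sent], labels))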
notebooks/utils/lstm.py ADDED
@@ -0,0 +1,235 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.optim as optim
8
+
9
+ from torchtext.data import Field
10
+
11
+ from .processing import separate_words, f1_scores
12
+
13
+ import spacy
14
+ import ast
15
+
16
+ from tqdm import tqdm
17
+ from IPython.display import clear_output
18
+
19
+ nlp = spacy.load('en_core_web_md')
20
+ dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'
21
+ torch.manual_seed(42)
22
+ torch.backends.cudnn.deterministic = True
23
+
24
+ # SpaCy hace cosas no deseadas con algunas palabras al tokenizar, como don't -> [do, n't], pero se puede corregir.
25
+ # Pero de acuerdo a SpaCy esa es la convención, además, eso se debería codificar en los embeddings, así que se quede
26
+ # así, sólo hay que usar el mismo tokenizador en Field de torchtext (permite el de SpaCy entre otros).
27
+
28
+ # from spacy.symbols import ORTH, LEMMA, POS
29
+ # nlp.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "not"}])
30
+ # nlp.tokenizer.add_special_case("don't", [{ORTH: "don't"}])
31
+ # nlp.tokenizer.add_special_case("doesn't", [{ORTH: "does"}, {ORTH: "not"}])
32
+
33
+ def spacy_tokenizer (text):
34
+ return [str(token) for token in nlp(text)]
35
+
36
+ def prepare_data(spans, texts):
37
+ data = []
38
+ for index, text in tqdm(zip(spans, texts), total=len(texts)):
39
+ toxic_words = [text[i[0]:i[-1]+1] for i in separate_words(index) if len(index) > 0]
40
+
41
+ tokens = spacy_tokenizer(text)
42
+ tagged_tokens = []
43
+
44
+ for token in tokens:
45
+ if token in toxic_words:
46
+ tagged_tokens.append('toxic')
47
+ # Removemos en caso de que se repita posteriormente pero esté como 'non_toxic'
48
+ toxic_words.remove(token)
49
+ else:
50
+ tagged_tokens.append('non_toxic')
51
+
52
+ data.append((tokens, tagged_tokens, text, index))
53
+
54
+ return data
55
+
56
+ def get_vocab(train_df):
57
+ train_df['text'] = train_df['text'].apply(lambda x:x.lower())
58
+
59
+ # Aquí había un problema, estábamos usando 2 tokenizadores diferentes para sacar los
60
+ # embeddings y para preprocesar el texto para entrenar. Pondré el de SpaCy como
61
+ # tokenizador en común con el corpus de 'en_core_web_md'
62
+ text_field = Field(
63
+ tokenize='spacy',
64
+ tokenizer_language='en_core_web_md',
65
+ lower=True
66
+ )
67
+ # sadly have to apply preprocess manually
68
+ preprocessed_text = train_df['text'].apply(lambda x: text_field.preprocess(x))
69
+ # load fastext simple embedding with 200d
70
+ text_field.build_vocab(
71
+ preprocessed_text,
72
+ vectors='glove.twitter.27B.200d'
73
+ )
74
+ # get the vocab instance
75
+ vocab = text_field.vocab
76
+
77
+ return vocab
78
+
79
+ def plot_loss_and_score(train_loss, test_loss, f1_scores_train, f1_scores_test, show=True):
80
+ _, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(18,7))
81
+
82
+ ax0.plot(np.arange(1, len(train_loss) + 1), train_loss, marker='o', label='Train loss')
83
+ ax0.plot(np.arange(1, len(test_loss) + 1), test_loss, marker='o', label='Test loss')
84
+ ax0.set_xlabel(r'\textbf{Epochs}',size=16)
85
+ ax0.set_ylabel(r'\textbf{Loss}', size=16)
86
+ ax0.tick_params(labelsize=14)
87
+ ax0.legend(fontsize=14)
88
+
89
+ ax1.plot(np.arange(1, len(f1_scores_train) + 1), f1_scores_train,
90
+ marker='o', label='F1 score in train')
91
+ ax1.plot(np.arange(1, len(f1_scores_test) + 1), f1_scores_test,
92
+ marker='o', label='F1 score in test')
93
+ ax1.set_xlabel(r'\textbf{Epochs}',size=16)
94
+ ax1.set_ylabel(r'\textbf{F1 score}', size=16)
95
+ ax1.tick_params(labelsize=14)
96
+ ax1.legend(fontsize=14)
97
+
98
+ title = 'train-F1: {:.4f} \n test-F1: {:.4f}'.format(np.max(f1_scores_train), np.max(f1_scores_test))
99
+ ax1.set_title(title, fontweight='bold', size=16)
100
+
101
+
102
+ if show:
103
+ plt.show()
104
+
+ # WTF Mario, this is a mess
+ def train_model(model, trainloader, testloader, stop_after_best, savefile):
+     criterion = nn.BCELoss()
+     optimizer = optim.Adam(model.parameters())
+
+     loss_per_epoch = [0]
+     training_loss = [0]
+     f1_scores_train = [0]
+     f1_scores_dev = [0]
+     best_l = None
+     best_tl = None
+     worst_l = None
+     worst_tl = None
+     worst_l_f1 = None
+     best_l_f1 = None
+     worst_tl_f1 = None
+     last_epoch_save = 0
+
+     epochs_without_change = 0
+     epochs = len(loss_per_epoch)
+
+     while epochs_without_change < stop_after_best:
+         clear_output(wait=True)
+
+         print("Training on: " + torch.cuda.get_device_name(torch.cuda.current_device()))
+         print("###############################################")
+         print("Current epoch: " + str(epochs))
+         print("Last model save was in epoch " + str(last_epoch_save))
+         print("Stopping training in: " + str(stop_after_best - epochs_without_change) + " epochs.")
+         print("###############################################")
+         print("[Best iter] training F1 is: " + str(best_tl))
+         print("[Best iter] dev F1 is: " + str(best_l))
+         print("###############################################")
+         print("[Last iter] training F1 was: " + str(f1_scores_train[-1]))
+         print("[Last iter] dev. F1 was: " + str(f1_scores_dev[-1]))
+         print("###############################################")
+
+         # Plot what we have so far
+         plot_loss_and_score(training_loss, loss_per_epoch, f1_scores_train, f1_scores_dev, show=True)
+
+         tl = 0
+         t_pred_l = []
+         t_true_index_l = []
+         t_tokenized_l = []
+         t_text_l = []
+
+         for _, v in tqdm(enumerate(trainloader), total=len(trainloader)):  # Not using batches yet
+             text = torch.reshape(v['text'], (-1,))
+             tags = torch.reshape(v['spans'], (-1,))
+             optimizer.zero_grad()
+             tag_scores = model(text)
+
+             # For the F1 score
+             t_pred_l.append(tag_scores.cpu().detach().numpy())
+             t_true_index_l.append([a.cpu().detach().numpy()[0] for a in v['true_index']])
+             t_tokenized_l.append([a[0] for a in v['tokenized']])
+             t_text_l.append(v['original_text'][0])
+
+             loss = criterion(torch.reshape(tag_scores, (-1,)), torch.reshape(tags, (-1,)).float())
+             tl += loss.item()
+             loss.backward()
+             optimizer.step()
+
+         tl /= len(trainloader)
+         l = 0
+         print("Starting evaluation for loss function.")
+         # evaluate the model
+         pred_l = []
+         true_index_l = []
+         tokenized_l = []
+         text_l = []
+
+         model.eval()
+         with torch.no_grad():
+             for v in testloader:
+                 text = torch.reshape(v['text'], (-1,))
+                 tags = torch.reshape(v['spans'], (-1,))
+
+                 tag_scores = model(text)
+
+                 # For the F1 score
+                 pred_l.append(tag_scores.cpu().detach().numpy())
+                 true_index_l.append([a.cpu().detach().numpy()[0] for a in v['true_index']])
+                 tokenized_l.append([a[0] for a in v['tokenized']])
+                 text_l.append(v['original_text'][0])
+
+                 loss = criterion(torch.reshape(tag_scores, (-1,)), torch.reshape(tags, (-1,)).float())
+                 l += loss.item()
+
+         model.train()
+         l /= len(testloader)
+         print("Starting evaluation for dev F1")
+         f1_d = f1_scores(pred_l, true_index_l, tokenized_l, text_l)
+         # This is approximate, but it is only a reference
+         f1_t = f1_scores(t_pred_l, t_true_index_l, t_tokenized_l, t_text_l)
+
+         epochs_without_change += 1
+         if best_l is None or best_l < f1_d:
+             print("Model improved, saving.")
+             torch.save(model, savefile)
+             best_l = f1_d
+             best_tl = f1_t
+             epochs_without_change = 0
+             last_epoch_save = epochs
+             print("Model improved, saved.")
+
+         # To keep the plots on a consistent scale.
+         if worst_l_f1 is None or f1_d < worst_l_f1:
+             worst_l_f1 = f1_d
+             f1_scores_dev[0] = worst_l_f1
+         if worst_tl_f1 is None or f1_t < worst_tl_f1:
+             worst_tl_f1 = f1_t
+             f1_scores_train[0] = worst_tl_f1
+         if worst_tl is None or tl > worst_tl:
+             worst_tl = tl
+             training_loss[0] = worst_tl
+         if worst_l is None or l > worst_l:
+             worst_l = l
+             loss_per_epoch[0] = worst_l
+
+         # Track the losses
+         loss_per_epoch.append(l)
+         training_loss.append(tl)
+         f1_scores_train.append(f1_t)
+         f1_scores_dev.append(f1_d)
+         # Track the current epoch
+         epochs += 1
+
+     print('Finished Training')
+
+     return loss_per_epoch, training_loss, f1_scores_train, f1_scores_dev
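+
+ # Hypothetical call (argument values are illustrative only):
+ # losses, train_losses, f1_tr, f1_dev = train_model(
+ #     model, trainloader, testloader, stop_after_best=10, savefile='best-model.pt')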
notebooks/utils/processing.py ADDED
@@ -0,0 +1,129 @@
+ from termcolor import colored
+ import string
+
+ def color_toxic_words(index, text, html=False):
+     colored_string = ''
+     if not html:
+         for i, x in enumerate(text):
+             if i in index:
+                 colored_string += colored(x, on_color='on_red')
+             else:
+                 colored_string += colored(x)
+     else:
+         for i, x in enumerate(text):
+             if i in index:
+                 colored_string += f'<span style="background-color: #FF0000">{x}</span>'
+             else:
+                 colored_string += x
+
+     return colored_string
+
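+ # Example: color_toxic_words([0, 1, 2, 3], "dumb idea", html=True) wraps each
+ # of the first four characters ("dumb") in a red-highlighted <span>.
+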
+ def remove_symbols(index, text):
+     """
+     Remove the indices that correspond to 'non-toxic' symbols, such as
+     whitespace, commas, periods, etc.
+     """
+     index_clean = []
+     for i in index:
+         x = text[i]
+         if x not in ('"()+,-./:;<=>[\\]^_`{|}~' + string.whitespace):
+             index_clean.append(i)
+
+     return index_clean
+
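+ # Example: remove_symbols([0, 1, 2, 3, 4], "dumb,") -> [0, 1, 2, 3]
+ # (the trailing comma at index 4 is dropped).
+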
+ def completely_toxic(span, text):
+     if span == []:
+         return [i for i in range(len(text))]
+     else:
+         return span
+
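+ # Example: completely_toxic([], "ugh") -> [0, 1, 2]; non-empty spans pass through.
+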
+ def separate_words(indices):
+     """
+     Split the indices into groups, one per word.
+     """
+
+     toxic_words_indices = []
+     m = 0
+     for i, (j, k) in enumerate(zip(indices[0:-1], indices[1:])):
+         if k - j != 1:
+             toxic_words_indices.append(indices[m:i+1])
+             m = i + 1
+     toxic_words_indices.append(indices[m:])  # Last word
+
+     return toxic_words_indices
+
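+ # Example: consecutive runs become separate words:
+ # separate_words([3, 4, 5, 9, 10]) -> [[3, 4, 5], [9, 10]]
+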
+ def postprocessing(indices_list, delta=7):
+     """
+     Mark as toxic the characters between two toxic words when the gap
+     between them is at most delta.
+     """
+
+     # Assuming the indices are integers.
+     if len(indices_list) > 1:
+         l = sorted(indices_list)
+         new_list = []
+         for i in range(len(indices_list) - 1):
+             # Add the existing index
+             new_list.append(l[i])
+             # If the gap between this index and the next one is small enough,
+             # select all the indices in between
+             if (l[i+1] - l[i]) <= delta:
+                 new_list = new_list + list(range(l[i]+1, l[i+1]))
+
+         new_list.append(l[-1])  # The last element
+         return new_list
+     else:
+         return indices_list
+
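+ # Example: with the default delta=7, postprocessing([0, 1, 8, 9]) fills the
+ # gap (8 - 1 = 7 <= delta) and returns [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
+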
+ def get_index_toxic_words(sentence, tagged_sentence, delta=7):
+     toxic_indices = []
+     m = 0
+     # tag_to_ix = {"non_toxic": 0, "toxic": 1}
+     for word_tag in tagged_sentence:
+         word, tag = word_tag
+         if tag == 1:  # toxic
+             # If the toxic word appears two or more times this would only give
+             # the first occurrence; it should be fixed, but it works for now
+             # word_indices = [sentence.find(word) + i for i in range(len(word))]
+             # toxic_indices.append(word_indices)
+
+             # This seems to avoid the repeated-word problem by searching only
+             # from position m onwards (note: str.find returns -1 if the token
+             # is not a substring of the sentence, which would corrupt the indices)
+             word_indices = [m + sentence[m:].find(word) + i for i in range(len(word))]
+             toxic_indices.append(word_indices)
+             # This fixes the 'stupidity' bug
+             m += sentence[m:].find(word) + len(word)
+
+     toxic_indices = [val for sublist in toxic_indices for val in sublist]
+
+     # Merge across whitespace and other characters to raise the F1
+     return postprocessing(toxic_indices, delta)
+
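+ # Example: get_index_toxic_words("you fool", [("you", 0), ("fool", 1)])
+ # returns [4, 5, 6, 7], the character offsets of "fool".
+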
+ def f1(predictions, gold):
+     """
+     F1 (a.k.a. DICE) operating on two lists of offsets (e.g., character).
+     >>> assert f1([0, 1, 4, 5], [0, 1, 6]) == 0.5714285714285714
+     :param predictions: a list of predicted offsets
+     :param gold: a list of offsets serving as the ground truth
+     :return: a score between 0 and 1
+     """
+     if len(gold) == 0:
+         return 1. if len(predictions) == 0 else 0.
+     if len(predictions) == 0:
+         return 0.
+     predictions_set = set(predictions)
+     gold_set = set(gold)
+     nom = 2 * len(predictions_set.intersection(gold_set))
+     denom = len(predictions_set) + len(gold_set)
+     return float(nom)/float(denom)
+
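+ # Worked example (the doctest above): the prediction {0, 1, 4, 5} and the gold
+ # {0, 1, 6} share {0, 1}, so F1 = 2*2 / (4 + 3) = 4/7 ≈ 0.5714285714285714.
+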
+ def f1_scores(pred, true_index, tokenized, text, threshold=0.5):
+     scores = 0
+     for i in range(len(pred)):
+         tags = [1 if x > threshold else 0 for x in pred[i]]
+         tagged_sentence = list(zip(tokenized[i], tags))
+         prediction_index = get_index_toxic_words(text[i], tagged_sentence)
+         scores += f1(prediction_index, true_index[i])
+     return scores/len(pred)
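+
+ # This is what train_model calls once per epoch, e.g.:
+ # f1_d = f1_scores(pred_l, true_index_l, tokenized_l, text_l)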