mario committed • c68f5ab
Parent(s): d81e7c2

Added deps
Files changed:
- notebooks/HMM and CRF.ipynb +0 -0
- notebooks/Results LSTM.ipynb +461 -0
- notebooks/Training LSTM-bidirectional.ipynb +0 -0
- notebooks/models/toxic_speech.crfsuite +0 -0
- notebooks/utils/__pycache__/lstm.cpython-39.pyc +0 -0
- notebooks/utils/__pycache__/processing.cpython-39.pyc +0 -0
- notebooks/utils/basic_models.py +58 -0
- notebooks/utils/lstm.py +235 -0
- notebooks/utils/processing.py +129 -0
notebooks/HMM and CRF.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
notebooks/Results LSTM.ipynb
ADDED
@@ -0,0 +1,461 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Toxic Spans Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "\n",
    "import spacy\n",
    "import ast\n",
    "from termcolor import colored\n",
    "\n",
    "from tqdm import tqdm\n",
    "import gdown\n",
    "\n",
    "from utils.processing import get_index_toxic_words, color_toxic_words, f1\n",
    "from utils.lstm import spacy_tokenizer, get_vocab\n",
    "\n",
    "sns.set_style('darkgrid')\n",
    "dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# To plot using LaTeX; this sometimes causes trouble, in which case comment out these two lines\n",
    "plt.rc('text', usetex=True)\n",
    "plt.rc('font', family='serif')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results\n",
    "\n",
    "Of the different preprocessing variants, the one with the highest F1 score on `test` is treating posts annotated with [ ] as completely toxic (**best-model-try2.pt**, *train* = 0.6498, *test* = 0.6526), so we will use that one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('../data/tsd_train.csv', converters={'spans': ast.literal_eval})\n",
    "test = pd.read_csv('../data/tsd_trial.csv', converters={'spans': ast.literal_eval})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Our embeddings\n",
    "vocab = get_vocab(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class LSTMTagger(nn.Module):\n",
    "\n",
    "    def __init__(self, embedding_dim, stacked_layers, dropout_p, weight, hidden_dim, vocab_size):\n",
    "        super(LSTMTagger, self).__init__()\n",
    "        self.hidden_dim = hidden_dim  # Hidden state dimension for each direction of the LSTM\n",
    "        self.stacked_layers = stacked_layers  # Number of stacked LSTM layers\n",
    "\n",
    "        self.word_embeddings = nn.Embedding.from_pretrained(weight)\n",
    "        self.lstm = nn.LSTM(embedding_dim,\n",
    "                            hidden_dim,\n",
    "                            num_layers=stacked_layers,\n",
    "                            dropout=dropout_p,\n",
    "                            bidirectional=True)\n",
    "\n",
    "        # Linear layers\n",
    "        self.fc1 = nn.Linear(hidden_dim*2, 1)  # 2x hidden_dim because the LSTM is bidirectional\n",
    "\n",
    "    def forward(self, sentence):\n",
    "        embeds = self.word_embeddings(sentence)\n",
    "        output, _ = self.lstm(embeds.view(len(sentence), 1, -1))\n",
    "        x = torch.sigmoid(self.fc1(output.view(len(sentence), -1)))\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_sequence(seq):\n",
    "    idxs = vocab.lookup_indices(seq)  # Unknown tokens are mapped to 0\n",
    "    return torch.tensor(idxs, dtype=torch.long, device=dev)\n",
    "\n",
    "def prepare_sequence_tags(seq):\n",
    "    tag_to_ix = {\"non_toxic\": 0, \"toxic\": 1}\n",
    "    idxs = [tag_to_ix[s] for s in seq]\n",
    "    return torch.tensor(idxs, dtype=torch.long, device=dev)\n",
    "\n",
    "def tagger_LSTM(text, threshold=0.5):\n",
    "    \"\"\"\n",
    "    Tags the text with the model we trained.\n",
    "    \"\"\"\n",
    "    ix_to_tag = {0: 'non_toxic', 1: 'toxic'}\n",
    "    words = spacy_tokenizer(text.lower())  # Lowercasing seems to work better\n",
    "\n",
    "    with torch.no_grad():\n",
    "        inputs = prepare_sequence(words)\n",
    "        tag_scores = model(inputs)\n",
    "\n",
    "    tags = [1 if x > threshold else 0 for x in tag_scores]\n",
    "    tagged_sentence = list(zip(words, tags))\n",
    "\n",
    "    return tagged_sentence"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The model file is a bit too heavy for GitHub, so we host it on Google Drive and download it from there."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = 'https://drive.google.com/uc?id=1KO-QXUBfwzjauWLhiVi9StD3y0GtiBbj'\n",
    "output = 'models/best-model.pt'\n",
    "\n",
    "gdown.download(url, output, quiet=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LSTMTagger(\n",
       "  (word_embeddings): Embedding(19611, 200)\n",
       "  (lstm): LSTM(200, 600, num_layers=6, dropout=0.2, bidirectional=True)\n",
       "  (fc1): Linear(in_features=1200, out_features=1, bias=True)\n",
       ")"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = torch.load(output)\n",
    "model.to(torch.device(dev))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 Pred: Because he's a «moron» and a «bigot». It's not any more complicated than that.\n",
      "0 Gold: Because he's a «moron» and a «bigot». It's not any more complicated than that.\n",
      "\n",
      "\n",
      "1 Pred: How about we stop protecting «idiots» and let nature add some bleach to the gene pool. We can always submit their names for the Darwin awards.\n",
      "1 Gold: How about we stop protecting «idiots» and let nature add some bleach to the gene pool. We can always submit their names for the Darwin awards.\n",
      "\n",
      "\n",
      "2 Pred: If people  were  smart, they would  Boycott this  inept  airline,  but   they  are  not  smart,  so   rogue  businesses  like  this  one,   still thrive   taking the «idiots»  for  a ride...\n",
      "2 Gold: If people  were  smart, they would  Boycott this  inept  airline,  but   they  are  not  smart,  so   rogue  businesses  like  this  one,   still thrive   taking the «idiots»  for  a ride...\n",
      "\n",
      "\n",
      "3 Pred: Trump Claimed that Russia will never invade the Ukraine, when Russia already has - how «stupid» can people be?\n",
      "3 Gold: Trump Claimed that Russia will never invade the Ukraine, when Russia already has - how «stupid» can people be?\n",
      "\n",
      "\n",
      "4 Pred: As long as your willing to pay a lot more for products you buy, then fine.\n",
      "But you better not be going to Costco and Walmart to buy stuff because it's cheaper.\n",
      "If so, we get to call you a «hypocritical» wanker.\n",
      "4 Gold: As long as your willing to pay a lot more for products you buy, then fine.\n",
      "But you better not be going to Costco and Walmart to buy stuff because it's cheaper.\n",
      "If so, we get to call you a hypocritical wanker.\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "indices_test = []\n",
    "for i, (gold_index, text) in enumerate(zip(test['spans'], test['text'])):\n",
    "    tagged_sentence = tagger_LSTM(text)\n",
    "    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)\n",
    "    indices_test.append(prediction_index)\n",
    "\n",
    "    if i < 5:\n",
    "        print(str(i) + colored(' Pred: ', color='cyan', attrs=['bold']) +\n",
    "              color_toxic_words(prediction_index, text))\n",
    "        print(str(i) + colored(' Gold: ', color='yellow', attrs=['bold']) +\n",
    "              color_toxic_words(gold_index, text) + '\\n'*2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1 in test: 0.648722\n"
     ]
    }
   ],
   "source": [
    "score_test = [f1(pred, gold) for pred, gold in zip(indices_test, test['spans'])]\n",
    "print('F1 in test: {:.6f}'.format(np.mean(score_test)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prediction on the *evaluation* dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluation = pd.read_csv('../data/tsd_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 Pred: That's right. They are not normal. And I am starting from the premise that they are ABNORMAL. Proceed wth the typical «racist, bigot, sexist rubbish».\n",
      "Thanks!\n",
      "\n",
      "1 Pred: \"Watch people die from taking away their healthcare\"\n",
      "\n",
      "DING DING DING!  Winner of «stupid» post of the day award!\n",
      "\n",
      "2 Pred: tens years ago i contacted the PDR and suggested that the time might be good to work with alaska on building a gas line.. alaska rejected them without even consideration despite china being flush with cash and hungry for gas.. and set up another infamous boondoggle.. the transcanada-exxon rip off that we are still paying for and have yet to receive anything of value.. hundreds of millions of dollars on studies.. and buyouts.. i hope china says f you alaska.. you are nothing but «ignorant» people..\n",
      "\n",
      "3 Pred: The parallels between the ANC and the Sicilian Mafia are glaring. The ANC has always been run by a few \"families\" who treat the state as 'turf' ; as just one big piggy bank for their self-enrichment. The government basically believes we all just work for them. They aren't a democratic government at all, but use the appearances of democracy to give their entitlement practices a mask of legitimacy. The poor and «ignorant» have been fooled for so long, but people are slowly coming to the ANC for what it is: a self-serving Mafia!\n",
      "\n",
      "4 Pred: Intel Community: ‘How can we work for a President who Undermines our work?’\n",
      "\n",
      "Days before taking office, he compared America’s spies to Adolf Hitler’s «Gestapo».\n",
      "\n",
      "American spies do not like having their work dismissed by the president. Nor do they appreciate comparisons to «Nazi» Germany.\n",
      "\n",
      "Former CIA Director John Brennan made it clear: “The person who said that should be ashamed of himself,” Brennan said.\n",
      "\n",
      "Coats’ predecessor, Gen. James Clapper, called Trump’s comments “a terrible, insulting affront … completely inappropriate.”\n",
      "\n",
      "Trump’s opinions are out of step with the conclusions of the agencies he is supposed to be leading.\n",
      "\n",
      "Not surprising in a treasonous traitor.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "indices_evaluation = []\n",
    "for i, text in enumerate(evaluation['text']):\n",
    "    tagged_sentence = tagger_LSTM(text)\n",
    "    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)\n",
    "    indices_evaluation.append(prediction_index)\n",
    "\n",
    "    if i < 5:\n",
    "        print(str(i) + colored(' Pred: ', color='cyan', attrs=['bold']) +\n",
    "              color_toxic_words(prediction_index, text) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>spans</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[118, 119, 120, 121, 122, 123, 124, 125, 126, ...</td>\n",
       "      <td>That's right. They are not normal. And I am st...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[81, 82, 83, 84, 85, 86]</td>\n",
       "      <td>\"Watch people die from taking away their healt...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[483, 484, 485, 486, 487, 488, 489, 490]</td>\n",
       "      <td>tens years ago i contacted the PDR and suggest...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[413, 414, 415, 416, 417, 418, 419, 420]</td>\n",
       "      <td>The parallels between the ANC and the Sicilian...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[150, 151, 152, 153, 154, 155, 156, 271, 272, ...</td>\n",
       "      <td>Intel Community: ‘How can we work for a Presid...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               spans  \\\n",
       "0  [118, 119, 120, 121, 122, 123, 124, 125, 126, ...   \n",
       "1                           [81, 82, 83, 84, 85, 86]   \n",
       "2           [483, 484, 485, 486, 487, 488, 489, 490]   \n",
       "3           [413, 414, 415, 416, 417, 418, 419, 420]   \n",
       "4  [150, 151, 152, 153, 154, 155, 156, 271, 272, ...   \n",
       "\n",
       "                                                text  \n",
       "0  That's right. They are not normal. And I am st...  \n",
       "1  \"Watch people die from taking away their healt...  \n",
       "2  tens years ago i contacted the PDR and suggest...  \n",
       "3  The parallels between the ANC and the Sicilian...  \n",
       "4  Intel Community: ‘How can we work for a Presid...  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluation['spans'] = indices_evaluation\n",
    "evaluation = evaluation[['spans', 'text']]\n",
    "evaluation.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For the evaluation, a zip containing a txt file must be submitted, built as follows (at the end, upload the `spans-pred.zip` file this produces):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  adding: spans-pred.txt (deflated 84%)\n"
     ]
    }
   ],
   "source": [
    "predictions = evaluation['spans'].tolist()\n",
    "ids = evaluation.index.tolist()\n",
    "\n",
    "with open(\"spans-pred.txt\", \"w\") as out:\n",
    "    for uid, text_scores in zip(ids, predictions):\n",
    "        out.write(f\"{str(uid)}\\t{str(text_scores)}\\n\")\n",
    "\n",
    "# Zip the predictions\n",
    "! zip -r spans-pred.zip ./spans-pred.*\n",
    "! rm spans-pred.txt\n",
    "! mv spans-pred.zip ../spans-pred.zip"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc-autonumbering": false
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
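The notebook above leans on helpers from notebooks/utils/processing.py (+129 lines in this commit). Since their bodies are not rendered here, the following is only a minimal sketch of what they presumably compute: get_index_toxic_words mapping toxic-tagged tokens back to character offsets, and f1 being the character-offset F1 from SemEval-2021 Task 5. Treat both as assumptions, not the file's actual contents.

def get_index_toxic_words(text, tagged_sentence):
    # Map (token, tag) pairs back to character offsets in the original text.
    indices, cursor = [], 0
    for word, tag in tagged_sentence:
        start = text.find(word, cursor)
        if start == -1:  # token not found verbatim (tokenizer artifact), skip it
            continue
        if tag == 1:  # 1 = toxic
            indices.extend(range(start, start + len(word)))
        cursor = start + len(word)
    return indices

def f1(pred, gold):
    # Character-offset F1: harmonic mean of precision and recall over offset sets.
    if len(gold) == 0:
        return 1.0 if len(pred) == 0 else 0.0
    if len(pred) == 0:
        return 0.0
    overlap = len(set(pred) & set(gold))
    precision = overlap / len(set(pred))
    recall = overlap / len(set(gold))
    return 0.0 if overlap == 0 else 2 * precision * recall / (precision + recall)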
notebooks/Training LSTM-bidirectional.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
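This training notebook's diff is too large to render, but the model summary printed in the results notebook pins down the hyperparameters. A hedged sketch of how the tagger was presumably instantiated during training (variable names are illustrative, not taken from the file):

vocab = get_vocab(train)                # 19,611 tokens per the printed summary
weight = vocab.vectors                  # pretrained 200d GloVe Twitter vectors

model = LSTMTagger(embedding_dim=200,   # Embedding(19611, 200)
                   stacked_layers=6,    # LSTM(..., num_layers=6, ...)
                   dropout_p=0.2,
                   weight=weight,
                   hidden_dim=600,      # LSTM(200, 600) gives fc1 in_features=1200
                   vocab_size=len(vocab))
model.to(torch.device(dev))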
notebooks/models/toxic_speech.crfsuite
ADDED
Binary file (440 kB).
notebooks/utils/__pycache__/lstm.cpython-39.pyc
ADDED
Binary file (6.14 kB).
notebooks/utils/__pycache__/processing.cpython-39.pyc
ADDED
Binary file (3.92 kB).
notebooks/utils/basic_models.py
ADDED
@@ -0,0 +1,58 @@
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# The following is for the CRF, which we no longer use; we focused on the LSTM instead.

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')

    if i < len(sent) - 1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

def token_postag_label(sentence):
    return pos_tag(word_tokenize(sentence))
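These feature functions follow python-crfsuite's conventions, and the commit also ships the trained notebooks/models/toxic_speech.crfsuite binary. A sketch of how training and tagging presumably looked; train_sents is an assumed name for a list of (token, postag, label) sentences, and the exact trainer parameters are unknown:

import pycrfsuite

trainer = pycrfsuite.Trainer(verbose=False)
for sent in train_sents:  # each sent: [(token, postag, label), ...]
    trainer.append(sent2features(sent), sent2labels(sent))
trainer.train('models/toxic_speech.crfsuite')

tagger = pycrfsuite.Tagger()
tagger.open('models/toxic_speech.crfsuite')
tagger.tag(sent2features(token_postag_label('you are a moron')))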
notebooks/utils/lstm.py
ADDED
@@ -0,0 +1,235 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.data import Field

from .processing import separate_words, f1_scores

import spacy
import ast

from tqdm import tqdm
from IPython.display import clear_output

nlp = spacy.load('en_core_web_md')
dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True

# spaCy does some unwanted things with certain words when tokenizing, such as don't -> [do, n't],
# which could be corrected. But according to spaCy that is the convention, and it should be
# encoded in the embeddings anyway, so we leave it as is; we just have to use the same tokenizer
# in torchtext's Field (it supports spaCy's, among others).

# from spacy.symbols import ORTH, LEMMA, POS
# nlp.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "not"}])
# nlp.tokenizer.add_special_case("don't", [{ORTH: "don't"}])
# nlp.tokenizer.add_special_case("doesn't", [{ORTH: "does"}, {ORTH: "not"}])

def spacy_tokenizer(text):
    return [str(token) for token in nlp(text)]

def prepare_data(spans, texts):
    data = []
    for index, text in tqdm(zip(spans, texts), total=len(texts)):
        toxic_words = [text[i[0]:i[-1]+1] for i in separate_words(index) if len(index) > 0]

        tokens = spacy_tokenizer(text)
        tagged_tokens = []

        for token in tokens:
            if token in toxic_words:
                tagged_tokens.append('toxic')
                # Remove it in case the same word appears again later but tagged 'non_toxic'
                toxic_words.remove(token)
            else:
                tagged_tokens.append('non_toxic')

        data.append((tokens, tagged_tokens, text, index))

    return data
56 |
+
def get_vocab(train_df):
|
57 |
+
train_df['text'] = train_df['text'].apply(lambda x:x.lower())
|
58 |
+
|
59 |
+
# Aquí había un problema, estábamos usando 2 tokenizadores diferentes para sacar los
|
60 |
+
# embeddings y para preprocesar el texto para entrenar. Pondré el de SpaCy como
|
61 |
+
# tokenizador en común con el corpus de 'en_core_web_md'
|
62 |
+
text_field = Field(
|
63 |
+
tokenize='spacy',
|
64 |
+
tokenizer_language='en_core_web_md',
|
65 |
+
lower=True
|
66 |
+
)
|
67 |
+
# sadly have to apply preprocess manually
|
68 |
+
preprocessed_text = train_df['text'].apply(lambda x: text_field.preprocess(x))
|
69 |
+
# load fastext simple embedding with 200d
|
70 |
+
text_field.build_vocab(
|
71 |
+
preprocessed_text,
|
72 |
+
vectors='glove.twitter.27B.200d'
|
73 |
+
)
|
74 |
+
# get the vocab instance
|
75 |
+
vocab = text_field.vocab
|
76 |
+
|
77 |
+
return vocab
|
78 |
+
|
79 |
+
def plot_loss_and_score(train_loss, test_loss, f1_scores_train, f1_scores_test, show=True):
|
80 |
+
_, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(18,7))
|
81 |
+
|
82 |
+
ax0.plot(np.arange(1, len(train_loss) + 1), train_loss, marker='o', label='Train loss')
|
83 |
+
ax0.plot(np.arange(1, len(test_loss) + 1), test_loss, marker='o', label='Test loss')
|
84 |
+
ax0.set_xlabel(r'\textbf{Epochs}',size=16)
|
85 |
+
ax0.set_ylabel(r'\textbf{Loss}', size=16)
|
86 |
+
ax0.tick_params(labelsize=14)
|
87 |
+
ax0.legend(fontsize=14)
|
88 |
+
|
89 |
+
ax1.plot(np.arange(1, len(f1_scores_train) + 1), f1_scores_train,
|
90 |
+
marker='o', label='F1 score in train')
|
91 |
+
ax1.plot(np.arange(1, len(f1_scores_test) + 1), f1_scores_test,
|
92 |
+
marker='o', label='F1 score in test')
|
93 |
+
ax1.set_xlabel(r'\textbf{Epochs}',size=16)
|
94 |
+
ax1.set_ylabel(r'\textbf{F1 score}', size=16)
|
95 |
+
ax1.tick_params(labelsize=14)
|
96 |
+
ax1.legend(fontsize=14)
|
97 |
+
|
98 |
+
title = 'train-F1: {:.4f} \n test-F1: {:.4f}'.format(np.max(f1_scores_train), np.max(f1_scores_test))
|
99 |
+
ax1.set_title(title, fontweight='bold', size=16)
|
100 |
+
|
101 |
+
|
102 |
+
if show:
|
103 |
+
plt.show()
|
104 |
+
|
105 |
+
# WTF Mario, this is a mess
|
106 |
+
def train_model(model, trainloader, testloader, stop_after_best, savefile):
|
107 |
+
criterion = nn.BCELoss()
|
108 |
+
optimizer = optim.Adam(model.parameters())
|
109 |
+
|
110 |
+
loss_per_epoch = [0]
|
111 |
+
training_loss = [0]
|
112 |
+
f1_scores_train = [0]
|
113 |
+
f1_scores_dev = [0]
|
114 |
+
best_l = None
|
115 |
+
best_tl = None
|
116 |
+
worst_l = None
|
117 |
+
worst_tl = None
|
118 |
+
worst_l_f1 = None
|
119 |
+
best_l_f1 = None
|
120 |
+
worst_tl_f1 = None
|
121 |
+
last_epoch_save = 0
|
122 |
+
|
123 |
+
epochs_without_change = 0
|
124 |
+
epochs = len(loss_per_epoch)
|
125 |
+
|
126 |
+
while epochs_without_change < stop_after_best:
|
127 |
+
clear_output(wait=True)
|
128 |
+
|
129 |
+
print("Training on: " + torch.cuda.get_device_name(torch.cuda.current_device()))
|
130 |
+
print("###############################################")
|
131 |
+
print("Current epoch: " + str(epochs))
|
132 |
+
print("Last model save was in epoch " + str(last_epoch_save))
|
133 |
+
print("Stopping training in: " + str(stop_after_best - epochs_without_change) + " epochs.")
|
134 |
+
print("###############################################")
|
135 |
+
print("[Best iter] training F1 is: " + str(best_tl))
|
136 |
+
print("[Best iter] dev F1 is: " + str(best_l))
|
137 |
+
print("###############################################")
|
138 |
+
print("[Last iter] training F1 was: " + str(f1_scores_train[-1]))
|
139 |
+
print("[Last iter] dev. F1 was: " + str(f1_scores_dev[-1]))
|
140 |
+
print("###############################################")
|
141 |
+
|
142 |
+
# Dibujo lo que puedo
|
143 |
+
plot_loss_and_score(training_loss, loss_per_epoch, f1_scores_train, f1_scores_dev, show=True)
|
144 |
+
|
145 |
+
tl = 0
|
146 |
+
t_pred_l = []
|
147 |
+
t_true_index_l = []
|
148 |
+
t_tokenized_l = []
|
149 |
+
t_text_l = []
|
150 |
+
|
151 |
+
for _, v in tqdm(enumerate(trainloader), total=len(trainloader)): # Not using batches yet
|
152 |
+
text = torch.reshape(v['text'], (-1,))
|
153 |
+
tags = torch.reshape(v['spans'], (-1,))
|
154 |
+
optimizer.zero_grad()
|
155 |
+
tag_scores = model(text)
|
156 |
+
|
157 |
+
# Para la F1
|
158 |
+
t_pred_l.append(tag_scores.cpu().detach().numpy())
|
159 |
+
t_true_index_l.append([a.cpu().detach().numpy()[0] for a in v['true_index']])
|
160 |
+
t_tokenized_l.append([a[0] for a in v['tokenized']])
|
161 |
+
t_text_l.append(v['original_text'][0])
|
162 |
+
|
163 |
+
loss = criterion(torch.reshape(tag_scores, (-1,)), torch.reshape(tags, (-1,)).float())
|
164 |
+
tl += loss.item()
|
165 |
+
loss.backward()
|
166 |
+
optimizer.step()
|
167 |
+
|
168 |
+
tl /= len(trainloader)
|
169 |
+
l = 0
|
170 |
+
print("Starting evaluation for loss function.")
|
171 |
+
# evaluar el modelo
|
172 |
+
pred_l = []
|
173 |
+
true_index_l = []
|
174 |
+
tokenized_l = []
|
175 |
+
text_l = []
|
176 |
+
|
177 |
+
model.eval()
|
178 |
+
with torch.no_grad():
|
179 |
+
for v in testloader:
|
180 |
+
text = torch.reshape(v['text'], (-1,))
|
181 |
+
tags = torch.reshape(v['spans'], (-1,))
|
182 |
+
|
183 |
+
tag_scores = model(text)
|
184 |
+
|
185 |
+
#Para la F1
|
186 |
+
pred_l.append(tag_scores.cpu().detach().numpy())
|
187 |
+
true_index_l.append([a.cpu().detach().numpy()[0] for a in v['true_index']])
|
188 |
+
tokenized_l.append([a[0] for a in v['tokenized']])
|
189 |
+
text_l.append(v['original_text'][0])
|
190 |
+
|
191 |
+
loss = criterion(torch.reshape(tag_scores, (-1,)), torch.reshape(tags, (-1,)).float())
|
192 |
+
l += loss.item()
|
193 |
+
|
194 |
+
model.train()
|
195 |
+
l /= len(testloader)
|
196 |
+
print("Starting evaluation for dev F1")
|
197 |
+
f1_d = f1_scores(pred_l, true_index_l, tokenized_l, text_l)
|
198 |
+
# Es aproximado, pero solo es una referencia
|
199 |
+
f1_t = f1_scores(t_pred_l, t_true_index_l, t_tokenized_l, t_text_l)
|
200 |
+
|
201 |
+
|
202 |
+
epochs_without_change += 1
|
203 |
+
if best_l is None or best_l < f1_d:
|
204 |
+
print("Model improved, saving.")
|
205 |
+
torch.save(model, savefile)
|
206 |
+
best_l = f1_d
|
207 |
+
best_tl = f1_t
|
208 |
+
epochs_without_change = 0
|
209 |
+
last_epoch_save = epochs
|
210 |
+
print("Model improved, saved.")
|
211 |
+
|
212 |
+
# Para graficar con una escala coherente.
|
213 |
+
if(worst_l_f1 is None or f1_d < worst_l_f1):
|
214 |
+
worst_l_f1 = f1_d
|
215 |
+
f1_scores_dev[0] = worst_l_f1
|
216 |
+
if(worst_tl_f1 is None or f1_t < worst_tl_f1):
|
217 |
+
worst_tl_f1 = f1_t
|
218 |
+
f1_scores_train[0] = worst_tl_f1
|
219 |
+
if(worst_tl is None or tl > worst_tl):
|
220 |
+
worst_tl = tl
|
221 |
+
training_loss[0] = worst_tl
|
222 |
+
if(worst_l is None or l > worst_l):
|
223 |
+
worst_l = l
|
224 |
+
loss_per_epoch[0] = worst_l
|
225 |
+
|
226 |
+
# Rastreo las perdidas
|
227 |
+
loss_per_epoch.append(l)
|
228 |
+
training_loss.append(tl)
|
229 |
+
f1_scores_train.append(f1_t)
|
230 |
+
f1_scores_dev.append(f1_d)
|
231 |
+
# Rastreo la época actual
|
232 |
+
epochs += 1
|
233 |
+
print('Finished Training')
|
234 |
+
|
235 |
+
return loss_per_epoch, training_loss, f1_scores_train, f1_scores_dev
|
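A minimal sketch of how these pieces line up, assuming legacy torchtext, the en_core_web_md model, and that the GloVe Twitter vectors can be downloaded; the DataFrame here is a stand-in for the real training data:

# Sketch, reusing the imports at the top of this module.
train_df = pd.DataFrame({'text': ["You are a stupid troll", "Have a nice day"]})
vocab = get_vocab(train_df)  # Vocab backed by glove.twitter.27B.200d vectors
tokens = spacy_tokenizer(train_df['text'][0])  # same tokenizer the Field uses
ids = torch.tensor([vocab.stoi[t] for t in tokens])  # index tensor the LSTM consumes
print(vocab.vectors[ids].shape)  # torch.Size([5, 200])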
notebooks/utils/processing.py
ADDED
@@ -0,0 +1,129 @@
from termcolor import colored
import string

def color_toxic_words(index, text, html=False):
    if not html:
        colored_string = ''
        for i, x in enumerate(text):
            if i in index:
                colored_string += colored(x, on_color='on_red')
            else:
                colored_string += colored(x)
    else:
        colored_string = ''
        for i, x in enumerate(text):
            if i in index:
                colored_string += f'<span style="background-color: #FF0000">{x}</span>'
            else:
                colored_string += x

    return colored_string

def remove_symbols(index, text):
    """
    Removes the indices that correspond to 'non-toxic' symbols, such as whitespace,
    commas, periods, etc.
    """
    index_clean = []
    for i in index:
        x = text[i]
        if x not in ('"()+,-./:;<=>[\\]^_`{|}~' + string.whitespace):
            index_clean.append(i)

    return index_clean

def completely_toxic(span, text):
    if span == []:
        return [i for i in range(len(text))]
    else:
        return span

def separate_words(indices):
    """
    Splits a flat list of character indices into one list per word
    (consecutive runs of indices).
    """
    toxic_words_indices = []
    m = 0
    for i, (j, k) in enumerate(zip(indices[0:-1], indices[1:])):
        if k - j != 1:
            toxic_words_indices.append(indices[m:i+1])
            m = i + 1
    toxic_words_indices.append(indices[m:])  # Last word

    return toxic_words_indices


def postprocessing(indices_list, delta=7):
    """
    Marks as toxic the characters between two toxic words if the gap
    between them is smaller than delta.
    """
    # Assuming the indices are plain integers.
    if len(indices_list) > 1:
        l = sorted(indices_list)
        new_list = []
        for i in range(len(indices_list) - 1):
            # Add the existing index
            new_list.append(l[i])
            # If the gap to the next index is small enough, select all the indices in between
            if (l[i+1] - l[i]) <= delta:
                new_list = new_list + list(range(l[i]+1, l[i+1]))

        new_list.append(l[-1])  # The last element
        return new_list
    else:
        return indices_list


def get_index_toxic_words(sentence, tagged_sentence, delta=7):
    toxic_indices = []
    m = 0
    # tag_to_ix = {"non_toxic": 0, "toxic": 1}
    for word_tag in tagged_sentence:
        word, tag = word_tag
        if tag == 1:  # toxic
            # If a toxic word appeared 2 or more times, this only gave its first
            # occurrence; kept here for reference:
            # word_indices = [sentence.find(word) + i for i in range(len(word))]
            # toxic_indices.append(word_indices)

            # Searching from offset m avoids the repeated-word problem
            word_indices = [m + sentence[m:].find(word) + i for i in range(len(word))]
            toxic_indices.append(word_indices)
            # This fixes the 'stupidity' bug
            m += sentence[m:].find(word) + len(word)

    toxic_indices = [val for sublist in toxic_indices for val in sublist]

    # Join whitespace and other in-between characters, which raises the F1
    return postprocessing(toxic_indices, delta)


def f1(predictions, gold):
    """
    F1 (a.k.a. DICE) operating on two lists of offsets (e.g., character).
    >>> assert f1([0, 1, 4, 5], [0, 1, 6]) == 0.5714285714285714
    :param predictions: a list of predicted offsets
    :param gold: a list of offsets serving as the ground truth
    :return: a score between 0 and 1
    """
    if len(gold) == 0:
        return 1. if len(predictions) == 0 else 0.
    if len(predictions) == 0:
        return 0.
    predictions_set = set(predictions)
    gold_set = set(gold)
    nom = 2 * len(predictions_set.intersection(gold_set))
    denom = len(predictions_set) + len(gold_set)
    return float(nom) / float(denom)

def f1_scores(pred, true_index, tokenized, text, threshold=0.5):
    scores = 0
    for i in range(len(pred)):
        tags = [1 if x > threshold else 0 for x in pred[i]]
        tagged_sentence = list(zip(tokenized[i], tags))
        prediction_index = get_index_toxic_words(text[i], tagged_sentence)
        scores += f1(prediction_index, true_index[i])
    return scores / len(pred)
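A few worked examples of the helpers above (outputs checked by hand):

# separate_words splits character offsets into per-word runs:
separate_words([3, 4, 5, 10, 11])      # -> [[3, 4, 5], [10, 11]]
# postprocessing fills gaps of at most delta characters between toxic spans:
postprocessing([0, 1, 5, 6], delta=7)  # -> [0, 1, 2, 3, 4, 5, 6]
# f1 is 2*|intersection| / (|predictions| + |gold|):
f1([0, 1, 4, 5], [0, 1, 6])            # -> 4/7 = 0.5714285714285714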