Upload 5 files
- deployment_utils.py +607 -0
- plotting.py +230 -0
- preprocessing.py +591 -0
- style.css +94 -0
- utils.py +389 -0
deployment_utils.py ADDED @@ -0,0 +1,607 @@
# global
from typing import Tuple, List
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from gensim.models.doc2vec import Doc2Vec

import transformers
from transformers import pipeline, BertTokenizer

import fasttext

# local
from preprocessing import Preprocessor
from utils import read_data


# read data
X_train, X_test, y_train, y_test = read_data()

# instantiate preprocessor object
preprocessor = Preprocessor()

# load models
doc2vec_model_embeddings = Doc2Vec.load(
    "F:/Graduation Project/Project/models/best_doc2vec_embeddings")
doc2vec_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_doc2vec_model.h5")
tfidf_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_tfidf_model.h5")
cnn_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_cnn_model.h5")
glove_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_glove_model.h5")
lstm_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_lstm_model.h5")
bert_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_bert_model.h5",
    custom_objects={"TFBertModel": transformers.TFBertModel})
fasttext_model = fasttext.load_model(
    "F:/Graduation Project/Project/models/best_fasttext_model.bin")
summarization_model = pipeline(
    "summarization", model="facebook/bart-large-cnn")


def extract_case_information(case_content: str) -> Tuple[str, str, str]:
    """
    Extract the petitioner name, respondent name, and case facts from
    `case_content`, whose first three lines are expected to be
    "petitioner:...", "respondent:...", and "facts:...".
    """

    content_list = case_content.split("\n")
    petitioner = re.findall(r"petitioner:(.+)", content_list[0])[0]
    respondent = re.findall(r"respondent:(.+)", content_list[1])[0]
    facts = re.findall(r"facts:(.+)", content_list[2])[0]

    return petitioner, respondent, facts


def generate_random_sample() -> Tuple[str, str, str, int]:
    """
    Fetch a random case from `X_test` for testing.

    Returns:
    --------
    A tuple containing the following:
    - petitioner : str
        Contains the petitioner name.
    - respondent : str
        Contains the respondent name.
    - facts : str
        Contains the case facts.
    - label : int
        Represents the winning index (0 = petitioner, 1 = respondent).
    """

    random_idx = np.random.randint(low=0, high=len(X_test))

    petitioner = X_test["first_party"].iloc[random_idx]
    respondent = X_test["second_party"].iloc[random_idx]
    facts = X_test["Facts"].iloc[random_idx]
    label = y_test.iloc[random_idx][0]

    return petitioner, respondent, facts, label


def generate_highlighted_words(facts: str, petitioner_words: List[str], respondent_words: List[str]) -> str:
    """
    Highlight `petitioner_words` and `respondent_words` inside `facts` for
    model interpretation.

    Parameters:
    -----------
    - facts : str
        Facts of a specific case.
    - petitioner_words : List[str]
        Contains all words that the model attends to as petitioner words.
    - respondent_words : List[str]
        Contains all words that the model attends to as respondent words.

    Returns:
    --------
    - rendered_text : str
        Contains `facts` wrapped in HTML markup so the highlighting can be
        visualized with CSS.

    Example:
    --------
    >>> facts_ = 'Mohammed shot Aly after a hot negotiation happened between
    ... them about the profits of their company'
    >>> petitioner_words_ = ['shot', 'hot']
    >>> respondent_words_ = ['profits']
    >>> generate_highlighted_words(facts_, petitioner_words_, respondent_words_)

    >>> output:
    <div class='text-facts'> Mohammed <span class='highlight-petitioner'>shot</span>
    Aly after a <span class='highlight-petitioner'>hot</span> negotiation happened
    between them about the <span class='highlight-respondent'>profits</span> of their
    company </div>
    """

    rendered_text = '<div class="text-facts"> '

    for word in facts.split():
        if word in petitioner_words:
            highlight_word = ' <span class="highlight-petitioner"> ' + word + " </span> "
            rendered_text += highlight_word

        elif word in respondent_words:
            highlight_word = ' <span class="highlight-respondent"> ' + word + " </span> "
            rendered_text += highlight_word

        else:
            rendered_text += " " + word

    rendered_text += " </div>"

    return rendered_text


class VectorizerGenerator:
    """Responsible for creating the tokenizers and text vectorizers used by
    JudgerAI's models."""

    def __init__(self) -> None:
        pass

    def generate_tf_idf_vectorizer(self) -> keras.layers.TextVectorization:
        """
        Generate the best text vectorizer of the tf-idf model (3rd combination).

        Returns:
        -------
        - text_vectorizer : keras.layers.TextVectorization
            Represents the case facts' vectorizer that converts case facts to
            numerical tensors.
        """

        first_party_names = X_train["first_party"]
        second_party_names = X_train["second_party"]
        facts = X_train["Facts"]

        anonymized_facts = preprocessor.anonymize_data(
            first_party_names, second_party_names, facts)

        text_vectorizer, _ = preprocessor.convert_text_to_vectors_tf_idf(
            anonymized_facts)

        return text_vectorizer

    def generate_cnn_vectorizer(self) -> keras.layers.TextVectorization:
        """
        Generate the best text vectorizer of the CNN model (2nd combination).

        Returns:
        -------
        - text_vectorizer : keras.layers.TextVectorization
            Represents the case facts' vectorizer that converts case facts to
            numerical tensors.
        """

        balanced_df = preprocessor.balance_data(X_train["Facts"], y_train)
        X_train_balanced = balanced_df["Facts"]

        text_vectorizer, _ = preprocessor.convert_text_to_vectors_cnn(
            X_train_balanced)

        return text_vectorizer

    def generate_glove_tokenizer(self) -> keras.preprocessing.text.Tokenizer:
        """
        Generate the best tokenizer of the GloVe model (2nd combination).

        Returns:
        -------
        - glove_tokenizer : keras.preprocessing.text.Tokenizer
            Represents the case facts' tokenizer that converts case facts to
            numerical tensors.
        """

        balanced_df = preprocessor.balance_data(X_train["Facts"], y_train)
        X_train_balanced = balanced_df["Facts"]

        glove_tokenizer, _ = preprocessor.convert_text_to_vectors_glove(
            X_train_balanced)

        return glove_tokenizer

    def generate_lstm_tokenizer(self) -> keras.preprocessing.text.Tokenizer:
        """
        Generate the best text tokenizer of the LSTM model (1st combination).

        Returns:
        -------
        - lstm_tokenizer : keras.preprocessing.text.Tokenizer
            Represents the case facts' tokenizer that converts case facts to
            numerical tensors.
        """

        lstm_tokenizer = Tokenizer(num_words=18430)
        # fit on the facts column; fitting on the whole DataFrame would only
        # iterate over its column names
        lstm_tokenizer.fit_on_texts(X_train["Facts"])

        return lstm_tokenizer

    def generate_bert_tokenizer(self) -> transformers.BertTokenizer:
        """
        Generate the best BERT tokenizer of the BERT model (1st combination).

        Returns:
        -------
        - bert_tokenizer : transformers.BertTokenizer
            Represents the case facts' tokenizer that converts case facts to
            input-ids tensors.
        """

        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        return bert_tokenizer


class DataPreparator:
    """Responsible for preparing the case facts, i.e. converting case facts to
    numerical vectors using a `VectorizerGenerator` object."""

    def __init__(self) -> None:
        self.vectorizer_generator = VectorizerGenerator()

    def prepare_doc2vec(self, facts: str) -> pd.DataFrame:
        """
        Convert the `facts` string to a numerical vector
        using `doc2vec_model_embeddings`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - facts_vectors : pd.DataFrame
            A one-row DataFrame representing the 50-d vector of `facts`.
        """

        facts = pd.Series(facts)
        facts_processed = preprocessor.preprocess_data(facts)
        facts_vectors = preprocessor.convert_text_to_vectors_doc2vec(
            facts_processed, train=False, embeddings_doc2vec=doc2vec_model_embeddings)

        return facts_vectors

    def _anonymize_facts(self, first_party_name: str, second_party_name: str, facts: str) -> str:
        """
        Anonymize case `facts` by replacing `first_party_name` & `second_party_name` with
        the generic tag "_PARTY_".

        Parameters:
        -----------
        - first_party_name : str
            Represents the petitioner name.
        - second_party_name : str
            Represents the respondent name.
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - anonymized_facts : str
            Represents `facts` after anonymization.
        """

        anonymized_facts = preprocessor._anonymize_case_facts(
            first_party_name, second_party_name, facts)

        return anonymized_facts

    def prepare_tf_idf(self, anonymized_facts: str) -> tf.Tensor:
        """
        Convert the `anonymized_facts` string to a numerical vector
        using the tf-idf `vectorizer_generator` in the 3rd combination.

        Parameters:
        -----------
        - anonymized_facts : str
            Represents the case facts after anonymization.

        Returns:
        -------
        - facts_vector : tf.Tensor
            A 10000-d Tensor representing `facts`.
        """

        anonymized_facts = pd.Series(anonymized_facts)
        tf_idf_vectorizer = self.vectorizer_generator.generate_tf_idf_vectorizer()

        facts_vector = preprocessor.convert_text_to_vectors_tf_idf(
            anonymized_facts, train=False, text_vectorizer=tf_idf_vectorizer)

        return facts_vector

    def prepare_cnn(self, facts: str) -> tf.Tensor:
        """
        Convert the `facts` string to a numerical vector
        using the CNN `vectorizer_generator` in the 2nd combination.

        Parameters:
        -----------
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - facts_vector : tf.Tensor
            A 2000-d Tensor representing `facts`.
        """

        facts = pd.Series(facts)

        cnn_vectorizer = self.vectorizer_generator.generate_cnn_vectorizer()

        facts_vector = preprocessor.convert_text_to_vectors_cnn(
            facts, train=False, text_vectorizer=cnn_vectorizer)

        return facts_vector

    def prepare_glove(self, facts: str) -> np.ndarray:
        """
        Convert the `facts` string to a numerical vector
        using the GloVe `vectorizer_generator` in the 2nd combination.

        Parameters:
        -----------
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - facts_vector : np.ndarray
            A 50-d np.ndarray representing `facts`.
        """

        facts = pd.Series(facts)

        glove_tokenizer = self.vectorizer_generator.generate_glove_tokenizer()

        facts_vector = preprocessor.convert_text_to_vectors_glove(
            facts, train=False, glove_tokenizer=glove_tokenizer)

        return facts_vector

    def prepare_lstm(self, facts: str) -> np.ndarray:
        """
        Convert the `facts` string to a numerical vector
        using the LSTM `vectorizer_generator` in the 1st combination.

        Parameters:
        -----------
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - facts_vector_padded : np.ndarray
            A 974-d np.ndarray representing `facts`.
        """

        facts = pd.Series(facts)
        lstm_tokenizer = self.vectorizer_generator.generate_lstm_tokenizer()
        facts_vector = lstm_tokenizer.texts_to_sequences(facts)
        facts_vector_padded = pad_sequences(facts_vector, maxlen=974)

        return facts_vector_padded

    def prepare_bert(self, facts: str) -> tf.Tensor:
        """
        Convert the `facts` string to a numerical vector
        using the BERT tokenizer in the 1st combination.

        Parameters:
        -----------
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - tf.Tensor
            A 256-d tf.Tensor representing the input ids of `facts`.
        """

        bert_tokenizer = self.vectorizer_generator.generate_bert_tokenizer()
        facts_vector_dict = bert_tokenizer.encode_plus(
            facts,
            max_length=256,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )

        return facts_vector_dict["input_ids"]


class Predictor:
    """Responsible for getting predictions from JudgerAI's models."""

    def __init__(self) -> None:
        self.data_preparator = DataPreparator()

    def predict_doc2vec(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `doc2vec_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_doc2vec(facts)
        predictions = doc2vec_model.predict(facts_vector)

        pet_res_scores = []
        for i in predictions:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def predict_tf_idf(self, anonymized_facts: str) -> np.ndarray:
        """
        Get the prediction for `anonymized_facts` using `tfidf_model`.

        Parameters:
        -----------
        - anonymized_facts : str
            Represents the case facts after anonymization.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_tf_idf(anonymized_facts)
        predictions = tfidf_model.predict(facts_vector)

        pet_res_scores = []
        for i in predictions:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def predict_cnn(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `cnn_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_cnn(facts)
        predictions = cnn_model.predict(facts_vector)

        pet_res_scores = []
        for i in predictions:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def predict_glove(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `glove_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_glove(facts)
        predictions = glove_model.predict(facts_vector)

        pet_res_scores = []
        for i in predictions:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def predict_lstm(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `lstm_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_lstm(facts)
        predictions = lstm_model.predict(facts_vector)

        pet_res_scores = []
        for i in predictions:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def predict_bert(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `bert_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - predictions : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_bert(facts)
        predictions = bert_model.predict(facts_vector)

        return predictions

    def predict_fasttext(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `fasttext_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        prediction = fasttext_model.predict(facts)[1]
        prediction = np.array([prediction])

        pet_res_scores = []
        for i in prediction:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def summarize_facts(self, facts: str) -> str:
        """Summarize `facts` using the BART summarization pipeline."""
        summarized_case_facts = summarization_model(facts)[0]['summary_text']
        return summarized_case_facts
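The module above wires everything together at import time and exposes `Predictor` as the entry point. A minimal usage sketch, assuming the hard-coded model files load successfully and `read_data` (from the `utils.py` in this upload) returns the expected split:

from deployment_utils import Predictor, generate_random_sample

predictor = Predictor()

# pick a random held-out case and score it with one of the models
petitioner, respondent, facts, label = generate_random_sample()
scores = predictor.predict_lstm(facts)  # shape (1, 2): [P(petitioner), P(respondent)]
print(f"{petitioner} vs. {respondent}: P(petitioner wins) = {scores[0][0]:.2f}")

# condense long case facts before rendering them in the UI
summary = predictor.summarize_facts(facts)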
plotting.py ADDED @@ -0,0 +1,230 @@
from typing import List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from tensorflow import keras


class PlottingManager:
    """Responsible for providing plots & visualizations for the models."""

    def __init__(self) -> None:
        """Define the style used for all visualizations."""
        plt.style.use("seaborn")

    def plot_subplots_curve(
        self,
        training_measure: List[List[float]],
        validation_measure: List[List[float]],
        title: str,
        train_color: str = "orangered",
        validation_color: str = "dodgerblue",
    ) -> None:
        """
        Plot subplots of the elements of `training_measure` vs. `validation_measure`.

        Parameters:
        ------------
        - training_measure : List[List[float]]
            A `k` by `num_epochs` list containing the training measure (loss
            or accuracy) for each fold.
        - validation_measure : List[List[float]]
            A `k` by `num_epochs` list containing the validation measure (loss
            or accuracy) for each fold.
        - title : str
            Represents the title of the plot.
        - train_color : str, optional
            Represents the graph color for the `training_measure`. (Default is "orangered").
        - validation_color : str, optional
            Represents the graph color for the `validation_measure`. (Default is "dodgerblue").
        """

        plt.figure(figsize=(12, 8))

        for i in range(len(training_measure)):
            plt.subplot(2, 2, i + 1)
            plt.plot(training_measure[i], c=train_color)
            plt.plot(validation_measure[i], c=validation_color)
            plt.title("Fold " + str(i + 1))

        plt.suptitle(title)
        plt.show()

    def plot_heatmap(
        self, measure: List[List[float]], title: str, cmap: str = "coolwarm"
    ) -> None:
        """
        Plot a heatmap of the values in `measure`.

        Parameters:
        ------------
        - measure : List[List[float]]
            A `k` by `num_epochs` list containing the measure (loss or
            accuracy) for each fold.
        - title : str
            Title of the plot.
        - cmap : str, optional
            Color map of the plot (default is "coolwarm").
        """

        # transpose the array to make it `num_epochs` by `k`
        values_array = np.array(measure).T
        df_cm = pd.DataFrame(
            values_array,
            range(1, values_array.shape[0] + 1),
            ["fold " + str(i + 1) for i in range(values_array.shape[1])],
        )

        plt.figure(figsize=(10, 8))
        plt.title(
            title + " Throughout " + str(values_array.shape[1]) + " Folds", pad=20
        )
        sn.heatmap(df_cm, annot=True, cmap=cmap, annot_kws={"size": 10})
        plt.show()

    def plot_average_curves(
        self,
        title: str,
        x: List[float],
        y: List[float],
        x_label: str,
        y_label: str,
        train_color: str = "orangered",
        validation_color: str = "dodgerblue",
    ) -> None:
        """
        Plot the curves of `x` against `y`, where x and y are the training and
        validation measures (loss or accuracy).

        Parameters:
        ------------
        - title : str
            Title of the plot.
        - x : List[float]
            Training measure of the models (loss or accuracy).
        - y : List[float]
            Validation measure of the models (loss or accuracy).
        - x_label : str
            Label of the training measure to put in the plot legend.
        - y_label : str
            Label of the validation measure to put in the plot legend.
        - train_color : str, optional
            Color of the training plot (default is "orangered").
        - validation_color : str, optional
            Color of the validation plot (default is "dodgerblue").
        """

        plt.title(title, pad=20)
        plt.plot(x, c=train_color, label=x_label)
        plt.plot(y, c=validation_color, label=y_label)
        plt.legend()
        plt.show()

    def plot_roc_curve(
        self,
        all_models: List[keras.models.Sequential],
        X_test: pd.DataFrame,
        y_test: pd.Series,
    ) -> None:
        """
        Plot the AUC-ROC curve of every model in `all_models`.

        Parameters:
        ------------
        - all_models : List[keras.models.Sequential]
            Contains all trained models; the number of models equals the `k`
            of the k-fold cross-validation.
        - X_test : pd.DataFrame
            Contains the testing vectors.
        - y_test : pd.Series
            Contains the testing labels.
        """

        plt.figure(figsize=(12, 8))
        for i, model in enumerate(all_models):
            y_pred = model.predict(X_test).ravel()
            fpr, tpr, _ = roc_curve(y_test, y_pred)
            auc_curve = auc(fpr, tpr)
            plt.subplot(2, 2, i + 1)
            plt.plot([0, 1], [0, 1], color="dodgerblue", linestyle="--")
            plt.plot(
                fpr,
                tpr,
                color="orangered",
                label=f"Fold {str(i+1)} (area = {auc_curve:.3f})",
            )
            plt.legend(loc="best")
            plt.title(f"Fold {str(i+1)}")

        plt.suptitle("AUC-ROC curves")
        plt.show()

    def plot_classification_report(
        self, model: keras.models.Sequential, X_test: pd.DataFrame, y_test: pd.Series
    ) -> str:
        """
        Build the classification report of the passed `model`.

        Parameters:
        ------------
        - model : keras.models.Sequential
            The trained model that will be evaluated.
        - X_test : pd.DataFrame
            Contains the testing vectors.
        - y_test : pd.Series
            Contains the testing labels.

        Returns:
        --------
        - cls_report : str
            The classification report for the given model and testing data,
            formatted as a string.
        """

        y_pred = model.predict(X_test).ravel()
        preds = np.where(y_pred > 0.5, 1, 0)
        cls_report = classification_report(y_test, preds)

        return cls_report

    def plot_confusion_matrix(
        self,
        all_models: List[keras.models.Sequential],
        X_test: pd.DataFrame,
        y_test: pd.Series,
    ) -> None:
        """
        Plot the confusion matrix of each model in `all_models`.

        Parameters:
        ------------
        - all_models : List[keras.models.Sequential]
            Contains all trained models; the number of models equals the `k`
            of the k-fold cross-validation.
        - X_test : pd.DataFrame
            Contains the testing vectors.
        - y_test : pd.Series
            Contains the testing labels.
        """

        _, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

        for i, (model, ax) in enumerate(zip(all_models, axes.flatten())):
            y_pred = model.predict(X_test).ravel()
            preds = np.where(y_pred > 0.5, 1, 0)

            conf_matrix = confusion_matrix(y_test, preds)
            sn.heatmap(conf_matrix, annot=True, ax=ax)
            ax.set_title(f"Fold {i+1}")

        plt.suptitle("Confusion Matrices")
        plt.tight_layout()
        plt.show()
preprocessing.py
ADDED
@@ -0,0 +1,591 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# global
|
2 |
+
import string
|
3 |
+
from typing import List, Tuple
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
|
8 |
+
import re
|
9 |
+
import nltk
|
10 |
+
|
11 |
+
from sklearn.utils import resample
|
12 |
+
|
13 |
+
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
14 |
+
from nltk.tokenize import RegexpTokenizer
|
15 |
+
|
16 |
+
import tensorflow as tf
|
17 |
+
from keras.layers import TextVectorization
|
18 |
+
from keras.preprocessing.text import Tokenizer
|
19 |
+
from keras.utils import pad_sequences
|
20 |
+
|
21 |
+
# local
|
22 |
+
from utils import Doc2VecModel
|
23 |
+
|
24 |
+
|
25 |
+
punct = string.punctuation
|
26 |
+
stemmer = nltk.stem.PorterStemmer()
|
27 |
+
eng_stopwords = nltk.corpus.stopwords.words("english")
|
28 |
+
|
29 |
+
|
30 |
+
class Preprocessor:
|
31 |
+
"""Responsible for preprocessing case facts."""
|
32 |
+
|
33 |
+
def __init__(self) -> None:
|
34 |
+
pass
|
35 |
+
|
36 |
+
def _nltk_tokenizer(self, text: str) -> List[str]:
|
37 |
+
"""
|
38 |
+
Tokenize a given `text` using the RegexpTokenizer from the nltk library.
|
39 |
+
|
40 |
+
Parameters:
|
41 |
+
-----------
|
42 |
+
- text : str
|
43 |
+
A string containing the text to be tokenized.
|
44 |
+
|
45 |
+
Returns:
|
46 |
+
--------
|
47 |
+
- tokens : List[str]
|
48 |
+
A list of tokens generated by the tokenizer.
|
49 |
+
"""
|
50 |
+
|
51 |
+
tokenizer = RegexpTokenizer(r"\w+")
|
52 |
+
tokens = tokenizer.tokenize(text)
|
53 |
+
|
54 |
+
return tokens
|
55 |
+
|
56 |
+
def _tokenize_text(self, text_column: pd.Series) -> pd.Series:
|
57 |
+
"""Splitting `text_column` into tokens.
|
58 |
+
|
59 |
+
Parameters:
|
60 |
+
------------
|
61 |
+
- text_column : pd.Series
|
62 |
+
Contains text that needs to be tokenized.
|
63 |
+
|
64 |
+
Returns:
|
65 |
+
--------
|
66 |
+
- tokenized_text : pd.Series
|
67 |
+
Contains tokenized version of `text_column`.
|
68 |
+
"""
|
69 |
+
|
70 |
+
tokenized_text = text_column.apply(self._nltk_tokenizer)
|
71 |
+
return tokenized_text
|
72 |
+
|
73 |
+
def _convert_to_tagged_document(
|
74 |
+
self, text_column: pd.Series
|
75 |
+
) -> Tuple[List[str], List[TaggedDocument]]:
|
76 |
+
"""
|
77 |
+
Convert `text_column` of specific to TaggedDocuments.
|
78 |
+
|
79 |
+
Parameters:
|
80 |
+
------------
|
81 |
+
- column : pd.Series
|
82 |
+
Contains the list of tokens of each fact.
|
83 |
+
|
84 |
+
Returns:
|
85 |
+
--------
|
86 |
+
A tuble containing the following items:
|
87 |
+
- tokens_list : list[str]
|
88 |
+
Contains all tokens of each case in the `text_column`.
|
89 |
+
- tagged_docs : list[TaggedDocument]
|
90 |
+
Contains TaggedDocument object for each case.
|
91 |
+
"""
|
92 |
+
|
93 |
+
tokens_list = text_column.to_list()
|
94 |
+
tagged_docs = [TaggedDocument(t, [str(i)])
|
95 |
+
for i, t in enumerate(tokens_list)]
|
96 |
+
|
97 |
+
return tokens_list, tagged_docs
|
98 |
+
|
99 |
+
def _vectorize_text(
|
100 |
+
self, doc2vec_model: Doc2Vec, df: pd.Series, tokens_list: List[str]
|
101 |
+
) -> pd.DataFrame:
|
102 |
+
"""
|
103 |
+
Convert values of `tokens_list` to a vector.
|
104 |
+
|
105 |
+
Parameters:
|
106 |
+
-----------
|
107 |
+
- doc2vec_model : Doc2Vev
|
108 |
+
Trained Doc2Vec model.
|
109 |
+
- df : pd.Series
|
110 |
+
This will use only to get its indicies for the new generated dataframe.
|
111 |
+
- tokens_list : List[str]
|
112 |
+
Contains all tokens of each case.
|
113 |
+
|
114 |
+
Returns:
|
115 |
+
--------
|
116 |
+
- text_vectors_df : pd.DataFrame
|
117 |
+
Contains the vector representaion for each case.
|
118 |
+
"""
|
119 |
+
|
120 |
+
text_vectors = [doc2vec_model.infer_vector(doc) for doc in tokens_list]
|
121 |
+
text_vectors_df = pd.DataFrame(text_vectors, index=df.index)
|
122 |
+
|
123 |
+
return text_vectors_df
|
124 |
+
|
125 |
+
def _anonymize_case_facts(
|
126 |
+
self, first_party_name: str, second_party_name: str, facts: str
|
127 |
+
) -> str:
|
128 |
+
"""
|
129 |
+
Anonymize case facts by replacing its party names with "_PARTY_" tag.
|
130 |
+
|
131 |
+
Parameters:
|
132 |
+
------------
|
133 |
+
- first_party_name : str
|
134 |
+
Represents first party name or petitioner name.
|
135 |
+
- second_party_name : str
|
136 |
+
Represents second party name or respondent name.
|
137 |
+
- facts : str
|
138 |
+
Represents case facts.
|
139 |
+
|
140 |
+
Returns:
|
141 |
+
--------
|
142 |
+
- anonymized_facts : str
|
143 |
+
An anonymized version of `facts`.
|
144 |
+
"""
|
145 |
+
|
146 |
+
# remove any commas and any non alphabet characters
|
147 |
+
first_party_name = re.sub(r"[\,+]", " ", first_party_name)
|
148 |
+
first_party_name = re.sub(r"[^a-zA-Z]", " ", first_party_name)
|
149 |
+
|
150 |
+
second_party_name = re.sub(r"[\,+]", " ", second_party_name)
|
151 |
+
second_party_name = re.sub(r"[^a-zA-Z]", " ", second_party_name)
|
152 |
+
|
153 |
+
for name in first_party_name.split():
|
154 |
+
facts = re.sub(name, " _PARTY_ ", facts)
|
155 |
+
|
156 |
+
for name in second_party_name.split():
|
157 |
+
facts = re.sub(name, " _PARTY_ ", facts)
|
158 |
+
|
159 |
+
# replace any consecutive _PARTY_ tags with only one _PARTY_ tag.
|
160 |
+
regex_continous_tags = r"(_PARTY_\s+){2,}"
|
161 |
+
anonymized_facts = re.sub(regex_continous_tags, " _PARTY_ ", facts)
|
162 |
+
# remove ant consecutive spaces
|
163 |
+
anonymized_facts = re.sub(r"\s+", " ", anonymized_facts)
|
164 |
+
|
165 |
+
return anonymized_facts
|
166 |
+
|
167 |
+
def _preprocess_text(self, text: str) -> str:
|
168 |
+
"""
|
169 |
+
Preprocessing & cleaning `text` including:
|
170 |
+
- lowercasing
|
171 |
+
- removing quotation marks
|
172 |
+
- removing digits
|
173 |
+
- removing punctuation
|
174 |
+
- removing brackets, braces, and paranthesis
|
175 |
+
- removeing stopwords
|
176 |
+
- stemming tokens
|
177 |
+
|
178 |
+
Parameters:
|
179 |
+
------------
|
180 |
+
- text : str
|
181 |
+
Text need to be processed (cleaned).
|
182 |
+
|
183 |
+
Returns:
|
184 |
+
--------
|
185 |
+
- processed_text : str
|
186 |
+
A preprocessed version of `text`.
|
187 |
+
"""
|
188 |
+
|
189 |
+
text = text.lower()
|
190 |
+
# remove quotation marks
|
191 |
+
text = re.sub(r"\'", "", text)
|
192 |
+
# remove digits
|
193 |
+
text = re.sub(r"\d+", "", text)
|
194 |
+
# remove punctuation but with keeping '_' letter
|
195 |
+
text = "".join([ch for ch in text if (ch == "_") or (ch not in punct)])
|
196 |
+
# remove brackets, braces, and parantheses
|
197 |
+
text = re.sub(r"[\[\]\(\)\{\}]+", " ", text)
|
198 |
+
tokens = nltk.word_tokenize(text)
|
199 |
+
# remove stopwords and stemming tokens
|
200 |
+
tokens = [stemmer.stem(token)
|
201 |
+
for token in tokens if token not in eng_stopwords]
|
202 |
+
# convert tokens back to string
|
203 |
+
processed_text = " ".join(tokens)
|
204 |
+
|
205 |
+
return processed_text
|
206 |
+
|
207 |
+
def convert_text_to_vectors_doc2vec(
|
208 |
+
self,
|
209 |
+
text_column: pd.Series,
|
210 |
+
train: bool = True,
|
211 |
+
embeddings_doc2vec: Doc2Vec = None,
|
212 |
+
) -> Tuple[Doc2Vec, pd.DataFrame] | pd.DataFrame:
|
213 |
+
"""
|
214 |
+
Converting `text_column` to vectors using `Doc2Vec` model
|
215 |
+
|
216 |
+
Parameters:
|
217 |
+
------------
|
218 |
+
- text_column : pd.Series
|
219 |
+
Contains the case facts.
|
220 |
+
- train : bool, optional
|
221 |
+
Defines whether the model will be trained or not. (if True, Doc2Vec will be trained |
|
222 |
+
else, Doc2Vec will used the passed `embeddings_Doc2Vec`). (Default is True).
|
223 |
+
- embeddings_doc2vec : Doc2Vec, optional
|
224 |
+
Trained Doc2Vec model will be used for generating embeddings of `text_column` if
|
225 |
+
`train` is False. (Default is None).
|
226 |
+
|
227 |
+
Returns:
|
228 |
+
--------
|
229 |
+
1. A tuple contains the following:
|
230 |
+
- embeddings_doc2vec : Doc2Vec
|
231 |
+
Trained Doc2Vec model.
|
232 |
+
- text_vectors_df : pd.DataFrame
|
233 |
+
A DataFrame contains `text_column` vectors if `train` is True.
|
234 |
+
|
235 |
+
2. text_vectors_df : pd.DataFrame
|
236 |
+
A DataFrame contains `text_column` vectors if `train` is False.
|
237 |
+
|
238 |
+
Raises:
|
239 |
+
-------
|
240 |
+
- AssertionError
|
241 |
+
If train is False and `embeddings_doc2vec` is None.
|
242 |
+
- AssertionError
|
243 |
+
If train is False and `embedding_doc2vec` is not an instance of Doc2Vec
|
244 |
+
"""
|
245 |
+
|
246 |
+
tokenized_text = self._tokenize_text(text_column)
|
247 |
+
tokens_list, tagged_docs = self._convert_to_tagged_document(
|
248 |
+
tokenized_text)
|
249 |
+
|
250 |
+
if train:
|
251 |
+
doc2vec_model = Doc2VecModel()
|
252 |
+
embeddings_doc2vec = doc2vec_model.train_doc2vec_embeddings_model(
|
253 |
+
tagged_docs
|
254 |
+
)
|
255 |
+
text_vectors_df = self._vectorize_text(
|
256 |
+
embeddings_doc2vec, text_column, tokens_list
|
257 |
+
)
|
258 |
+
return embeddings_doc2vec, text_vectors_df
|
259 |
+
|
260 |
+
assert (
|
261 |
+
embeddings_doc2vec is not None
|
262 |
+
), "`embedding_doc2vec` argument must be not None."
|
263 |
+
assert isinstance(
|
264 |
+
embeddings_doc2vec, Doc2Vec
|
265 |
+
), "`embedding_doc2vec` argument must be an instance of Doc2Vec to infer vectors."
|
266 |
+
text_vectors_df = self._vectorize_text(
|
267 |
+
embeddings_doc2vec, text_column, tokens_list
|
268 |
+
)
|
269 |
+
|
270 |
+
return text_vectors_df
|
271 |
+
|
272 |
+
def convert_text_to_vectors_tf_idf(
|
273 |
+
self,
|
274 |
+
text_column: pd.Series,
|
275 |
+
ngrams: int = 2,
|
276 |
+
max_tokens: int = 10000,
|
277 |
+
output_mode: str = "tf-idf",
|
278 |
+
train: bool = True,
|
279 |
+
text_vectorizer: TextVectorization = None,
|
280 |
+
) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
|
281 |
+
"""
|
282 |
+
Converting `text_column` to vectors using `TextVectorization` layer.
|
283 |
+
|
284 |
+
Parameters:
|
285 |
+
------------
|
286 |
+
- text_column : pd.Series
|
287 |
+
Contains the case facts.
|
288 |
+
- ngrams : int, optional
|
289 |
+
Defines the number of n-gram (Default is 2).
|
290 |
+
- max_tokens : int, optional
|
291 |
+
Defines the number of max_tokens of `text_vectorizer` (Default is 10,000).
|
292 |
+
- output_mode : str, optional
|
293 |
+
Represents the output vectors type whether it is "tfi-df" or "binary" or "count"
|
294 |
+
(Default is "tf-idf").
|
295 |
+
- train : bool, optional
|
296 |
+
Defines whether the model will be trained or not. (if True, TextVectorization
|
297 |
+
will be trained, else, TextVectorization will used the passed `text_vectorizer`).
|
298 |
+
(Default is True).
|
299 |
+
- text_vectorizer : TextVectorization, optional
|
300 |
+
Trained TextVectorization layer will be used for generating embeddings of
|
301 |
+
`text_column` if `train` is False. (Default is None).
|
302 |
+
|
303 |
+
Returns:
|
304 |
+
--------
|
305 |
+
- if `train` == True:
|
306 |
+
A tuple contains the following:
|
307 |
+
- text_vectorizer : TextVectorization
|
308 |
+
Trained TextVectorization layer.
|
309 |
+
- text_vectors : tf.Tensor
|
310 |
+
A Tensor contains `text_column` training vectors.
|
311 |
+
- otherwise:
|
312 |
+
text_vectors : tf.Tensor
|
313 |
+
A Tensor contains `text_column` testing vectors.
|
314 |
+
|
315 |
+
Raises:
|
316 |
+
-------
|
317 |
+
- AssertionError
|
318 |
+
If train is False and `text_vectorizer` is None.
|
319 |
+
- AssertionError
|
320 |
+
If train is False and `text_vectorizer` is not an instance of TextVectorization.
|
321 |
+
"""
|
322 |
+
|
323 |
+
if train:
|
324 |
+
text_vectorizer = TextVectorization(
|
325 |
+
ngrams=ngrams, max_tokens=max_tokens, output_mode=output_mode
|
326 |
+
)
|
327 |
+
text_vectorizer.adapt(text_column)
|
328 |
+
text_vectors = text_vectorizer(text_column)
|
329 |
+
|
330 |
+
return text_vectorizer, text_vectors
|
331 |
+
|
332 |
+
assert (
|
333 |
+
text_vectorizer is not None
|
334 |
+
), "`text_vectorizer` argument must be not None."
|
335 |
+
assert isinstance(
|
336 |
+
text_vectorizer, TextVectorization
|
337 |
+
), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
|
338 |
+
text_vectors = text_vectorizer(text_column)
|
339 |
+
|
340 |
+
return text_vectors
|
341 |
+
|
342 |
+
def convert_text_to_vectors_cnn(
|
343 |
+
self,
|
344 |
+
text_column: pd.Series,
|
345 |
+
max_tokens: int = 2000,
|
346 |
+
output_sequence_length: int = 500,
|
347 |
+
output_mode: str = "int",
|
348 |
+
train: bool = True,
|
349 |
+
text_vectorizer: TextVectorization = None,
|
350 |
+
) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
|
351 |
+
"""
|
352 |
+
Converting `text_column` to vectors using `TextVectorization` layer.
|
353 |
+
|
354 |
+
Parameters:
|
355 |
+
------------
|
356 |
+
- text_column : pd.Series
|
357 |
+
Contains the case facts.
|
358 |
+
- max_tokens : int, optional
|
359 |
+
Defines the number of max_tokens of `text_vectorizer` (Default is 2000).
|
360 |
+
- output_sequence_length : int, optional
|
361 |
+
Represents the dimensions of the output vector (Default is 500).
|
362 |
+
- output_mode : str, optional
|
363 |
+
Represents the output vectors type whether it is "int" or "binary" or "tfi-df".
|
364 |
+
- train : bool, optional
|
365 |
+
Defines whether the model will be trained or not. (if True,
|
366 |
+
TextVectorization will be trained | else, TextVectorization will used the
|
367 |
+
passed `text_vectorizer`). (Default is True).
|
368 |
+
- text_vectorizer : TextVectorization, optional
|
369 |
+
Trained TextVectorization layer will be used for generating embeddings of
|
370 |
+
`text_column` if `train` is False. (Default is None).
|
371 |
+
|
372 |
+
Returns:
|
373 |
+
--------
|
374 |
+
- if `train` == True:
|
375 |
+
A tuple contains the following:
|
376 |
+
- text_vectorizer : TextVectorization
|
377 |
+
Trained TextVectorization layer.
|
378 |
+
- text_vectors : tf.Tensor
|
379 |
+
A Tensor contains `text_column` training vectors.
|
380 |
+
- otherwise:
|
381 |
+
text_vectors : tf.Tensor
|
382 |
+
A Tensor contains `text_column` testing vectors.
|
383 |
+
|
384 |
+
Raises:
|
385 |
+
-------
|
386 |
+
- AssertionError
|
387 |
+
If train is False and `text_vectorizer` is None.
|
388 |
+
- AssertionError
|
389 |
+
If train is False and `text_vectorizer` is not an instance of TextVectorization.
|
390 |
+
"""
|
391 |
+
|
392 |
+
if train:
|
393 |
+
text_vectorizer = TextVectorization(
|
394 |
+
max_tokens=max_tokens,
|
395 |
+
output_mode=output_mode,
|
396 |
+
output_sequence_length=output_sequence_length,
|
397 |
+
)
|
398 |
+
text_vectorizer.adapt(text_column)
|
399 |
+
text_vectors = text_vectorizer(text_column)
|
400 |
+
return text_vectorizer, text_vectors
|
401 |
+
|
402 |
+
assert (
|
403 |
+
text_vectorizer is not None
|
404 |
+
), "`text_vectorizer` argument must be not None."
|
405 |
+
assert isinstance(
|
406 |
+
text_vectorizer, TextVectorization
|
407 |
+
), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
|
408 |
+
text_vectors = text_vectorizer(text_column)
|
409 |
+
|
410 |
+
return text_vectors
|
411 |
+
|
412 |
+
def convert_text_to_vectors_glove(
|
413 |
+
self,
|
414 |
+
text_column: pd.Series,
|
415 |
+
train: bool = True,
|
416 |
+
glove_tokenizer: Tokenizer = None,
|
417 |
+
vocab_size: int = 1000,
|
418 |
+
oov_token: str = "<OOV>",
|
419 |
+
max_length: int = 50,
|
420 |
+
padding_type: str = "post",
|
421 |
+
truncation_type: str = "post",
|
422 |
+
) -> Tuple[Tokenizer, np.ndarray] | np.ndarray:
|
423 |
+
"""
|
424 |
+
Converting `text_column` to vectors using `glove_tokenizer`.
|
425 |
+
|
426 |
+
Parameters:
|
427 |
+
------------
|
428 |
+
- text_column : pd.Series
|
429 |
+
Contains the case facts.
|
430 |
+
- train : bool, optional
|
431 |
+
Defines whether the model will be trained or not. (if True,
|
432 |
+
Tokenizer will be trained | else, Tokenizer will used the
|
433 |
+
passed `glove_tokenizer`). (Default is True).
|
434 |
+
- glove_tokenizer : Tokenizer, optional
|
435 |
+
Trained Tokenizer layer will be used for generating embeddings of
|
436 |
+
`text_column` if `train` is False. (Default is None).
|
437 |
+
- vocab_size : int, optional
|
438 |
+
Represents the number of supported vocabulary of the Tokenizer,
|
439 |
+
any token not in this vocabulary will be treated as an out-of-vocabulary
|
440 |
+
token(OOV). (Default is 1000).
|
441 |
+
- oov_tokens : str, optional
|
442 |
+
Represents the token of an out-of-vocabulary token (Default is "<OOV>").
|
443 |
+
- max_length : int, optional
|
444 |
+
Defins the output vector's dimension. (Default is 50).
|
445 |
+
- padding_type : str, optional
|
446 |
+
Defines the padding type of the vectors, if the vector size is less than
|
447 |
+
`max_length`, the rest of the `max_length` will be padded with 0 (Default is "post").
|
448 |
+
- truncation_type : str, optional
|
449 |
+
Defines the truncation type of the vectors, if the vector size is more than
|
450 |
+
`max_length`, the extra of the `max_length` will be truncated (Default is "post").
|
451 |
+
|
452 |
+
Returns:
|
453 |
+
--------
|
454 |
+
- if `train` == True:
|
455 |
+
A tuple contains the following:
|
456 |
+
- glove_tokenizer : Tokenizer
|
457 |
+
Trained Tokenizer layer.
|
458 |
+
- text_padded : np.ndarray
|
459 |
+
An array contains `text_column` vectors.
|
460 |
+
- otherwise:
|
461 |
+
text_padded : np.ndarray
|
462 |
+
An array contains `text_column` vectors.
|
463 |
+
|
464 |
+
Raises:
|
465 |
+
-------
|
466 |
+
- AssertionError
|
467 |
+
If train is False and `glove_tokenizer` is None.
|
468 |
+
- AssertionError
|
469 |
+
If train is False and `glove_tokenizer` is not instance of Tokenizer.
|
470 |
+
"""
|
471 |
+
|
472 |
+
if train:
|
473 |
+
glove_tokenizer = Tokenizer(
|
474 |
+
num_words=vocab_size, oov_token=oov_token)
|
475 |
+
glove_tokenizer.fit_on_texts(text_column)
|
476 |
+
text_sequences = glove_tokenizer.texts_to_sequences(text_column)
|
477 |
+
text_padded = pad_sequences(
|
478 |
+
text_sequences,
|
479 |
+
maxlen=max_length,
|
480 |
+
padding=padding_type,
|
481 |
+
truncating=truncation_type,
|
482 |
+
)
|
483 |
+
|
484 |
+
            return glove_tokenizer, text_padded

        assert (
            glove_tokenizer is not None
        ), "`glove_tokenizer` argument must not be None."
        assert isinstance(
            glove_tokenizer, Tokenizer
        ), "`glove_tokenizer` argument must be an instance of Tokenizer."
        text_sequences = glove_tokenizer.texts_to_sequences(text_column)
        text_padded = pad_sequences(
            text_sequences,
            maxlen=max_length,
            padding=padding_type,
            truncating=truncation_type,
        )

        return text_padded

    def balance_data(self, X_train: pd.Series, y_train: pd.Series) -> pd.DataFrame:
        """
        Balance `X_train` and `y_train` so that the targets in `y_train` are
        distributed equally.

        Parameters:
        ------------
        - X_train : pd.Series
            Contains the case facts.
        - y_train : pd.Series
            Contains the training targets.

        Returns:
        --------
        - shuffled_balanced_df : pd.DataFrame
            The new balanced dataframe with shuffled indices.
        """

        df = pd.concat([X_train, y_train], axis=1)

        first_party = df[df["winner_index"] == 0]
        second_party = df[df["winner_index"] == 1]

        # upsample the second-party class to the size of the first-party class
        # (`resample` is sklearn's `sklearn.utils.resample`)
        upsample_second_party = resample(
            second_party, replace=True, n_samples=len(first_party), random_state=42
        )

        upsample_df = pd.concat([upsample_second_party, first_party])

        shuffled_indices = np.arange(upsample_df.shape[0])
        np.random.shuffle(shuffled_indices)

        shuffled_balanced_df = upsample_df.iloc[shuffled_indices, :]

        return shuffled_balanced_df

    def anonymize_data(
        self,
        first_party_names: pd.Series,
        second_party_names: pd.Series,
        text_column: pd.Series,
    ) -> pd.Series:
        """
        Anonymize `text_column` by replacing `first_party_names` and
        `second_party_names` with a "_PARTY_" tag.

        Parameters:
        ------------
        - first_party_names : pd.Series
            Contains all first party names to be anonymized.
        - second_party_names : pd.Series
            Contains all second party names to be anonymized.
        - text_column : pd.Series
            Contains all texts to be anonymized.

        Returns:
        --------
        - all_anonymized_facts : pd.Series
            The anonymized version of `text_column`.
        """

        all_anonymized_facts = []

        for i in range(text_column.shape[0]):
            facts = text_column.iloc[i]
            first_party_name = first_party_names.iloc[i]
            second_party_name = second_party_names.iloc[i]
            anonymized_facts = self._anonymize_case_facts(
                first_party_name, second_party_name, facts
            )
            all_anonymized_facts.append(anonymized_facts)

        return pd.Series(all_anonymized_facts)

    def preprocess_data(self, text_column: pd.Series) -> pd.Series:
        """
        Preprocess and clean all texts in `text_column`.

        Parameters:
        ------------
        - text_column : pd.Series
            Contains all case facts.

        Returns:
        --------
        - preprocessed_text : pd.Series
            Contains all texts after being processed.
        """

        preprocessed_text = text_column.apply(self._preprocess_text)
        return preprocessed_text
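A minimal usage sketch of the deployment preprocessing flow above, assuming a `Preprocessor` instance and the train split are in scope; the column names `first_party`, `second_party`, and `facts` are assumptions based on the docstrings, while `winner_index` comes from `balance_data` itself:

    # illustrative only: anonymize, then clean, then balance the training split
    anonymized_facts = preprocessor.anonymize_data(
        X_train["first_party"], X_train["second_party"], X_train["facts"]
    )
    cleaned_facts = preprocessor.preprocess_data(anonymized_facts)
    balanced_df = preprocessor.balance_data(cleaned_facts, y_train["winner_index"])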
style.css
ADDED
@@ -0,0 +1,94 @@
@import url('https://fonts.googleapis.com/css2?family=Cairo:wght@300;400;500;600;700;800&display=swap');

* {
    font-family: 'Cairo', sans-serif !important;
}

/* title */
.e16nr0p30 {
    font-weight: 700;
    font-size: 30px;
}

/* buttons */
.edgvbvh10,
.edgvbvh5 {
    width: 100%;
    height: 40px;
    background-color: #4756ff;
    color: #fff;
    transition: 0.4s;
    border: none;
}

.edgvbvh10:hover,
.edgvbvh5:hover {
    background-color: #3747fd;
    color: #fff;
    border: none;
}

.edgvbvh10:focus,
.edgvbvh5:focus {
    background-color: #3747fd;
    color: #fff !important;
    box-shadow: none;
    border: none;
}

/* header */
.row_heading {
    font-size: 14px;
}

/* spinner */
.css-1y04v0k.e17lx80j1,
.css-p6380s.e17lx80j1 {
    margin: 0px;
    border-color: #34e27f #b3b3b333 #cacaca33 !important;
    -webkit-box-flex: 0;
    flex-grow: 0;
    flex-shrink: 0;
}

/* inputs styling */
.st-bf {
    transition: 0.8s;
    border: none !important;
}

.st-bf:hover {
    box-shadow: 0 0 0 4px #dbdbdb !important;
}

/* text stylings */
.highlight-petitioner {
    border-radius: 0.4rem;
    background-color: rgba(253, 231, 142, 0.4);
    color: #ffd061;
    padding: 1px 5px;
    margin-top: 10px;
    margin-right: 5px;
}

.highlight-respondent {
    border-radius: 0.4rem;
    background-color: rgba(78, 170, 255, 0.2);
    color: #6195ff;
    padding: 1px 5px;
    margin-top: 10px;
    margin-right: 5px;
}

.bold-text {
    font-weight: 700 !important;
}

.text-facts {
    line-height: 40px;
}

/* footer */
footer {
    display: none !important;
}
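The selectors above target Streamlit's auto-generated class names (e.g. `.edgvbvh10`, `.css-1y04v0k`), which are specific to a particular Streamlit version. As a sketch (not part of this upload), a stylesheet like this is typically injected into the app as follows:

    import streamlit as st

    # read style.css and inject it into the rendered page
    with open("style.css") as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)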
utils.py
ADDED
@@ -0,0 +1,389 @@
from typing import Callable, List, Tuple

import numpy as np
import pandas as pd

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer


def read_data(filepath="../csvs/"):
    """
    Read the CSV files of the dataset.

    Parameters:
    ----------
    - filepath : str
        Defines the path that contains the CSV files.

    Returns:
    --------
    A tuple containing the following:
    - X_train : pd.DataFrame
    - X_test : pd.DataFrame
    - y_train : pd.DataFrame
    - y_test : pd.DataFrame
    """

    # each CSV carries its old index as the first column, so drop it
    X_train = pd.read_csv(filepath + "X_train.csv")
    X_train = X_train.iloc[:, 1:]

    X_test = pd.read_csv(filepath + "X_test.csv")
    X_test = X_test.iloc[:, 1:]

    y_train = pd.read_csv(filepath + "y_train.csv")
    y_train = y_train.iloc[:, 1:]

    y_test = pd.read_csv(filepath + "y_test.csv")
    y_test = y_test.iloc[:, 1:]

    return X_train, X_test, y_train, y_test

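# Example usage (a sketch; assumes the four CSVs exist at the default path):
#
#     X_train, X_test, y_train, y_test = read_data("../csvs/")
#     print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
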
def train_model(
    model_building_func: Callable[[], keras.models.Sequential],
    X_train_vectors: pd.DataFrame | np.ndarray | tf.Tensor,
    y_train: pd.Series,
    k: int = 4,
    num_epochs: int = 30,
    batch_size: int = 64,
) -> Tuple[
    List[keras.models.Sequential],
    List[List[float]],
    List[List[float]],
    List[List[float]],
    List[List[float]],
]:
    """
    Trains a model on `X_train_vectors` and `y_train` using k-fold cross-validation.

    Parameters:
    -----------
    - model_building_func : Callable[[], keras.models.Sequential]
        A function that builds and compiles a Keras Sequential model.
    - X_train_vectors : pd.DataFrame | np.ndarray | tf.Tensor
        The training input data.
    - y_train : pd.Series
        The training target data.
    - k : int, optional
        The number of folds for cross-validation (default is 4).
    - num_epochs : int, optional
        The number of epochs to train for (default is 30).
    - batch_size : int, optional
        The batch size to use during training (default is 64).

    Returns:
    --------
    A tuple containing the following items:
    - all_models : List[keras.models.Sequential]
        A list of `k` trained models.
    - all_losses : List[List[float]]
        A `k` by `num_epochs` list containing the training losses for each fold.
    - all_val_losses : List[List[float]]
        A `k` by `num_epochs` list containing the validation losses for each fold.
    - all_accuracies : List[List[float]]
        A `k` by `num_epochs` list containing the training accuracies for each fold.
    - all_val_accuracies : List[List[float]]
        A `k` by `num_epochs` list containing the validation accuracies for each fold.
    """

    num_validation_samples = len(X_train_vectors) // k

    all_models = []
    all_losses = []
    all_val_losses = []
    all_accuracies = []
    all_val_accuracies = []

    for fold in range(k):
        print(f"fold: {fold+1}")
        # the current fold is held out for validation
        validation_data = X_train_vectors[
            num_validation_samples * fold : num_validation_samples * (fold + 1)
        ]
        validation_targets = y_train[
            num_validation_samples * fold : num_validation_samples * (fold + 1)
        ]

        # the remaining folds are concatenated into the training set
        training_data = np.concatenate(
            [
                X_train_vectors[: num_validation_samples * fold],
                X_train_vectors[num_validation_samples * (fold + 1) :],
            ]
        )
        training_targets = np.concatenate(
            [
                y_train[: num_validation_samples * fold],
                y_train[num_validation_samples * (fold + 1) :],
            ]
        )

        # build a fresh model for every fold so weights are not reused
        model = model_building_func()
        history = model.fit(
            training_data,
            training_targets,
            validation_data=(validation_data, validation_targets),
            epochs=num_epochs,
            batch_size=batch_size,
        )

        all_models.append(model)
        all_losses.append(history.history["loss"])
        all_val_losses.append(history.history["val_loss"])
        all_accuracies.append(history.history["accuracy"])
        all_val_accuracies.append(history.history["val_accuracy"])

    return (all_models, all_losses, all_val_losses, all_accuracies, all_val_accuracies)

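# Usage sketch for `train_model` (the builder below is a hypothetical example,
# not something defined in this repo):
#
#     def build_model():
#         model = keras.Sequential([
#             keras.layers.Dense(64, activation="relu"),
#             keras.layers.Dense(1, activation="sigmoid"),
#         ])
#         model.compile(optimizer="adam", loss="binary_crossentropy",
#                       metrics=["accuracy"])
#         return model
#
#     models, losses, val_losses, accs, val_accs = train_model(
#         build_model, X_train_vectors, y_train, k=4, num_epochs=30
#     )
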
def print_testing_loss_accuracy(
    all_models: List[keras.models.Sequential],
    X_test_vectors: pd.DataFrame | np.ndarray | tf.Tensor,
    y_test: pd.Series,
) -> None:
    """
    Display the testing loss and testing accuracy of each model in `all_models`,
    along with their averages.

    Parameters:
    ------------
    - all_models : List[keras.models.Sequential]
        A list of `k` trained models.
    - X_test_vectors : pd.DataFrame | np.ndarray | tf.Tensor
        Contains the testing vectors.
    - y_test : pd.Series
        Contains the testing labels.
    """

    sum_testing_losses = 0.0
    sum_testing_accuracies = 0.0

    for i, model in enumerate(all_models):
        print(f"model: {i+1}")
        # `evaluate` returns [loss, accuracy] for a model compiled with an accuracy metric
        loss_accuracy = model.evaluate(X_test_vectors, y_test, verbose=1)
        sum_testing_losses += loss_accuracy[0]
        sum_testing_accuracies += loss_accuracy[1]
        print("====" * 20)

    num_models = len(all_models)
    avg_testing_loss = sum_testing_losses / num_models
    avg_testing_acc = sum_testing_accuracies / num_models
    print(f"average testing loss: {avg_testing_loss:.3f}")
    print(f"average testing accuracy: {avg_testing_acc:.3f}")

+
def calculate_average_measures(
|
178 |
+
all_losses: list[list[float]],
|
179 |
+
all_val_losses: list[list[float]],
|
180 |
+
all_accuracies: list[list[float]],
|
181 |
+
all_val_accuracies: list[list[float]],
|
182 |
+
) -> Tuple[
|
183 |
+
List[keras.models.Sequential],
|
184 |
+
List[List[float]],
|
185 |
+
List[List[float]],
|
186 |
+
List[List[float]],
|
187 |
+
List[List[float]],
|
188 |
+
]:
|
189 |
+
"""
|
190 |
+
Calculate the average measures of cross-validated results.
|
191 |
+
|
192 |
+
Parameters:
|
193 |
+
------------
|
194 |
+
- all_losses : List[List[float]]
|
195 |
+
A `k` by `num_epochs` list contains the values of training losses.
|
196 |
+
- all_val_losses : List[List[float]]
|
197 |
+
A `k` by `num_epochs` list contains the values of validation losses.
|
198 |
+
- all_accuracies : List[List[float]]
|
199 |
+
A `k` by `num_epochs` list contains the values of training accuracies.
|
200 |
+
- all_val_accuracies : List[List[float]]
|
201 |
+
A `k` by `num_epochs` list contains the values of validation accuracies.
|
202 |
+
|
203 |
+
Returns:
|
204 |
+
--------
|
205 |
+
A tuple containing the following items:
|
206 |
+
- avg_loss_hist : List[float]
|
207 |
+
A list of length `num_epochs` contains the average of training losses.
|
208 |
+
- avg_val_loss_hist : List[float]
|
209 |
+
A list of length `num_epochs` contains the average of validaton losses.
|
210 |
+
- avg_acc_hist : List[float]
|
211 |
+
A list of length `num_epochs` contains the average of training accuracies.
|
212 |
+
- avg_val_acc_hist : List[float]
|
213 |
+
A list of length `num_epochs` contains the average of validation accuracies.
|
214 |
+
"""
|
215 |
+
|
216 |
+
num_epochs = len(all_losses[0])
|
217 |
+
avg_loss_hist = [np.mean([x[i] for x in all_losses]) for i in range(num_epochs)]
|
218 |
+
avg_val_loss_hist = [
|
219 |
+
np.mean([x[i] for x in all_val_losses]) for i in range(num_epochs)
|
220 |
+
]
|
221 |
+
avg_acc_hist = [np.mean([x[i] for x in all_accuracies]) for i in range(num_epochs)]
|
222 |
+
avg_val_acc_hist = [
|
223 |
+
np.mean([x[i] for x in all_val_accuracies]) for i in range(num_epochs)
|
224 |
+
]
|
225 |
+
|
226 |
+
return (avg_loss_hist, avg_val_loss_hist, avg_acc_hist, avg_val_acc_hist)
|
227 |
+
|
228 |
+
|
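# Sketch of the full cross-validation loop using the helpers above
# (variable names are illustrative):
#
#     models, losses, val_losses, accs, val_accs = train_model(
#         build_model, X_train_vectors, y_train
#     )
#     print_testing_loss_accuracy(models, X_test_vectors, y_test)
#     avg_loss, avg_val_loss, avg_acc, avg_val_acc = calculate_average_measures(
#         losses, val_losses, accs, val_accs
#     )
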
class Doc2VecModel:
    """Responsible for creating, initializing, and training a Doc2Vec embeddings model."""

    def __init__(self, vector_size=50, min_count=2, epochs=100, dm=1, window=5) -> None:
        """
        Initialize a Doc2Vec model.

        Parameters:
        ------------
        - vector_size : int, optional
            Dimensionality of the feature vectors (default is 50).
        - min_count : int, optional
            Ignores all words with a total frequency lower than this (default is 2).
        - epochs : int, optional
            The number of training epochs (default is 100).
        - dm : int, optional
            Defines the training algorithm. If `dm=1`, distributed memory (PV-DM)
            is used; otherwise, distributed bag of words (PV-DBOW) is employed
            (default is 1).
        - window : int, optional
            The maximum distance between the current and predicted word within a
            sentence (default is 5).
        """

        self.doc2vec_model = Doc2Vec(
            vector_size=vector_size,
            min_count=min_count,
            epochs=epochs,
            dm=dm,
            seed=865,
            window=window,
        )

    def train_doc2vec_embeddings_model(
        self, tagged_docs_train: List[TaggedDocument]
    ) -> Doc2Vec:
        """
        Train the Doc2Vec model on `tagged_docs_train`.

        Parameters:
        ------------
        - tagged_docs_train : List[TaggedDocument]
            Contains the training documents in the format Doc2Vec requires.

        Returns:
        --------
        - doc2vec_model : Doc2Vec
            The trained Doc2Vec model.
        """

        self.doc2vec_model.build_vocab(tagged_docs_train)
        self.doc2vec_model.train(
            tagged_docs_train,
            total_examples=self.doc2vec_model.corpus_count,
            epochs=self.doc2vec_model.epochs,
        )

        return self.doc2vec_model

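# Usage sketch for `Doc2VecModel` (the whitespace tokenization and `train_facts`
# iterable are illustrative; the project may tokenize differently):
#
#     tagged_docs = [
#         TaggedDocument(words=facts.split(), tags=[i])
#         for i, facts in enumerate(train_facts)
#     ]
#     d2v = Doc2VecModel(vector_size=50, epochs=100)
#     model = d2v.train_doc2vec_embeddings_model(tagged_docs)
#     vector = model.infer_vector("unseen case facts".split())
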
class GloveModel:
    """Responsible for creating and generating the GloVe embedding layer."""

    def __init__(self) -> None:
        pass

    def _generate_glove_embedding_index(
        self, glove_file_path: str = "GloVe/glove.6B.50d.txt"
    ) -> dict:
        """
        Generate the GloVe embedding index.

        Parameters:
        ------------
        - glove_file_path : str
            Defines the path of the pretrained GloVe embeddings text file
            (default is "GloVe/glove.6B.50d.txt").

        Returns:
        --------
        - embeddings_index : dict
            Contains each word as a key and its coefficients as the value.
        """

        # each line of the GloVe file is: <word> <coef_1> ... <coef_n>
        embeddings_index = {}
        with open(glove_file_path, encoding="utf8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype="float32")
                embeddings_index[word] = coefs

        return embeddings_index

    def _generate_glove_embedding_matrix(
        self, word_index: dict, embedding_index: dict, max_length: int
    ) -> np.ndarray:
        """
        Generate the embedding matrix for each word in `word_index`.

        Parameters:
        -----------
        - word_index : dict
            Contains words as keys with their indices as values.
        - embedding_index : dict
            Contains each word as a key and its coefficients as the value.
        - max_length : int
            Defines the size of the embedding vector of each word in the
            embedding matrix (it must match the dimensionality of the
            pretrained GloVe vectors).

        Returns:
        --------
        - embedding_matrix : np.ndarray
            Contains the embedding vector of each word in `word_index`.
        """

        embedding_matrix = np.zeros((len(word_index) + 1, max_length))

        # words absent from the pretrained index keep an all-zeros vector
        for word, i in word_index.items():
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        return embedding_matrix

    def generate_glove_embedding_layer(
        self, glove_tokenizer: Tokenizer, max_length: int = 50
    ) -> keras.layers.Embedding:
        """
        Create a GloVe embedding layer for later usage in the neural network.

        Parameters:
        ----------
        - glove_tokenizer : Tokenizer
            A tokenizer trained on the training data, used to extract the word index.
        - max_length : int, optional
            Defines the length of the output embedding vector for each word
            (default is 50).

        Returns:
        --------
        - embedding_layer : keras.layers.Embedding
            An embedding layer of size `len(word_index) + 1` by `max_length` with
            pretrained, frozen weights that can be used as a vectorizer of case facts.
        """

        word_index = glove_tokenizer.word_index

        embedding_index = self._generate_glove_embedding_index()
        embedding_matrix = self._generate_glove_embedding_matrix(
            word_index, embedding_index, max_length
        )

        # note: `max_length` serves as both the embedding dimensionality and the
        # input sequence length here, so it must equal the GloVe vector size (50)
        embedding_layer = keras.layers.Embedding(
            len(word_index) + 1,
            max_length,
            weights=[embedding_matrix],
            input_length=max_length,
            trainable=False,
        )

        return embedding_layer
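# Usage sketch for `GloveModel` (assumes the GloVe file at the default path and
# a `train_facts` iterable of case-fact strings; the `oov_token` choice is an
# assumption, not something this module prescribes):
#
#     glove_tokenizer = Tokenizer(oov_token="<OOV>")
#     glove_tokenizer.fit_on_texts(train_facts)
#     embedding_layer = GloveModel().generate_glove_embedding_layer(
#         glove_tokenizer, max_length=50
#     )
#     model = keras.Sequential([embedding_layer])  # downstream layers omitted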