Hetan07 committed on
Commit
674a23c
1 Parent(s): b75acff

Upload 5 files

Files changed (5)
  1. deployment_utils.py +607 -0
  2. plotting.py +230 -0
  3. preprocessing.py +591 -0
  4. style.css +94 -0
  5. utils.py +389 -0
deployment_utils.py ADDED
@@ -0,0 +1,607 @@
1
+ # global
2
+ from typing import Tuple, List
3
+ import re
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ import tensorflow as tf
8
+ from tensorflow import keras
9
+ from keras.utils import pad_sequences
10
+ from keras.preprocessing.text import Tokenizer
11
+ from gensim.models.doc2vec import Doc2Vec
12
+
13
+ import transformers
14
+ from transformers import pipeline, BertTokenizer
15
+
16
+ import fasttext
17
+
18
+ # local
19
+ from preprocessing import Preprocessor
20
+ from utils import read_data
21
+
22
+
23
+ # read data
24
+ X_train, X_test, y_train, y_test = read_data()
25
+
26
+ # instantiate preprocessor object
27
+ preprocessor = Preprocessor()
28
+
29
+ # load models
30
+ doc2vec_model_embeddings = Doc2Vec.load(
31
+ "F:/Graduation Project/Project/models/best_doc2vec_embeddings")
32
+ doc2vec_model = keras.models.load_model(
33
+ "F:/Graduation Project/Project/models/best_doc2vec_model.h5")
34
+ tfidf_model = keras.models.load_model(
35
+ "F:/Graduation Project/Project/models/best_tfidf_model.h5")
36
+ cnn_model = keras.models.load_model(
37
+ "F:/Graduation Project/Project/models/best_cnn_model.h5")
38
+ glove_model = keras.models.load_model(
39
+ "F:/Graduation Project/Project/models/best_glove_model.h5")
40
+ lstm_model = keras.models.load_model(
41
+ "F:/Graduation Project/Project/models/best_lstm_model.h5")
42
+ bert_model = keras.models.load_model(
43
+ "F:/Graduation Project/Project/models/best_bert_model.h5", custom_objects={"TFBertModel": transformers.TFBertModel})
44
+ fasttext_model = fasttext.load_model(
45
+ "F:/Graduation Project/Project/models/best_fasttext_model.bin")
46
+ summarization_model = pipeline(
47
+ "summarization", model="facebook/bart-large-cnn")
48
+
49
+
50
+ # TODO: Add Docstrings
51
+ def extract_case_information(case_content: str):
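+ """
+ Split `case_content` into petitioner, respondent, and facts.
+ Expects three lines of the form "petitioner:...", "respondent:...", and
+ "facts:...", matching the regular expressions used below.
+ """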
52
+ content_list = case_content.split("\n")
53
+ petitioner = re.findall(r"petitioner:(.+)", content_list[0])[0]
54
+ respondent = re.findall(r"respondent:(.+)", content_list[1])[0]
55
+ facts = re.findall(r"facts:(.+)", content_list[2])[0]
56
+
57
+ return petitioner, respondent, facts
58
+
59
+
60
+ def generate_random_sample() -> Tuple[str, str, str, int]:
61
+ """
62
+ Fetch a random case from `X_test` for testing.
63
+
64
+ Returns:
65
+ --------
66
+ A tuple containing the following:
67
+ - petitioner : str
68
+ Contains petitioner name.
69
+ - respondent : str
70
+ Contains respondent name.
71
+ - facts : str
72
+ Contains case facts.
73
+ - label : int
74
+ Represents the winning index (0 = petitioner, 1 = respondent).
75
+ """
76
+
77
+ random_idx = np.random.randint(low=0, high=len(X_test))
78
+
79
+ petitioner = X_test["first_party"].iloc[random_idx]
80
+ respondent = X_test["second_party"].iloc[random_idx]
81
+ facts = X_test["Facts"].iloc[random_idx]
82
+ label = y_test.iloc[random_idx][0]
83
+
84
+ return petitioner, respondent, facts, label
85
+
86
+
87
+ def generate_highlighted_words(facts: str, petitioner_words: List[str], respondent_words: List[str]):
88
+ """
89
+ Highlight `petitioner_words` and `respondent_words` for model
90
+ interpretation.
91
+
92
+ Parameters:
93
+ -----------
94
+ - facts : str
95
+ Facts of a specific case.
96
+ - petitioner_words : List[str]
97
+ Contains all words the model attends to as petitioner words.
+ - respondent_words : List[str]
+ Contains all words the model attends to as respondent words.
102
+
103
+ Returns:
104
+ --------
105
+ - rendered_text : str
106
+ Contains `facts` with highlighting markup added, so it can be
+ rendered as HTML and styled with CSS.
108
+
109
+ Example:
110
+ --------
111
+ >>> facts_ = 'Mohammed shot Aly after a hot negotiation happened between
112
+ ... them about the profits of their company'
113
+ >>> petitioner_words_ = ['shot', 'hot']
114
+ >>> respondent_words_ = ['profits']
115
+ >>> generate_highlighted_words(facts, petitioner_words_, respondent_words_)
116
+
117
+ >>> output:
118
+ <div class='text-facts'> Mohammed <span class='highlight-petitioner'>shot</span>
119
+ Aly after a <span class='highlight-petitioner'>hot</span> negotiation happened
120
+ between them about <span class='highlight-respondent'>profits</span> of their
121
+ company </div>
122
+ """
123
+
124
+ rendered_text = '<div class="text-facts"> '
125
+
126
+ for word in facts.split():
127
+ if word in petitioner_words:
128
+ highlight_word = ' <span class="highlight-petitioner"> ' + word + " </span> "
129
+ rendered_text += highlight_word
130
+
131
+ elif word in respondent_words:
132
+ highlight_word = ' <span class="highlight-respondent"> ' + word + " </span> "
133
+ rendered_text += highlight_word
134
+
135
+ else:
136
+ rendered_text += " " + word
137
+
138
+ rendered_text += " </div>"
139
+
140
+ return rendered_text
141
+
142
+
143
+ class VectorizerGenerator:
144
+ """Responsible for creation and generation of tokenizers and text
145
+ vectorizers for JudgerAIs' models"""
146
+
147
+ def __init__(self) -> None:
148
+ pass
149
+
150
+ def generate_tf_idf_vectorizer(self) -> keras.layers.TextVectorization:
151
+ """
152
+ Generate the best text vectorizer of the tf-idf model (3rd combination).
153
+
154
+ Returns:
155
+ -------
156
+ - text_vectorizer : keras.layers.TextVectorization
157
+ Represents the case facts' vectorizer that converts case facts to
158
+ numerical tensors.
159
+ """
160
+
161
+ first_party_names = X_train["first_party"]
162
+ second_party_names = X_train["second_party"]
163
+ facts = X_train["Facts"]
164
+
165
+ anonymized_facts = preprocessor.anonymize_data(
166
+ first_party_names, second_party_names, facts)
167
+
168
+ text_vectorizer, _ = preprocessor.convert_text_to_vectors_tf_idf(
169
+ anonymized_facts)
170
+
171
+ return text_vectorizer
172
+
173
+ def generate_cnn_vectorizer(self) -> keras.layers.TextVectorization:
174
+ """
175
+ Generate the best text vectorizer of the CNN model (2nd combination).
176
+
177
+ Returns:
178
+ -------
179
+ - text_vectorizer : keras.layers.TextVectorization
180
+ Represents the case facts' vectorizer that converts case facts to
181
+ numerical tensors.
182
+ """
183
+
184
+ balanced_df = preprocessor.balance_data(X_train["Facts"], y_train)
185
+ X_train_balanced = balanced_df["Facts"]
186
+
187
+ text_vectorizer, _ = preprocessor.convert_text_to_vectors_cnn(
188
+ X_train_balanced)
189
+
190
+ return text_vectorizer
191
+
192
+ def generate_glove_tokenizer(self) -> keras.preprocessing.text.Tokenizer:
193
+ """
194
+ Generate the best tokenizer of the GloVe model (2nd combination).
195
+
196
+ Returns:
197
+ -------
198
+ - glove_tokenizer : keras.preprocessing.text.Tokenizer
199
+ Represents the case facts' tokenizer that converts case facts to
200
+ numerical tensors.
201
+ """
202
+
203
+ balanced_df = preprocessor.balance_data(X_train["Facts"], y_train)
204
+ X_train_balanced = balanced_df["Facts"]
205
+
206
+ glove_tokenizer, _ = preprocessor.convert_text_to_vectors_glove(
207
+ X_train_balanced)
208
+
209
+ return glove_tokenizer
210
+
211
+ def generate_lstm_tokenizer(self) -> keras.preprocessing.text.Tokenizer:
212
+ """
213
+ Generate the best text tokenizer of the LSTM model (1st combination).
214
+
215
+ Returns:
216
+ -------
217
+ - lstm_tokenizer : keras.preprocessing.text.Tokenizer
218
+ Represents the case facts' tokenizer that converts case facts to
219
+ numerical tensors.
220
+ """
221
+
222
+ lstm_tokenizer = Tokenizer(num_words=18430)
223
+ lstm_tokenizer.fit_on_texts(X_train)
224
+
225
+ return lstm_tokenizer
226
+
227
+ def generate_bert_tokenizer(self) -> transformers.BertTokenizer:
228
+ """
229
+ Generate the best tokenizer of the BERT model (1st combination).
230
+
231
+ Returns:
232
+ -------
233
+ - bert_tokenizer : transformers.BertTokenizer
234
+ Represents the case facts' tokenizer that converts case facts to
235
+ input ids tensors.
236
+ """
237
+
238
+ bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
239
+ return bert_tokenizer
240
+
241
+
242
+ class DataPreparator:
243
+ """Responsible for preparing the case facts aka converting case facts to
244
+ numerical vectors using `VectorizerGenerator` object."""
245
+
246
+ def __init__(self) -> None:
247
+ self.vectorizer_generator = VectorizerGenerator()
248
+
249
+ def prepare_doc2vec(self, facts: str) -> pd.DataFrame:
250
+ """
251
+ Responsible for converting `facts` string to numerical vector
252
+ using `doc2vec_model_embeddings`.
253
+
254
+ Parameters:
255
+ ----------
256
+ - facts : str
257
+ Represents the case facts.
258
+
259
+ Returns:
260
+ -------
261
+ - facts_vector : pd.DataFrame
262
+ A one-row DataFrame representing the 50-d vector of `facts`.
263
+ """
264
+
265
+ facts = pd.Series(facts)
266
+ facts_processed = preprocessor.preprocess_data(facts)
267
+ facts_vectors = preprocessor.convert_text_to_vectors_doc2vec(
268
+ facts_processed, train=False, embeddings_doc2vec=doc2vec_model_embeddings)
269
+
270
+ return facts_vectors
271
+
272
+ def _anonymize_facts(self, first_party_name: str, second_party_name: str, facts: str) -> str:
273
+ """
274
+ Anonymize case `facts` by replacing `first_party_name` & `second_party_name` with
275
+ generic tag "__PARTY__".
276
+
277
+ Parameters:
278
+ -----------
279
+ - first_party_name : str
280
+ Represents the petitioner name.
281
+ - second_party_name : str
282
+ Represents the respondent name.
283
+ - facts : str
284
+ Represents the case facts.
285
+
286
+ Returns:
287
+ -------
288
+ - anonymized_facts : str
289
+ Represents `facts` after anonymization.
290
+ """
291
+
292
+ anonymized_facts = preprocessor._anonymize_case_facts(
293
+ first_party_name, second_party_name, facts)
294
+
295
+ return anonymized_facts
296
+
297
+ def prepare_tf_idf(self, anonymized_facts: str) -> tf.Tensor:
298
+ """
299
+ Responsible for converting `facts` string to numerical vector
300
+ using tf-idf `vectorizer_generator` in the 3rd combination.
301
+
302
+ Parameters:
303
+ -----------
304
+ - anonymized_facts : str
305
+ Represents the case facts after anonymization.
306
+
307
+ Returns:
308
+ -------
309
+ - facts_vector : tf.Tensor
310
+ A 10000-d Tensor representing `facts`.
311
+ """
312
+
313
+ anonymized_facts = pd.Series(anonymized_facts)
314
+ tf_idf_vectorizer = self.vectorizer_generator.generate_tf_idf_vectorizer()
315
+
316
+ facts_vector = preprocessor.convert_text_to_vectors_tf_idf(
317
+ anonymized_facts, train=False, text_vectorizer=tf_idf_vectorizer)
318
+
319
+ return facts_vector
320
+
321
+ def prepare_cnn(self, facts: str) -> tf.Tensor:
322
+ """
323
+ Responsible for converting `facts` string to numerical vector
324
+ using cnn `vectorizer_generator` in the 2nd combination.
325
+
326
+ Parameters:
327
+ -----------
328
+ - facts : str
329
+ Represents the case facts.
330
+
331
+ Returns:
332
+ -------
333
+ - facts_vector : tf.Tensor
334
+ A 2000-d Tensor representing `facts`.
335
+ """
336
+ facts = pd.Series(facts)
337
+
338
+ cnn_vectorizer = self.vectorizer_generator.generate_cnn_vectorizer()
339
+
340
+ facts_vector = preprocessor.convert_text_to_vectors_cnn(
341
+ facts, train=False, text_vectorizer=cnn_vectorizer)
342
+
343
+ return facts_vector
344
+
345
+ def prepare_glove(self, facts: str) -> np.ndarray:
346
+ """
347
+ Responsible for converting `facts` string to numerical vector
348
+ using glove `vectorizer_generator` in the 2nd combination.
349
+
350
+ Parameters:
351
+ -----------
352
+ - facts : str
353
+ Represents the case facts.
354
+
355
+ Returns:
356
+ -------
357
+ - facts_vector : np.ndarray
358
+ A 50-d np.ndarray representing `facts`.
359
+ """
360
+
361
+ facts = pd.Series(facts)
362
+
363
+ glove_tokenizer = self.vectorizer_generator.generate_glove_tokenizer()
364
+
365
+ facts_vector = preprocessor.convert_text_to_vectors_glove(
366
+ facts, train=False, glove_tokenizer=glove_tokenizer)
367
+
368
+ return facts_vector
369
+
370
+ def prepare_lstm(self, facts: str) -> np.ndarray:
371
+ """
372
+ Responsible for converting `facts` string to numerical vector
373
+ using lstm `vectorizer_generator` in the 1st combination.
374
+
375
+ Parameters:
376
+ -----------
377
+ - facts : str
378
+ Represents the case facts.
379
+
380
+ Returns:
381
+ -------
382
+ - facts_vector_padded : np.ndarray
383
+ A 974-d np.ndarray representing `facts`.
384
+ """
385
+
386
+ facts = pd.Series(facts)
387
+ lstm_tokenizer = self.vectorizer_generator.generate_lstm_tokenizer()
388
+ facts_vector = lstm_tokenizer.texts_to_sequences(facts)
389
+ facts_vector_padded = pad_sequences(facts_vector, 974)
390
+
391
+ return facts_vector_padded
392
+
393
+ def prepare_bert(self, facts: str) -> tf.Tensor:
394
+ """
395
+ Responsible for converting `facts` string to numerical vector
396
+ using bert `vectorizer_generator` in the 1st combination.
397
+
398
+ Parameters:
399
+ -----------
400
+ - facts : str
401
+ Represents the case facts.
402
+
403
+ Returns:
404
+ -------
405
+ - tf.Tensor
406
+ A 256-d tf.Tensor representing the input ids of `facts`.
407
+ """
408
+
409
+ bert_tokenizer = self.vectorizer_generator.generate_bert_tokenizer()
410
+ facts_vector_dict = bert_tokenizer.encode_plus(
411
+ facts,
412
+ max_length=256,
413
+ truncation=True,
414
+ padding='max_length',
415
+ add_special_tokens=True,
416
+ return_tensors='tf'
417
+ )
418
+
419
+ return facts_vector_dict["input_ids"]
420
+
421
+
422
+ class Predictor:
423
+ """Responsible for get predictions of JudgerAIs' models"""
424
+
425
+ def __init__(self) -> None:
426
+ self.data_preparator = DataPreparator()
427
+
428
+ def predict_doc2vec(self, facts: str) -> np.ndarray:
429
+ """
430
+ Get prediction of `facts` using `doc2vec_model`.
431
+
432
+ Parameters:
433
+ ----------
434
+ - facts : str
435
+ Represents the case facts.
436
+
437
+ Returns:
438
+ --------
439
+ - pet_res_scores : np.ndarray
440
+ An array containing 2 elements: the probability of the petitioner winning
+ and the probability of the respondent winning.
442
+ """
443
+
444
+ facts_vector = self.data_preparator.prepare_doc2vec(facts)
445
+ predictions = doc2vec_model.predict(facts_vector)
446
+
447
+ pet_res_scores = []
448
+ for i in predictions:
449
+ temp = i[0]
450
+ pet_res_scores.append(np.array([1 - temp, temp]))
451
+
452
+ return np.array(pet_res_scores)
453
+
454
+ def predict_tf_idf(self, anonymized_facts: str) -> np.ndarray:
455
+ """
456
+ Get prediction of `facts` using `tfidf_model`.
457
+
458
+ Parameters:
459
+ -----------
460
+ - anonymized_facts : str
461
+ Represents the case facts after anonymization.
462
+
463
+ Returns:
464
+ --------
465
+ - pet_res_scores : np.ndarray
466
+ An array containing 2 elements: the probability of the petitioner winning
+ and the probability of the respondent winning.
468
+ """
469
+
470
+ facts_vector = self.data_preparator.prepare_tf_idf(anonymized_facts)
471
+ predictions = tfidf_model.predict(facts_vector)
472
+
473
+ pet_res_scores = []
474
+ for i in predictions:
475
+ temp = i[0]
476
+ pet_res_scores.append(np.array([1 - temp, temp]))
477
+
478
+ return np.array(pet_res_scores)
479
+
480
+ def predict_cnn(self, facts: str) -> np.ndarray:
481
+ """
482
+ Get prediction of `facts` using `cnn_model`.
483
+
484
+ Parameters:
485
+ ----------
486
+ - facts : str
487
+ Represents the case facts.
488
+
489
+ Returns:
490
+ --------
491
+ - pet_res_scores : np.ndarray
492
+ An array containing 2 elements: the probability of the petitioner winning
+ and the probability of the respondent winning.
494
+ """
495
+
496
+ facts_vector = self.data_preparator.prepare_cnn(facts)
497
+ predictions = cnn_model.predict(facts_vector)
498
+
499
+ pet_res_scores = []
500
+ for i in predictions:
501
+ temp = i[0]
502
+ pet_res_scores.append(np.array([1 - temp, temp]))
503
+
504
+ return np.array(pet_res_scores)
505
+
506
+ def predict_glove(self, facts: str) -> np.ndarray:
507
+ """
508
+ Get prediction of `facts` using `glove_model`.
509
+
510
+ Parameters:
511
+ ----------
512
+ - facts : str
513
+ Represents the case facts.
514
+
515
+ Returns:
516
+ --------
517
+ - pet_res_scores : np.ndarray
518
+ An array containing 2 elements: the probability of the petitioner winning
+ and the probability of the respondent winning.
520
+ """
521
+
522
+ facts_vector = self.data_preparator.prepare_glove(facts)
523
+ predictions = glove_model.predict(facts_vector)
524
+
525
+ pet_res_scores = []
526
+ for i in predictions:
527
+ temp = i[0]
528
+ pet_res_scores.append(np.array([1 - temp, temp]))
529
+
530
+ return np.array(pet_res_scores)
531
+
532
+ def predict_lstm(self, facts: str) -> np.ndarray:
533
+ """
534
+ Get prediction of `facts` using `lstm_model`.
535
+
536
+ Parameters:
537
+ ----------
538
+ - facts : str
539
+ Represents the case facts.
540
+
541
+ Returns:
542
+ --------
543
+ - pet_res_scores : np.ndarray
544
+ An array containing 2 elements: the probability of the petitioner winning
+ and the probability of the respondent winning.
546
+ """
547
+
548
+ facts_vector = self.data_preparator.prepare_lstm(facts)
549
+ predictions = lstm_model.predict(facts_vector)
550
+
551
+ pet_res_scores = []
552
+ for i in predictions:
553
+ temp = i[0]
554
+ pet_res_scores.append(np.array([1 - temp, temp]))
555
+
556
+ return np.array(pet_res_scores)
557
+
558
+ def predict_bert(self, facts: str) -> np.ndarray:
559
+ """
560
+ Get prediction of `facts` using `bert_model`.
561
+
562
+ Parameters:
563
+ ----------
564
+ - facts : str
565
+ Represents the case facts.
566
+
567
+ Returns:
568
+ --------
569
+ - predictions : np.ndarray
570
+ An array containing 2 elements: the probability of the petitioner winning
+ and the probability of the respondent winning.
572
+ """
573
+
574
+ facts_vector = self.data_preparator.prepare_bert(facts)
575
+ predictions = bert_model.predict(facts_vector)
576
+
577
+ return predictions
578
+
579
+ def predict_fasttext(self, facts: str) -> np.ndarray:
580
+ """
581
+ Get prediction of `facts` using `fasttext`.
582
+
583
+ Parameters:
584
+ ----------
585
+ - facts : str
586
+ Represents the case facts.
587
+
588
+ Returns:
589
+ --------
590
+ - pet_res_scores : np.ndarray
591
+ An array containing 2 elements: the probability of the petitioner winning
+ and the probability of the respondent winning.
593
+ """
594
+
595
+ prediction = fasttext_model.predict(facts)[1]
596
+ prediction = np.array([prediction])
597
+
598
+ pet_res_scores = []
599
+ for i in prediction:
600
+ temp = i[0]
601
+ pet_res_scores.append(np.array([1 - temp, temp]))
602
+
603
+ return np.array(pet_res_scores)
604
+
605
+ def summarize_facts(self, facts: str) -> str:
606
+ summarized_case_facts = summarization_model(facts)[0]['summary_text']
607
+ return summarized_case_facts
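
A minimal usage sketch of the deployment utilities above (hedged: it assumes the hard-coded model paths exist and the module imports cleanly; only names defined in this file are used):

    # sketch: score a random held-out case with the Doc2Vec pipeline
    from deployment_utils import Predictor, generate_random_sample

    petitioner, respondent, facts, label = generate_random_sample()

    predictor = Predictor()
    scores = predictor.predict_doc2vec(facts)  # shape (1, 2): [P(petitioner), P(respondent)]
    print(f"{petitioner} vs. {respondent}: {scores[0]}, true winner index: {label}")
    print(predictor.summarize_facts(facts))    # BART summary of the case facts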
plotting.py ADDED
@@ -0,0 +1,230 @@
1
+ from typing import List
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sn
7
+
8
+ from sklearn.metrics import auc
9
+ from sklearn.metrics import roc_curve
10
+ from sklearn.metrics import classification_report
11
+ from sklearn.metrics import confusion_matrix
12
+
13
+ from tensorflow import keras
14
+
15
+
16
+ class PlottingManager:
17
+ """Responsible for providing plots & visualization for the models."""
18
+
19
+ def __init__(self) -> None:
20
+ """Define style for visualizations."""
21
+ plt.style.use("seaborn")
22
+
23
+ def plot_subplots_curve(
24
+ self,
25
+ training_measure: List[List[float]],
26
+ validation_measure: List[List[float]],
27
+ title: str,
28
+ train_color: str = "orangered",
29
+ validation_color: str = "dodgerblue",
30
+ ) -> None:
31
+ """
32
+ Plotting subplots of the elements of `training_measure` vs. `validation_measure`.
33
+
34
+ Parameters:
35
+ ------------
36
+ - training_measure : List[List[float]]
37
+ A `k` by `num_epochs` list containing the training measure (loss or
+ accuracy) for each fold.
+ - validation_measure : List[List[float]]
+ A `k` by `num_epochs` list containing the validation measure (loss
+ or accuracy) for each fold.
42
+ - title : str
43
+ Represents the title of the plot.
44
+ - train_color : str, optional
45
+ Represents the graph color for the `training_measure`. (Default is "orangered").
46
+ - validation_color : str, optional
47
+ Represents the graph color for the `validation_measure`. (Default is "dodgerblue").
48
+ """
49
+
50
+ plt.figure(figsize=(12, 8))
51
+
52
+ for i in range(len(training_measure)):
53
+ plt.subplot(2, 2, i + 1)
54
+ plt.plot(training_measure[i], c=train_color)
55
+ plt.plot(validation_measure[i], c=validation_color)
56
+ plt.title("Fold " + str(i + 1))
57
+
58
+ plt.suptitle(title)
59
+ plt.show()
60
+
61
+ def plot_heatmap(
62
+ self, measure: List[List[float]], title: str, cmap: str = "coolwarm"
63
+ ) -> None:
64
+ """
65
+ Plotting a heatmap of the values in `measure`.
66
+
67
+ Parameters:
68
+ ------------
69
+ - measure : List[List[float]]
70
+ A `k` by `num_epochs` list containing the measure (loss or
+ accuracy) for each fold.
72
+ - title : str
73
+ Title of the plot.
74
+ - cmap : str, optional
75
+ Color map of the plot (default is "coolwarm").
76
+ """
77
+
78
+ # transpose the array to make it `num_epochs` by `k`
79
+ values_array = np.array(measure).T
80
+ df_cm = pd.DataFrame(
81
+ values_array,
82
+ range(1, values_array.shape[0] + 1),
83
+ ["fold " + str(i + 1) for i in range(4)],
84
+ )
85
+
86
+ plt.figure(figsize=(10, 8))
87
+ plt.title(
88
+ title + " Throughout " + str(values_array.shape[1]) + " Folds", pad=20
89
+ )
90
+ sn.heatmap(df_cm, annot=True, cmap=cmap, annot_kws={"size": 10})
91
+ plt.show()
92
+
93
+ def plot_average_curves(
94
+ self,
95
+ title: str,
96
+ x: List[float],
97
+ y: List[float],
98
+ x_label: str,
99
+ y_label: str,
100
+ train_color: str = "orangered",
101
+ validation_color: str = "dodgerblue",
102
+ ) -> None:
103
+ """
104
+ Plotting the curves of `x` against `y`, where x and y are training and validation
105
+ measures (loss or accuracy).
106
+
107
+ Parameters:
108
+ ------------
109
+ - title : str
110
+ Title of the plot.
111
+ - x : List[float]
112
+ Training measure of the models (loss or accuracy).
113
+ - y : List[float]
114
+ Validation measure of the models (loss or accuracy).
115
+ - x_label : str
116
+ Label of the training measure to put it in plot legend.
117
+ - y_label : str
118
+ Label of the validation measure to put it in plot legend.
119
+ - train_color : str, optional
120
+ Color of the training plot (default is "orangered").
121
+ - validation_color : str, optional
122
+ Color of the validation plot (default is "dodgerblue").
123
+ """
124
+
125
+ plt.title(title, pad=20)
126
+ plt.plot(x, c=train_color, label=x_label)
127
+ plt.plot(y, c=validation_color, label=y_label)
128
+ plt.legend()
129
+ plt.show()
130
+
131
+ def plot_roc_curve(
132
+ self,
133
+ all_models: List[keras.models.Sequential],
134
+ X_test: pd.DataFrame,
135
+ y_test: pd.Series,
136
+ ) -> None:
137
+ """
138
+ Plotting the AUC-ROC curve of all the passed models in `all_models`.
139
+
140
+ Parameters:
141
+ ------------
142
+ - all_models : List[keras.models.Sequential]
143
+ Contains all trained models, number of models equals number of
144
+ `k` fold cross-validation.
145
+ - X_test : pd.DataFrame
146
+ Contains the testing vectors.
147
+ - y_test : pd.Series
148
+ Contains the testing labels.
149
+ """
150
+
151
+ plt.figure(figsize=(12, 8))
152
+ for i, model in enumerate(all_models):
153
+ y_pred = model.predict(X_test).ravel()
154
+ fpr, tpr, _ = roc_curve(y_test, y_pred)
155
+ auc_curve = auc(fpr, tpr)
156
+ plt.subplot(2, 2, i + 1)
157
+ plt.plot([0, 1], [0, 1], color="dodgerblue", linestyle="--")
158
+ plt.plot(
159
+ fpr,
160
+ tpr,
161
+ color="orangered",
162
+ label=f"Fold {str(i+1)} (area = {auc_curve:.3f})",
163
+ )
164
+ plt.legend(loc="best")
165
+ plt.title(f"Fold {str(i+1)}")
166
+
167
+ plt.suptitle("AUC-ROC curves")
168
+ plt.show()
169
+
170
+ def plot_classification_report(
171
+ self, model: keras.models.Sequential, X_test: pd.DataFrame, y_test: pd.Series
172
+ ) -> str:
173
+ """
174
+ Generate the classification report of the passed `model`.
175
+
176
+ Parameters:
177
+ ------------
178
+ - model : keras.models.Sequential
179
+ The trained model that will be evaluated.
180
+ - X_test : pd.DataFrame
181
+ Contains the testing vectors.
182
+ - y_test : pd.Series
183
+ Contains the testing labels.
184
+
185
+ Returns:
186
+ --------
187
+ - cls_report : str
+ The classification report for the given model and testing data.
190
+ """
191
+
192
+ y_pred = model.predict(X_test).ravel()
193
+ preds = np.where(y_pred > 0.5, 1, 0)
194
+ cls_report = classification_report(y_test, preds)
195
+
196
+ return cls_report
197
+
198
+ def plot_confusion_matrix(
199
+ self,
200
+ all_models: List[keras.models.Sequential],
201
+ X_test: pd.DataFrame,
202
+ y_test: pd.Series,
203
+ ) -> None:
204
+ """
205
+ Plotting the confusion matrix of each model in `all_models`.
206
+
207
+ Parameters:
208
+ ------------
209
+ - all_models: list[keras.models.Sequential]
210
+ Contains all trained models, number of models equals
211
+ number of `k` fold cross-validation.
212
+ - X_test: pd.DataFrame
213
+ Contains the testing vectors.
214
+ - y_test: pd.Series
215
+ Contains the testing labels.
216
+ """
217
+
218
+ _, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
219
+
220
+ for i, (model, ax) in enumerate(zip(all_models, axes.flatten())):
221
+ y_pred = model.predict(X_test).ravel()
222
+ preds = np.where(y_pred > 0.5, 1, 0)
223
+
224
+ conf_matrix = confusion_matrix(y_test, preds)
225
+ sn.heatmap(conf_matrix, annot=True, ax=ax)
226
+ ax.set_title(f"Fold {i+1}")
227
+
228
+ plt.suptitle("Confusion Matrices")
229
+ plt.tight_layout()
230
+ plt.show()
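
A hedged sketch of how these plotting helpers are meant to be driven, assuming `train_model` and `calculate_average_measures` from utils.py; `build_model`, `X_train_vectors`, `y_train`, `X_test_vectors`, and `y_test` are placeholders supplied by the caller:

    from plotting import PlottingManager
    from utils import train_model, calculate_average_measures

    plotter = PlottingManager()
    models, losses, val_losses, accs, val_accs = train_model(
        build_model, X_train_vectors, y_train, k=4, num_epochs=30)

    plotter.plot_subplots_curve(losses, val_losses, "Training vs. Validation Loss")
    avg_loss, avg_val_loss, avg_acc, avg_val_acc = calculate_average_measures(
        losses, val_losses, accs, val_accs)
    plotter.plot_average_curves(
        "Average Loss per Epoch", avg_loss, avg_val_loss, "training loss", "validation loss")
    plotter.plot_roc_curve(models, X_test_vectors, y_test)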
preprocessing.py ADDED
@@ -0,0 +1,591 @@
1
+ # global
2
+ import string
3
+ from typing import List, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ import re
9
+ import nltk
10
+
11
+ from sklearn.utils import resample
12
+
13
+ from gensim.models.doc2vec import Doc2Vec, TaggedDocument
14
+ from nltk.tokenize import RegexpTokenizer
15
+
16
+ import tensorflow as tf
17
+ from keras.layers import TextVectorization
18
+ from keras.preprocessing.text import Tokenizer
19
+ from keras.utils import pad_sequences
20
+
21
+ # local
22
+ from utils import Doc2VecModel
23
+
24
+
25
+ punct = string.punctuation
26
+ stemmer = nltk.stem.PorterStemmer()
27
+ eng_stopwords = nltk.corpus.stopwords.words("english")
28
+
29
+
30
+ class Preprocessor:
31
+ """Responsible for preprocessing case facts."""
32
+
33
+ def __init__(self) -> None:
34
+ pass
35
+
36
+ def _nltk_tokenizer(self, text: str) -> List[str]:
37
+ """
38
+ Tokenize a given `text` using the RegexpTokenizer from the nltk library.
39
+
40
+ Parameters:
41
+ -----------
42
+ - text : str
43
+ A string containing the text to be tokenized.
44
+
45
+ Returns:
46
+ --------
47
+ - tokens : List[str]
48
+ A list of tokens generated by the tokenizer.
49
+ """
50
+
51
+ tokenizer = RegexpTokenizer(r"\w+")
52
+ tokens = tokenizer.tokenize(text)
53
+
54
+ return tokens
55
+
56
+ def _tokenize_text(self, text_column: pd.Series) -> pd.Series:
57
+ """Splitting `text_column` into tokens.
58
+
59
+ Parameters:
60
+ ------------
61
+ - text_column : pd.Series
62
+ Contains text that needs to be tokenized.
63
+
64
+ Returns:
65
+ --------
66
+ - tokenized_text : pd.Series
67
+ Contains tokenized version of `text_column`.
68
+ """
69
+
70
+ tokenized_text = text_column.apply(self._nltk_tokenizer)
71
+ return tokenized_text
72
+
73
+ def _convert_to_tagged_document(
74
+ self, text_column: pd.Series
75
+ ) -> Tuple[List[str], List[TaggedDocument]]:
76
+ """
77
+ Convert the tokens in `text_column` to TaggedDocuments.
78
+
79
+ Parameters:
80
+ ------------
81
+ - text_column : pd.Series
82
+ Contains the list of tokens of each fact.
83
+
84
+ Returns:
85
+ --------
86
+ A tuple containing the following items:
87
+ - tokens_list : list[str]
88
+ Contains all tokens of each case in the `text_column`.
89
+ - tagged_docs : list[TaggedDocument]
90
+ Contains TaggedDocument object for each case.
91
+ """
92
+
93
+ tokens_list = text_column.to_list()
94
+ tagged_docs = [TaggedDocument(t, [str(i)])
95
+ for i, t in enumerate(tokens_list)]
96
+
97
+ return tokens_list, tagged_docs
98
+
99
+ def _vectorize_text(
100
+ self, doc2vec_model: Doc2Vec, df: pd.Series, tokens_list: List[str]
101
+ ) -> pd.DataFrame:
102
+ """
103
+ Convert values of `tokens_list` to a vector.
104
+
105
+ Parameters:
106
+ -----------
107
+ - doc2vec_model : Doc2Vec
108
+ Trained Doc2Vec model.
109
+ - df : pd.Series
110
+ Used only to get its indices for the newly generated dataframe.
111
+ - tokens_list : List[str]
112
+ Contains all tokens of each case.
113
+
114
+ Returns:
115
+ --------
116
+ - text_vectors_df : pd.DataFrame
117
+ Contains the vector representation for each case.
118
+ """
119
+
120
+ text_vectors = [doc2vec_model.infer_vector(doc) for doc in tokens_list]
121
+ text_vectors_df = pd.DataFrame(text_vectors, index=df.index)
122
+
123
+ return text_vectors_df
124
+
125
+ def _anonymize_case_facts(
126
+ self, first_party_name: str, second_party_name: str, facts: str
127
+ ) -> str:
128
+ """
129
+ Anonymize case facts by replacing its party names with "_PARTY_" tag.
130
+
131
+ Parameters:
132
+ ------------
133
+ - first_party_name : str
134
+ Represents first party name or petitioner name.
135
+ - second_party_name : str
136
+ Represents second party name or respondent name.
137
+ - facts : str
138
+ Represents case facts.
139
+
140
+ Returns:
141
+ --------
142
+ - anonymized_facts : str
143
+ An anonymized version of `facts`.
144
+ """
145
+
146
+ # remove any commas and any non alphabet characters
147
+ first_party_name = re.sub(r"[\,+]", " ", first_party_name)
148
+ first_party_name = re.sub(r"[^a-zA-Z]", " ", first_party_name)
149
+
150
+ second_party_name = re.sub(r"[\,+]", " ", second_party_name)
151
+ second_party_name = re.sub(r"[^a-zA-Z]", " ", second_party_name)
152
+
153
+ for name in first_party_name.split():
154
+ facts = re.sub(name, " _PARTY_ ", facts)
155
+
156
+ for name in second_party_name.split():
157
+ facts = re.sub(name, " _PARTY_ ", facts)
158
+
159
+ # replace any consecutive _PARTY_ tags with only one _PARTY_ tag.
160
+ regex_continuous_tags = r"(_PARTY_\s+){2,}"
161
+ anonymized_facts = re.sub(regex_continuous_tags, " _PARTY_ ", facts)
162
+ # remove any consecutive spaces
163
+ anonymized_facts = re.sub(r"\s+", " ", anonymized_facts)
164
+
165
+ return anonymized_facts
166
+
167
+ def _preprocess_text(self, text: str) -> str:
168
+ """
169
+ Preprocessing & cleaning `text` including:
170
+ - lowercasing
171
+ - removing quotation marks
172
+ - removing digits
173
+ - removing punctuation
174
+ - removing brackets, braces, and parentheses
+ - removing stopwords
176
+ - stemming tokens
177
+
178
+ Parameters:
179
+ ------------
180
+ - text : str
181
+ Text need to be processed (cleaned).
182
+
183
+ Returns:
184
+ --------
185
+ - processed_text : str
186
+ A preprocessed version of `text`.
187
+ """
188
+
189
+ text = text.lower()
190
+ # remove quotation marks
191
+ text = re.sub(r"\'", "", text)
192
+ # remove digits
193
+ text = re.sub(r"\d+", "", text)
194
+ # remove punctuation but with keeping '_' letter
195
+ text = "".join([ch for ch in text if (ch == "_") or (ch not in punct)])
196
+ # remove brackets, braces, and parentheses
197
+ text = re.sub(r"[\[\]\(\)\{\}]+", " ", text)
198
+ tokens = nltk.word_tokenize(text)
199
+ # remove stopwords and stemming tokens
200
+ tokens = [stemmer.stem(token)
201
+ for token in tokens if token not in eng_stopwords]
202
+ # convert tokens back to string
203
+ processed_text = " ".join(tokens)
204
+
205
+ return processed_text
206
+
207
+ def convert_text_to_vectors_doc2vec(
208
+ self,
209
+ text_column: pd.Series,
210
+ train: bool = True,
211
+ embeddings_doc2vec: Doc2Vec = None,
212
+ ) -> Tuple[Doc2Vec, pd.DataFrame] | pd.DataFrame:
213
+ """
214
+ Converting `text_column` to vectors using `Doc2Vec` model
215
+
216
+ Parameters:
217
+ ------------
218
+ - text_column : pd.Series
219
+ Contains the case facts.
220
+ - train : bool, optional
221
+ Defines whether the model will be trained or not (if True, a new Doc2Vec is trained;
+ else, the passed `embeddings_doc2vec` is used). (Default is True).
223
+ - embeddings_doc2vec : Doc2Vec, optional
224
+ Trained Doc2Vec model will be used for generating embeddings of `text_column` if
225
+ `train` is False. (Default is None).
226
+
227
+ Returns:
228
+ --------
229
+ 1. A tuple containing the following:
230
+ - embeddings_doc2vec : Doc2Vec
231
+ Trained Doc2Vec model.
232
+ - text_vectors_df : pd.DataFrame
233
+ A DataFrame contains `text_column` vectors if `train` is True.
234
+
235
+ 2. text_vectors_df : pd.DataFrame
236
+ A DataFrame contains `text_column` vectors if `train` is False.
237
+
238
+ Raises:
239
+ -------
240
+ - AssertionError
241
+ If train is False and `embeddings_doc2vec` is None.
242
+ - AssertionError
243
+ If train is False and `embedding_doc2vec` is not an instance of Doc2Vec
244
+ """
245
+
246
+ tokenized_text = self._tokenize_text(text_column)
247
+ tokens_list, tagged_docs = self._convert_to_tagged_document(
248
+ tokenized_text)
249
+
250
+ if train:
251
+ doc2vec_model = Doc2VecModel()
252
+ embeddings_doc2vec = doc2vec_model.train_doc2vec_embeddings_model(
253
+ tagged_docs
254
+ )
255
+ text_vectors_df = self._vectorize_text(
256
+ embeddings_doc2vec, text_column, tokens_list
257
+ )
258
+ return embeddings_doc2vec, text_vectors_df
259
+
260
+ assert (
261
+ embeddings_doc2vec is not None
262
+ ), "`embedding_doc2vec` argument must be not None."
263
+ assert isinstance(
264
+ embeddings_doc2vec, Doc2Vec
265
+ ), "`embedding_doc2vec` argument must be an instance of Doc2Vec to infer vectors."
266
+ text_vectors_df = self._vectorize_text(
267
+ embeddings_doc2vec, text_column, tokens_list
268
+ )
269
+
270
+ return text_vectors_df
271
+
272
+ def convert_text_to_vectors_tf_idf(
273
+ self,
274
+ text_column: pd.Series,
275
+ ngrams: int = 2,
276
+ max_tokens: int = 10000,
277
+ output_mode: str = "tf-idf",
278
+ train: bool = True,
279
+ text_vectorizer: TextVectorization = None,
280
+ ) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
281
+ """
282
+ Converting `text_column` to vectors using `TextVectorization` layer.
283
+
284
+ Parameters:
285
+ ------------
286
+ - text_column : pd.Series
287
+ Contains the case facts.
288
+ - ngrams : int, optional
289
+ Defines the number of n-gram (Default is 2).
290
+ - max_tokens : int, optional
291
+ Defines the number of max_tokens of `text_vectorizer` (Default is 10,000).
292
+ - output_mode : str, optional
293
+ Represents the output vectors type whether it is "tfi-df" or "binary" or "count"
294
+ (Default is "tf-idf").
295
+ - train : bool, optional
296
+ Defines whether the model will be trained or not. (if True, TextVectorization
297
+ will be trained; else, TextVectorization will use the passed `text_vectorizer`).
298
+ (Default is True).
299
+ - text_vectorizer : TextVectorization, optional
300
+ Trained TextVectorization layer will be used for generating embeddings of
301
+ `text_column` if `train` is False. (Default is None).
302
+
303
+ Returns:
304
+ --------
305
+ - if `train` == True:
306
+ A tuple containing the following:
307
+ - text_vectorizer : TextVectorization
308
+ Trained TextVectorization layer.
309
+ - text_vectors : tf.Tensor
310
+ A Tensor contains `text_column` training vectors.
311
+ - otherwise:
312
+ text_vectors : tf.Tensor
313
+ A Tensor contains `text_column` testing vectors.
314
+
315
+ Raises:
316
+ -------
317
+ - AssertionError
318
+ If train is False and `text_vectorizer` is None.
319
+ - AssertionError
320
+ If train is False and `text_vectorizer` is not an instance of TextVectorization.
321
+ """
322
+
323
+ if train:
324
+ text_vectorizer = TextVectorization(
325
+ ngrams=ngrams, max_tokens=max_tokens, output_mode=output_mode
326
+ )
327
+ text_vectorizer.adapt(text_column)
328
+ text_vectors = text_vectorizer(text_column)
329
+
330
+ return text_vectorizer, text_vectors
331
+
332
+ assert (
333
+ text_vectorizer is not None
334
+ ), "`text_vectorizer` argument must be not None."
335
+ assert isinstance(
336
+ text_vectorizer, TextVectorization
337
+ ), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
338
+ text_vectors = text_vectorizer(text_column)
339
+
340
+ return text_vectors
341
+
342
+ def convert_text_to_vectors_cnn(
343
+ self,
344
+ text_column: pd.Series,
345
+ max_tokens: int = 2000,
346
+ output_sequence_length: int = 500,
347
+ output_mode: str = "int",
348
+ train: bool = True,
349
+ text_vectorizer: TextVectorization = None,
350
+ ) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
351
+ """
352
+ Converting `text_column` to vectors using `TextVectorization` layer.
353
+
354
+ Parameters:
355
+ ------------
356
+ - text_column : pd.Series
357
+ Contains the case facts.
358
+ - max_tokens : int, optional
359
+ Defines the number of max_tokens of `text_vectorizer` (Default is 2000).
360
+ - output_sequence_length : int, optional
361
+ Represents the dimensions of the output vector (Default is 500).
362
+ - output_mode : str, optional
363
+ Represents the output vectors type whether it is "int" or "binary" or "tfi-df".
364
+ - train : bool, optional
365
+ Defines whether the model will be trained or not. (if True,
366
+ TextVectorization will be trained; else, TextVectorization will use the
367
+ passed `text_vectorizer`). (Default is True).
368
+ - text_vectorizer : TextVectorization, optional
369
+ Trained TextVectorization layer will be used for generating embeddings of
370
+ `text_column` if `train` is False. (Default is None).
371
+
372
+ Returns:
373
+ --------
374
+ - if `train` == True:
375
+ A tuple containing the following:
376
+ - text_vectorizer : TextVectorization
377
+ Trained TextVectorization layer.
378
+ - text_vectors : tf.Tensor
379
+ A Tensor contains `text_column` training vectors.
380
+ - otherwise:
381
+ text_vectors : tf.Tensor
382
+ A Tensor contains `text_column` testing vectors.
383
+
384
+ Raises:
385
+ -------
386
+ - AssertionError
387
+ If train is False and `text_vectorizer` is None.
388
+ - AssertionError
389
+ If train is False and `text_vectorizer` is not an instance of TextVectorization.
390
+ """
391
+
392
+ if train:
393
+ text_vectorizer = TextVectorization(
394
+ max_tokens=max_tokens,
395
+ output_mode=output_mode,
396
+ output_sequence_length=output_sequence_length,
397
+ )
398
+ text_vectorizer.adapt(text_column)
399
+ text_vectors = text_vectorizer(text_column)
400
+ return text_vectorizer, text_vectors
401
+
402
+ assert (
403
+ text_vectorizer is not None
404
+ ), "`text_vectorizer` argument must be not None."
405
+ assert isinstance(
406
+ text_vectorizer, TextVectorization
407
+ ), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
408
+ text_vectors = text_vectorizer(text_column)
409
+
410
+ return text_vectors
411
+
412
+ def convert_text_to_vectors_glove(
413
+ self,
414
+ text_column: pd.Series,
415
+ train: bool = True,
416
+ glove_tokenizer: Tokenizer = None,
417
+ vocab_size: int = 1000,
418
+ oov_token: str = "<OOV>",
419
+ max_length: int = 50,
420
+ padding_type: str = "post",
421
+ truncation_type: str = "post",
422
+ ) -> Tuple[Tokenizer, np.ndarray] | np.ndarray:
423
+ """
424
+ Converting `text_column` to vectors using `glove_tokenizer`.
425
+
426
+ Parameters:
427
+ ------------
428
+ - text_column : pd.Series
429
+ Contains the case facts.
430
+ - train : bool, optional
431
+ Defines whether the model will be trained or not. (if True,
432
+ Tokenizer will be trained; else, Tokenizer will use the
433
+ passed `glove_tokenizer`). (Default is True).
434
+ - glove_tokenizer : Tokenizer, optional
435
+ Trained Tokenizer layer will be used for generating embeddings of
436
+ `text_column` if `train` is False. (Default is None).
437
+ - vocab_size : int, optional
438
+ Represents the number of supported vocabulary of the Tokenizer,
439
+ any token not in this vocabulary will be treated as an out-of-vocabulary
440
+ token(OOV). (Default is 1000).
441
+ - oov_token : str, optional
442
+ Represents the token of an out-of-vocabulary token (Default is "<OOV>").
443
+ - max_length : int, optional
444
+ Defines the output vector's dimension. (Default is 50).
445
+ - padding_type : str, optional
446
+ Defines the padding type of the vectors, if the vector size is less than
447
+ `max_length`, the rest of the `max_length` will be padded with 0 (Default is "post").
448
+ - truncation_type : str, optional
449
+ Defines the truncation type of the vectors, if the vector size is more than
450
+ `max_length`, the extra of the `max_length` will be truncated (Default is "post").
451
+
452
+ Returns:
453
+ --------
454
+ - if `train` == True:
455
+ A tuple containing the following:
456
+ - glove_tokenizer : Tokenizer
457
+ Trained Tokenizer layer.
458
+ - text_padded : np.ndarray
459
+ An array contains `text_column` vectors.
460
+ - otherwise:
461
+ text_padded : np.ndarray
462
+ An array contains `text_column` vectors.
463
+
464
+ Raises:
465
+ -------
466
+ - AssertionError
467
+ If train is False and `glove_tokenizer` is None.
468
+ - AssertionError
469
+ If train is False and `glove_tokenizer` is not instance of Tokenizer.
470
+ """
471
+
472
+ if train:
473
+ glove_tokenizer = Tokenizer(
474
+ num_words=vocab_size, oov_token=oov_token)
475
+ glove_tokenizer.fit_on_texts(text_column)
476
+ text_sequences = glove_tokenizer.texts_to_sequences(text_column)
477
+ text_padded = pad_sequences(
478
+ text_sequences,
479
+ maxlen=max_length,
480
+ padding=padding_type,
481
+ truncating=truncation_type,
482
+ )
483
+
484
+ return glove_tokenizer, text_padded
485
+
486
+ assert (
487
+ glove_tokenizer is not None
488
+ ), "`glove_tokenizer` argument must be not None."
489
+ assert isinstance(
490
+ glove_tokenizer, Tokenizer
491
+ ), "`glove_tokenizer` argument must be an instance of Tokenizer."
492
+ text_sequences = glove_tokenizer.texts_to_sequences(text_column)
493
+ text_padded = pad_sequences(
494
+ text_sequences,
495
+ maxlen=max_length,
496
+ padding=padding_type,
497
+ truncating=truncation_type,
498
+ )
499
+
500
+ return text_padded
501
+
502
+ def balance_data(self, X_train: pd.Series, y_train: pd.Series) -> pd.DataFrame:
503
+ """
504
+ Balancing `X_train` and `y_train` to distribute the targets in `y_train` equally.
505
+
506
+ Parameters:
507
+ ------------
508
+ - X_train : pd.Series
+ Contains the training case facts.
510
+ - y_train : pd.Series
511
+ Contains the training targets.
512
+
513
+ Returns:
514
+ --------
515
+ - shuffled_balanced_df : pd.DataFrame
516
+ Contains the new balanced dataframe with shuffled indices.
517
+ """
518
+
519
+ df = pd.concat([X_train, y_train], axis=1)
520
+
521
+ first_party = df[df["winner_index"] == 0]
522
+ second_party = df[df["winner_index"] == 1]
523
+
524
+ upsample_second_party = resample(
525
+ second_party, replace=True, n_samples=len(first_party), random_state=42
526
+ )
527
+
528
+ upsample_df = pd.concat([upsample_second_party, first_party])
529
+
530
+ shuffled_indices = np.arange(upsample_df.shape[0])
531
+ np.random.shuffle(shuffled_indices)
532
+
533
+ shuffled_balanced_df = upsample_df.iloc[shuffled_indices, :]
534
+
535
+ return shuffled_balanced_df
536
+
537
+ def anonymize_data(
538
+ self,
539
+ first_party_names: pd.Series,
540
+ second_party_names: pd.Series,
541
+ text_column: pd.Series,
542
+ ) -> pd.Series:
543
+ """
544
+ Anonymize `text_column` by replacing `first_party_names` and
545
+ `second_party_names` wit "_PARTY_" tag.
546
+
547
+ Parameters:
548
+ ------------
549
+ - first_party_names : pd.Series
550
+ Contains all first party names needed to be anonymized.
551
+ - second_party_names : pd.Series
552
+ Contains all second party names needed to be anonymized.
553
+ - text_column : pd.Series
554
+ Contains all texts needed to be anonymized.
555
+
556
+ Returns:
557
+ --------
558
+ - all_anonymized_facts : pd.Series
559
+ Contains anonymized version of `text_column`.
560
+ """
561
+
562
+ all_anonymized_facts = []
563
+
564
+ for i in range(text_column.shape[0]):
565
+ facts = text_column.iloc[i]
566
+ first_party_name = first_party_names.iloc[i]
567
+ second_party_name = second_party_names.iloc[i]
568
+ anonymized_facts = self._anonymize_case_facts(
569
+ first_party_name, second_party_name, facts
570
+ )
571
+ all_anonymized_facts.append(anonymized_facts)
572
+
573
+ return pd.Series(all_anonymized_facts)
574
+
575
+ def preprocess_data(self, text_column: pd.Series) -> pd.Series:
576
+ """
577
+ Preprocessing & cleaning all texts in `text_column`.
578
+
579
+ Parameters:
580
+ ------------
581
+ - text_column : pd.Series
582
+ Contains all case facts.
583
+
584
+ Returns:
585
+ --------
586
+ - preprocessed_text : pd.Series
587
+ Contains all texts after being processed.
588
+ """
589
+
590
+ preprocessed_text = text_column.apply(self._preprocess_text)
591
+ return preprocessed_text
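
A small end-to-end sketch of the Preprocessor on toy data (hedged: the party names and facts below are placeholders, and the NLTK stopwords/punkt resources are assumed to be downloaded):

    import pandas as pd
    from preprocessing import Preprocessor

    preprocessor = Preprocessor()
    first = pd.Series(["John Smith"])
    second = pd.Series(["Acme Corp"])
    facts = pd.Series(["John Smith sued Acme Corp over an unpaid invoice of $1,200."])

    anonymized = preprocessor.anonymize_data(first, second, facts)   # party names -> _PARTY_
    cleaned = preprocessor.preprocess_data(anonymized)               # lowercase, stem, drop stopwords
    vectorizer, vectors = preprocessor.convert_text_to_vectors_tf_idf(cleaned)  # fit a TextVectorization layer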
style.css ADDED
@@ -0,0 +1,94 @@
1
+ @import url('https://fonts.googleapis.com/css2?family=Cairo:wght@300;400;500;600;700;800&display=swap');
2
+
3
+ * {
4
+ font-family: 'Cairo', sans-serif !important;
5
+ }
6
+
7
+ /* title */
8
+ .e16nr0p30 {
9
+ font-weight: 700;
10
+ font-size: 30px;
11
+ }
12
+
13
+ /* buttons */
14
+ .edgvbvh10,
15
+ .edgvbvh5 {
16
+ width: 100%;
17
+ height: 40px;
18
+ background-color: #4756ff;
19
+ color: #fff;
20
+ transition: 0.4s;
21
+ border: none;
22
+ }
23
+
24
+ .edgvbvh10:hover,
25
+ .edgvbvh5:hover {
26
+ background-color: #3747fd;
27
+ color: #fff;
28
+ border: none;
29
+ }
30
+
31
+ .edgvbvh10:focus,
32
+ .edgvbvh5:focus {
33
+ background-color: #3747fd;
34
+ color: #fff !important;
35
+ box-shadow: none;
36
+ border: none;
37
+ }
38
+
39
+ /* header */
40
+ .row_heading {
41
+ font-size: 14px;
42
+ }
43
+
44
+ /* spinner */
45
+ .css-1y04v0k.e17lx80j1,
46
+ .css-p6380s.e17lx80j1 {
47
+ margin: 0px;
48
+ border-color: #34e27f #b3b3b333 #cacaca33 !important;
49
+ -webkit-box-flex: 0;
50
+ flex-grow: 0;
51
+ flex-shrink: 0;
52
+ }
53
+
54
+ /* inputs styling */
55
+ .st-bf {
56
+ transition: 0.8s;
57
+ border: none !important;
58
+ }
59
+
60
+ .st-bf:hover {
61
+ box-shadow: 0 0 0 4px #dbdbdb !important;
62
+ }
63
+
64
+ /* text stylings */
65
+ .highlight-petitioner {
66
+ border-radius: 0.4rem;
67
+ background-color: rgba(253, 231, 142, 0.4);
68
+ color: #ffd061;
69
+ padding: 1px 5px;
70
+ margin-top: 10px;
71
+ margin-right: 5px;
72
+ }
73
+
74
+ .highlight-respondent {
75
+ border-radius: 0.4rem;
76
+ background-color: rgba(78, 170, 255, 0.2);
77
+ color: #6195ff;
78
+ padding: 1px 5px;
79
+ margin-top: 10px;
80
+ margin-right: 5px;
81
+ }
82
+
83
+ .bold-text {
84
+ font-weight: 700 !important;
85
+ }
86
+
87
+ .text-facts {
88
+ line-height: 40px;
89
+ }
90
+
91
+ /* footer */
92
+ footer {
93
+ display: none !important;
94
+ }
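
The selectors above (e.g. .edgvbvh10, the hidden footer, the spinner classes) look like auto-generated Streamlit widget classes, and .highlight-petitioner, .highlight-respondent, and .text-facts match the HTML produced by generate_highlighted_words in deployment_utils.py. A hedged sketch of how such a stylesheet is typically injected into a Streamlit app (the file location and the facts/word lists are placeholders):

    import streamlit as st
    from deployment_utils import generate_highlighted_words

    # inject the custom stylesheet
    with open("style.css") as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

    # render case facts with petitioner/respondent highlights
    html = generate_highlighted_words(facts, petitioner_words, respondent_words)
    st.markdown(html, unsafe_allow_html=True)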
utils.py ADDED
@@ -0,0 +1,389 @@
1
+ from typing import Callable, List, Tuple
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ from gensim.models.doc2vec import Doc2Vec, TaggedDocument
7
+
8
+ import tensorflow as tf
9
+ from tensorflow import keras
10
+ from keras.preprocessing.text import Tokenizer
11
+
12
+
13
+ def read_data(filepath="../csvs/"):
14
+ """
15
+ Reading CSV files of the dataset.
16
+
17
+ Parameters:
18
+ ----------
19
+ - filepath : str
20
+ Defines the path that contains the CSV files.
21
+
22
+ Returns:
23
+ --------
24
+ A tuple containing the following:
25
+ - X_train : pd.DataFrame
+ - X_test : pd.DataFrame
+ - y_train : pd.Series
+ - y_test : pd.Series
29
+ """
30
+
31
+ X_train = pd.read_csv(filepath + "X_train.csv")
32
+ X_train = X_train.iloc[:, 1:]
33
+
34
+ X_test = pd.read_csv(filepath + "X_test.csv")
35
+ X_test = X_test.iloc[:, 1:]
36
+
37
+ y_train = pd.read_csv(filepath + "y_train.csv")
38
+ y_train = y_train.iloc[:, 1:]
39
+
40
+ y_test = pd.read_csv(filepath + "y_test.csv")
41
+ y_test = y_test.iloc[:, 1:]
42
+
43
+ return X_train, X_test, y_train, y_test
44
+
45
+
46
+ def train_model(
47
+ model_building_func: Callable[[], keras.models.Sequential],
48
+ X_train_vectors: pd.DataFrame | np.ndarray | tf.Tensor,
49
+ y_train: pd.Series,
50
+ k: int = 4,
51
+ num_epochs: int = 30,
52
+ batch_size: int = 64,
53
+ ) -> Tuple[
54
+ List[keras.models.Sequential],
55
+ List[List[float]],
56
+ List[List[float]],
57
+ List[List[float]],
58
+ List[List[float]],
59
+ ]:
60
+ """
61
+ Trains a model on `X_train_vectors` and `y_train` using k-fold cross-validation.
62
+
63
+ Parameters:
64
+ -----------
65
+ - model_building_func : Callable[[], tf.keras.models.Sequential]
66
+ A function that builds and compiles a Keras Sequential model.
67
+ - X_train_vectors : pd.DataFrame
68
+ The training input data.
69
+ - y_train : pd.Series
70
+ The training target data.
71
+ - k : int, optional
72
+ The number of folds for cross-validation (default is 4).
73
+ - num_epochs : int, optional
74
+ The number of epochs to train for (default is 30).
75
+ - batch_size : int, optional
76
+ The batch size to use during training (default is 64).
77
+
78
+ Returns:
79
+ --------
80
+ A tuple containing the following items:
81
+ - all_models : List[keras.models.Sequential]
82
+ A list of `k` trained models.
83
+ - all_losses : List[List[float]]
84
+ A `k` by `num_epochs` list containing the training losses for each fold.
85
+ - all_val_losses : List[List[float]]
86
+ A `k` by `num_epochs` list containing the validation losses for each fold.
87
+ - all_acc : List[List[float]]
88
+ A `k` by `num_epochs` list containing the training accuracies for each fold.
89
+ - all_val_acc : List[List[float]]
90
+ A `k` by `num_epochs` list containing the validation accuracies for each fold.
91
+ """
92
+
93
+ num_validation_samples = len(X_train_vectors) // k
94
+
95
+ all_models = []
96
+ all_losses = []
97
+ all_val_losses = []
98
+ all_accuracies = []
99
+ all_val_accuracies = []
100
+
101
+ for fold in range(k):
102
+ print(f"fold: {fold+1}")
103
+ validation_data = X_train_vectors[
104
+ num_validation_samples * fold : num_validation_samples * (fold + 1)
105
+ ]
106
+ validation_targets = y_train[
107
+ num_validation_samples * fold : num_validation_samples * (fold + 1)
108
+ ]
109
+
110
+ training_data = np.concatenate(
111
+ [
112
+ X_train_vectors[: num_validation_samples * fold],
113
+ X_train_vectors[num_validation_samples * (fold + 1) :],
114
+ ]
115
+ )
116
+ training_targets = np.concatenate(
117
+ [
118
+ y_train[: num_validation_samples * fold],
119
+ y_train[num_validation_samples * (fold + 1) :],
120
+ ]
121
+ )
122
+
123
+ model = model_building_func()
124
+ history = model.fit(
125
+ training_data,
126
+ training_targets,
127
+ validation_data=(validation_data, validation_targets),
128
+ epochs=num_epochs,
129
+ batch_size=batch_size,
130
+ )
131
+
132
+ all_models.append(model)
133
+ all_losses.append(history.history["loss"])
134
+ all_val_losses.append(history.history["val_loss"])
135
+ all_accuracies.append(history.history["accuracy"])
136
+ all_val_accuracies.append(history.history["val_accuracy"])
137
+
138
+ return (all_models, all_losses, all_val_losses, all_accuracies, all_val_accuracies)
139
+
140
+
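
For clarity, the fold slicing above carves the first k * (len(X) // k) samples into k contiguous validation blocks; a tiny numeric illustration (hypothetical sizes):

    # with 10 samples and k=4: num_validation_samples = 10 // 4 = 2
    # fold 0 -> validation indices [0:2],  training indices [2:10]
    # fold 1 -> validation indices [2:4],  training indices [0:2] + [4:10]
    # fold 3 -> validation indices [6:8],  training indices [0:6] + [8:10]
    # the trailing 10 - 4*2 = 2 samples are never used for validation (only for training)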
+ def print_testing_loss_accuracy(
+     all_models: List[keras.models.Sequential],
+     X_test_vectors: pd.DataFrame | np.ndarray | tf.Tensor,
+     y_test: pd.Series,
+ ) -> None:
+     """
+     Display the testing loss and testing accuracy of each model in `all_models`,
+     along with their averages.
+
+     Parameters:
+     ------------
+     - all_models : List[keras.models.Sequential]
+         A list of `k` trained models.
+     - X_test_vectors : pd.DataFrame, np.ndarray, or tf.Tensor
+         The testing feature vectors.
+     - y_test : pd.Series
+         The testing labels.
+     """
+
+     sum_testing_losses = 0.0
+     sum_testing_accuracies = 0.0
+
+     for i, model in enumerate(all_models):
+         print(f"model: {i+1}")
+         loss_accuracy = model.evaluate(X_test_vectors, y_test, verbose=1)
+         sum_testing_losses += loss_accuracy[0]
+         sum_testing_accuracies += loss_accuracy[1]
+         print("====" * 20)
+
+     num_models = len(all_models)
+     avg_testing_loss = sum_testing_losses / num_models
+     avg_testing_acc = sum_testing_accuracies / num_models
+     print(f"average testing loss: {avg_testing_loss:.3f}")
+     print(f"average testing accuracy: {avg_testing_acc:.3f}")
+
+
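+ # Editorial note: a minimal, assumed usage sketch for the evaluation helper above; the dummy
+ # test data below stands in for the real `X_test_vectors` / `y_test`, and this helper is
+ # never called by the module.
+ def _example_testing_report(all_models: List[keras.models.Sequential]) -> None:
+     """Illustrative sketch only."""
+     dummy_test_vectors = np.random.rand(100, 50)  # stand-in for the real test vectors
+     dummy_test_labels = np.random.randint(0, 2, size=100)  # stand-in for y_test
+     print_testing_loss_accuracy(all_models, dummy_test_vectors, dummy_test_labels)
+
+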
+ def calculate_average_measures(
+     all_losses: List[List[float]],
+     all_val_losses: List[List[float]],
+     all_accuracies: List[List[float]],
+     all_val_accuracies: List[List[float]],
+ ) -> Tuple[
+     List[float],
+     List[float],
+     List[float],
+     List[float],
+ ]:
+     """
+     Calculate the average measures of cross-validated results.
+
+     Parameters:
+     ------------
+     - all_losses : List[List[float]]
+         A `k` by `num_epochs` list containing the values of training losses.
+     - all_val_losses : List[List[float]]
+         A `k` by `num_epochs` list containing the values of validation losses.
+     - all_accuracies : List[List[float]]
+         A `k` by `num_epochs` list containing the values of training accuracies.
+     - all_val_accuracies : List[List[float]]
+         A `k` by `num_epochs` list containing the values of validation accuracies.
+
+     Returns:
+     --------
+     A tuple containing the following items:
+     - avg_loss_hist : List[float]
+         A list of length `num_epochs` containing the average of training losses.
+     - avg_val_loss_hist : List[float]
+         A list of length `num_epochs` containing the average of validation losses.
+     - avg_acc_hist : List[float]
+         A list of length `num_epochs` containing the average of training accuracies.
+     - avg_val_acc_hist : List[float]
+         A list of length `num_epochs` containing the average of validation accuracies.
+     """
+
+     num_epochs = len(all_losses[0])
+     avg_loss_hist = [np.mean([x[i] for x in all_losses]) for i in range(num_epochs)]
+     avg_val_loss_hist = [
+         np.mean([x[i] for x in all_val_losses]) for i in range(num_epochs)
+     ]
+     avg_acc_hist = [np.mean([x[i] for x in all_accuracies]) for i in range(num_epochs)]
+     avg_val_acc_hist = [
+         np.mean([x[i] for x in all_val_accuracies]) for i in range(num_epochs)
+     ]
+
+     return (avg_loss_hist, avg_val_loss_hist, avg_acc_hist, avg_val_acc_hist)
+
+
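+ # Editorial note: a minimal sketch of averaging per-fold training histories with the helper
+ # above; the history values are fabricated for illustration and this helper is never called.
+ def _example_average_measures() -> None:
+     """Illustrative sketch only."""
+     losses = [[0.9, 0.7, 0.5], [0.8, 0.6, 0.4]]  # 2 folds x 3 epochs
+     val_losses = [[1.0, 0.8, 0.7], [0.9, 0.8, 0.6]]
+     accuracies = [[0.55, 0.65, 0.75], [0.60, 0.70, 0.80]]
+     val_accuracies = [[0.50, 0.60, 0.65], [0.55, 0.62, 0.70]]
+
+     avg_loss, avg_val_loss, avg_acc, avg_val_acc = calculate_average_measures(
+         losses, val_losses, accuracies, val_accuracies
+     )
+     # Each returned list holds one averaged value per epoch,
+     # e.g. avg_loss is approximately [0.85, 0.65, 0.45].
+
+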
+ # TaggedDocument is required by the type hints below; it is not part of the top-level
+ # gensim import, so it is imported here.
+ from gensim.models.doc2vec import TaggedDocument
+
+
+ class Doc2VecModel:
+     """Responsible for creating, initializing, and training the Doc2Vec embeddings model."""
+
+     def __init__(self, vector_size=50, min_count=2, epochs=100, dm=1, window=5) -> None:
+         """
+         Initialize a Doc2Vec model.
+
+         Parameters:
+         ------------
+         - vector_size : int, optional
+             Dimensionality of the feature vectors (Default is 50).
+         - min_count : int, optional
+             Ignores all words with total frequency lower than this (Default is 2).
+         - epochs : int, optional
+             Represents the number of training epochs (Default is 100).
+         - dm : int, optional
+             Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
+             Otherwise, 'distributed bag of words' (PV-DBOW) is employed (Default is 1).
+         - window : int, optional
+             The maximum distance between the current and predicted word within a
+             sentence (Default is 5).
+         """
+
+         self.doc2vec_model = Doc2Vec(
+             vector_size=vector_size,
+             min_count=min_count,
+             epochs=epochs,
+             dm=dm,
+             seed=865,
+             window=window,
+         )
+
+     def train_doc2vec_embeddings_model(
+         self, tagged_docs_train: List[TaggedDocument]
+     ) -> Doc2Vec:
+         """
+         Train the Doc2Vec model on `tagged_docs_train`.
+
+         Parameters:
+         ------------
+         - tagged_docs_train : List[TaggedDocument]
+             The training documents in the tagged format required by Doc2Vec.
+
+         Returns:
+         --------
+         - doc2vec_model : Doc2Vec
+             The trained Doc2Vec model.
+         """
+
+         self.doc2vec_model.build_vocab(tagged_docs_train)
+         self.doc2vec_model.train(
+             tagged_docs_train,
+             total_examples=self.doc2vec_model.corpus_count,
+             epochs=self.doc2vec_model.epochs,
+         )
+
+         return self.doc2vec_model
+
+
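+ # Editorial note: a minimal sketch of training the Doc2Vec embeddings on a tokenized toy
+ # corpus and inferring a vector for a new document; the corpus and hyperparameters are
+ # illustrative assumptions, and this helper is never called by the module.
+ def _example_doc2vec_usage() -> None:
+     """Illustrative sketch only."""
+     tokenized_docs = [["the", "court", "ruled"], ["petitioner", "filed", "an", "appeal"]]
+     tagged_docs = [
+         TaggedDocument(words=tokens, tags=[str(i)]) for i, tokens in enumerate(tokenized_docs)
+     ]
+
+     # min_count=1 so the tiny toy vocabulary is not filtered out entirely.
+     d2v = Doc2VecModel(vector_size=50, min_count=1, epochs=10)
+     embeddings_model = d2v.train_doc2vec_embeddings_model(tagged_docs)
+
+     # Infer a fixed-size vector for an unseen, tokenized document.
+     _vector = embeddings_model.infer_vector(["respondent", "denied", "the", "claim"])
+
+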
+ class GloveModel:
+     """Responsible for creating and generating the GloVe embedding layer."""
+
+     def __init__(self) -> None:
+         pass
+
+     def _generate_glove_embedding_index(
+         self, glove_file_path: str = "GloVe/glove.6B.50d.txt"
+     ) -> dict:
+         """
+         Generate the GloVe embedding index from a pretrained embeddings file.
+
+         Parameters:
+         ------------
+         - glove_file_path : str
+             Defines the path of the pretrained GloVe embeddings text file
+             (Default is "GloVe/glove.6B.50d.txt").
+
+         Returns:
+         --------
+         - embeddings_index : dict
+             Contains each word as a key, and its coefficients (embedding vector) as a value.
+         """
+
+         embeddings_index = {}
+         with open(glove_file_path, encoding="utf8") as f:
+             for line in f:
+                 values = line.split()
+                 word = values[0]
+                 coefs = np.asarray(values[1:], dtype="float32")
+                 embeddings_index[word] = coefs
+
+         return embeddings_index
+
+     def _generate_glove_embedding_matrix(
+         self, word_index: dict, embedding_index: dict, max_length: int
+     ) -> np.ndarray:
+         """
+         Generate the embedding matrix for each word in `word_index`.
+
+         Parameters:
+         -----------
+         - word_index : dict
+             Contains words as keys with their indices as values.
+         - embedding_index : dict
+             Contains each word as a key, and its coefficients (embedding vector) as a value.
+         - max_length : int
+             Defines the size of the embedding vector of each word in the
+             embedding matrix.
+
+         Returns:
+         --------
+         - embedding_matrix : np.ndarray
+             Contains the embedding vectors for all words in `word_index`.
+         """
+
+         embedding_matrix = np.zeros((len(word_index) + 1, max_length))
+
+         for word, i in word_index.items():
+             embedding_vector = embedding_index.get(word)
+             if embedding_vector is not None:
+                 embedding_matrix[i] = embedding_vector
+
+         return embedding_matrix
+
+     def generate_glove_embedding_layer(
+         self, glove_tokenizer: Tokenizer, max_length: int = 50
+     ) -> keras.layers.Embedding:
+         """
+         Create a GloVe embedding layer for later use in the neural network.
+
+         Parameters:
+         ----------
+         - glove_tokenizer : Tokenizer
+             Tokenizer fitted on the training data, used to extract the word index.
+         - max_length : int, optional
+             Defines the maximum length of the output embedding vector for
+             each word (Default is 50).
+
+         Returns:
+         --------
+         - embedding_layer : keras.layers.Embedding
+             An embedding layer of size `word index + 1` by `max_length` with
+             pretrained weights that can be used as a vectorizer of case facts.
+         """
+
+         word_index = glove_tokenizer.word_index
+
+         embedding_index = self._generate_glove_embedding_index()
+         embedding_matrix = self._generate_glove_embedding_matrix(
+             word_index, embedding_index, max_length
+         )
+
+         embedding_layer = keras.layers.Embedding(
+             len(word_index) + 1,
+             max_length,
+             weights=[embedding_matrix],
+             input_length=max_length,
+             trainable=False,
+         )
+
+         return embedding_layer
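+
+
+ # Editorial note: a minimal, commented usage sketch for the GloVe embedding layer above,
+ # kept as comments so it does not affect the module. The corpus variable `train_facts`,
+ # the tokenizer fitting, and the downstream model are illustrative assumptions; the
+ # pretrained vectors must exist at the default path "GloVe/glove.6B.50d.txt".
+ #
+ #     glove_tokenizer = Tokenizer()
+ #     glove_tokenizer.fit_on_texts(train_facts)  # train_facts: list of case-fact strings
+ #     padded = pad_sequences(
+ #         glove_tokenizer.texts_to_sequences(train_facts), maxlen=50, padding="post"
+ #     )
+ #
+ #     embedding_layer = GloveModel().generate_glove_embedding_layer(glove_tokenizer, max_length=50)
+ #     model = keras.models.Sequential(
+ #         [
+ #             embedding_layer,
+ #             keras.layers.GlobalAveragePooling1D(),
+ #             keras.layers.Dense(1, activation="sigmoid"),
+ #         ]
+ #     )
+ #     model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
+ #     model.fit(padded, train_labels, epochs=5)  # train_labels: binary labels, assumed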