Spaces:

Hetan07
/

trial

Runtime error

App Files Files Community

Hetan07 committed on Oct 27, 2023

Commit

0e929cb

•

1 Parent(s): d8be960

Upload 10 files

Browse files

Files changed (10) hide show

csvs/X_test.csv +0 -0
csvs/X_train.csv +0 -0
csvs/y_test.csv +694 -0
csvs/y_train.csv +2772 -0
dataset/task1_data.pkl +3 -0
src/deployment_utils.py +607 -0
src/plotting.py +230 -0
src/preprocessing.py +591 -0
src/style.css +94 -0
src/utils.py +389 -0

csvs/X_test.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

csvs/X_train.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

csvs/y_test.csv ADDED Viewed

	@@ -0,0 +1,694 @@

+,winner_index
+397,1
+559,1
+401,0
+923,0
+334,1
+1726,0
+1653,0
+1905,0
+1899,0
+1455,0
+855,1
+548,1
+1666,0
+1116,1
+344,0
+1277,1
+410,0
+1174,1
+1308,0
+2188,0
+2324,0
+222,1
+1783,0
+1395,0
+888,1
+1275,1
+999,0
+978,0
+175,1
+94,0
+542,0
+341,0
+1833,0
+1027,0
+679,1
+711,1
+306,0
+393,1
+1764,0
+1204,1
+1310,1
+326,0
+414,0
+180,1
+208,0
+1009,0
+753,1
+589,0
+312,0
+336,0
+219,0
+477,0
+355,0
+644,0
+1059,0
+757,1
+450,0
+1279,0
+1304,1
+2250,0
+1124,0
+133,1
+1031,0
+1024,1
+1818,0
+1305,0
+150,1
+177,0
+122,0
+990,1
+489,0
+1692,0
+1106,1
+1623,0
+680,1
+1155,0
+103,0
+494,0
+964,1
+2338,0
+603,0
+1646,0
+1415,0
+800,0
+2161,0
+1159,1
+584,0
+1228,0
+659,0
+1645,0
+612,0
+510,1
+1838,0
+885,1
+1374,0
+621,1
+83,1
+541,0
+844,1
+2237,0
+598,1
+58,1
+522,0
+1307,0
+2045,0
+463,0
+1048,0
+1003,1
+149,1
+1177,0
+556,1
+1758,0
+965,0
+361,0
+2179,0
+2361,0
+10,0
+301,1
+1857,0
+230,0
+113,0
+1516,0
+450,1
+435,0
+1738,0
+697,1
+1007,1
+488,1
+474,1
+2344,0
+975,0
+318,1
+266,0
+17,1
+1328,1
+483,0
+661,0
+683,1
+27,0
+2317,0
+1605,0
+142,0
+666,0
+2038,0
+592,0
+648,0
+1409,0
+634,1
+359,1
+549,0
+626,1
+1824,0
+889,0
+746,1
+62,1
+97,1
+2352,0
+391,1
+1503,0
+1517,0
+886,0
+2068,0
+828,1
+1377,0
+145,1
+1449,0
+1802,0
+167,0
+537,1
+843,0
+825,1
+1909,0
+1250,1
+1797,0
+536,1
+91,1
+1106,0
+996,1
+840,0
+716,1
+1198,0
+227,0
+1386,0
+801,1
+430,0
+1695,0
+328,1
+1337,1
+849,0
+1220,1
+1815,0
+913,1
+650,1
+1072,0
+1023,0
+636,1
+106,0
+473,0
+660,1
+1398,0
+510,0
+181,0
+882,0
+1385,0
+1298,1
+796,0
+387,0
+431,1
+1271,1
+1332,1
+639,0
+472,0
+545,1
+65,1
+2232,0
+397,0
+214,1
+688,1
+356,0
+405,1
+1348,1
+18,1
+1830,0
+739,1
+1788,0
+1435,0
+544,0
+1750,0
+663,1
+892,0
+476,1
+770,0
+2249,0
+2139,0
+1123,1
+1326,1
+1107,0
+1349,1
+816,0
+265,0
+1640,0
+1234,0
+553,1
+1933,0
+1921,0
+909,1
+691,0
+772,0
+872,0
+675,1
+1160,1
+585,0
+1664,0
+150,0
+1053,1
+700,1
+617,0
+1892,0
+1249,1
+205,0
+470,0
+348,1
+582,1
+1002,0
+1554,0
+290,0
+502,1
+130,0
+2046,0
+471,1
+635,0
+987,0
+493,0
+1242,1
+38,1
+910,1
+1609,0
+85,1
+760,1
+244,0
+1762,0
+622,0
+420,0
+420,1
+1212,0
+2111,0
+1082,0
+1059,1
+1081,1
+533,1
+2365,0
+466,0
+898,1
+705,1
+631,1
+1300,1
+1443,0
+800,1
+984,0
+1615,0
+1260,0
+1672,0
+1636,0
+582,0
+1330,1
+61,0
+1010,1
+2299,0
+1796,0
+907,1
+586,1
+1032,1
+1613,0
+950,0
+249,1
+1948,0
+132,1
+1011,0
+1224,1
+1352,0
+308,1
+1751,0
+724,0
+766,1
+1316,0
+821,0
+721,0
+982,1
+941,1
+1025,0
+1603,0
+1014,1
+989,0
+123,0
+2222,0
+390,1
+565,1
+293,0
+333,1
+2115,0
+216,0
+778,1
+1073,1
+812,0
+1264,0
+758,1
+1223,0
+1877,0
+577,1
+104,1
+91,0
+1233,0
+245,1
+1262,1
+1188,1
+1208,1
+956,1
+1644,0
+802,0
+1383,0
+1649,0
+780,0
+1050,0
+1343,1
+1424,0
+116,1
+1835,0
+191,1
+475,0
+1224,0
+1820,0
+23,0
+1626,0
+2274,0
+381,0
+757,0
+752,0
+335,0
+475,1
+608,1
+895,1
+1241,1
+192,0
+925,0
+696,1
+764,1
+432,1
+899,1
+1154,0
+677,0
+507,1
+2180,0
+1390,0
+1648,0
+335,1
+24,0
+1722,0
+600,1
+295,0
+1437,0
+283,0
+592,1
+1963,0
+546,1
+2112,0
+942,1
+543,0
+2351,0
+60,1
+924,1
+195,0
+1075,0
+404,1
+161,0
+1915,0
+1241,0
+2138,0
+39,1
+376,0
+462,1
+520,1
+166,1
+1199,0
+1902,0
+918,0
+1425,0
+1019,1
+570,1
+347,0
+842,1
+751,0
+1885,0
+664,0
+1454,0
+707,0
+683,0
+1289,1
+47,1
+198,0
+44,0
+1427,0
+1043,0
+24,1
+973,0
+1684,0
+828,0
+197,0
+1812,0
+2193,0
+1366,0
+623,0
+1141,1
+995,0
+2113,0
+1157,1
+784,0
+1755,0
+444,1
+2205,0
+723,1
+916,0
+1391,0
+417,0
+377,0
+550,1
+1048,1
+261,1
+1679,0
+197,1
+1209,1
+86,0
+1102,1
+94,1
+110,0
+144,0
+803,0
+193,1
+337,0
+316,0
+1924,0
+1290,1
+370,1
+853,0
+933,1
+1898,0
+265,1
+662,1
+839,1
+1451,0
+671,0
+552,0
+1506,0
+1715,0
+110,1
+1988,0
+2320,0
+945,0
+1232,0
+681,0
+1292,0
+2006,0
+643,1
+1638,0
+1096,1
+917,1
+2177,0
+1011,1
+355,1
+1,1
+1057,0
+418,0
+1149,0
+563,0
+899,0
+1151,1
+799,1
+462,0
+327,1
+1142,1
+1358,0
+948,0
+1064,1
+131,0
+1279,1
+1564,0
+112,1
+1005,0
+1682,0
+918,1
+793,0
+32,1
+647,1
+424,1
+1070,0
+1996,0
+1159,0
+275,1
+490,1
+2163,0
+736,1
+973,1
+2321,0
+601,1
+901,0
+2085,0
+293,1
+1459,0
+1244,1
+905,0
+935,1
+887,1
+95,1
+108,1
+1044,0
+479,0
+370,0
+1126,0
+255,1
+1254,0
+169,1
+28,0
+216,1
+863,0
+1965,0
+581,0
+486,0
+163,0
+1038,0
+2042,0
+130,1
+1229,1
+1851,0
+891,1
+761,1
+136,1
+868,1
+1195,1
+1978,0
+497,0
+817,0
+517,0
+262,0
+1147,0
+446,0
+964,0
+314,0
+1181,1
+250,1
+1207,0
+823,1
+187,1
+726,1
+886,1
+1180,0
+542,1
+1122,1
+228,1
+1346,1
+2013,0
+569,0
+336,1
+2167,0
+1987,0
+354,1
+607,1
+550,0
+2131,0
+678,0
+2063,0
+170,0
+1201,1
+332,0
+806,1
+1476,0
+195,1
+818,0
+1276,1
+2175,0
+622,1
+141,1
+140,0
+549,1
+576,0
+30,0
+718,0
+1577,0
+1100,0
+1592,0
+2101,0
+2089,0
+753,0
+921,0
+1145,0
+612,1
+2062,0
+1217,0
+534,0
+400,1
+766,0
+491,0
+1128,1
+1512,0
+1163,1
+485,1
+1022,0
+70,1
+1373,0
+1060,0
+867,1
+893,0
+2255,0
+776,0
+1826,0
+271,1
+2316,0
+1556,0
+878,0
+2066,0
+1148,1
+1662,0
+174,1
+1051,1
+548,0
+455,0
+628,0
+685,1
+833,1
+730,1
+2226,0
+433,1
+992,1
+1055,1
+1487,0
+1321,0
+1823,0
+1678,0
+1509,0
+131,1
+323,1
+2069,0
+1282,0
+457,0
+1008,0
+129,0
+1121,0
+42,1
+203,0
+156,1
+1054,0
+539,1
+1119,0
+732,0
+1819,0

csvs/y_train.csv ADDED Viewed

	@@ -0,0 +1,2772 @@

+,winner_index
+182,0
+1245,0
+880,0
+170,1
+1171,1
+940,0
+16,0
+605,0
+2256,0
+2367,0
+1744,0
+1560,0
+985,1
+1945,0
+926,1
+595,0
+346,1
+855,0
+102,0
+1472,0
+488,0
+786,0
+2105,0
+1514,0
+380,1
+2130,0
+432,0
+1273,0
+852,0
+386,1
+272,1
+181,1
+392,1
+851,1
+2238,0
+427,0
+1119,1
+572,1
+806,0
+530,1
+405,0
+1416,0
+241,1
+1031,1
+2363,0
+689,0
+304,1
+1005,1
+25,0
+2332,0
+896,0
+179,1
+1242,0
+2372,0
+1711,0
+1128,0
+344,1
+715,1
+1521,0
+2356,0
+2257,0
+1298,0
+1290,0
+175,0
+456,1
+495,1
+613,0
+1326,0
+183,0
+627,1
+2199,0
+606,0
+1445,0
+77,1
+1544,0
+1110,1
+1862,0
+1716,0
+378,0
+1082,1
+1158,0
+1094,1
+1868,0
+937,1
+2306,0
+890,1
+426,0
+1276,0
+1072,1
+2296,0
+1074,1
+1701,0
+211,1
+93,0
+604,0
+111,1
+1342,1
+48,1
+690,1
+1177,1
+254,1
+1798,0
+2230,0
+619,0
+545,0
+1479,0
+1185,0
+771,1
+1175,0
+1651,0
+214,0
+1907,0
+2044,0
+1465,0
+812,1
+1077,1
+791,0
+200,1
+847,0
+1339,1
+1101,1
+192,1
+1872,0
+429,0
+271,0
+2135,0
+1187,0
+1247,0
+299,0
+976,1
+831,0
+652,1
+865,1
+190,1
+515,1
+371,1
+2005,0
+632,1
+1317,1
+425,1
+76,0
+299,1
+1036,0
+1084,0
+281,0
+783,1
+1207,1
+1220,0
+237,0
+762,0
+1922,0
+670,1
+1853,0
+126,1
+2221,0
+229,1
+1080,1
+2265,0
+813,1
+1211,1
+1113,0
+1105,0
+303,1
+834,1
+1328,0
+748,0
+151,1
+1315,1
+47,0
+290,1
+229,0
+672,0
+714,1
+1375,0
+1297,1
+623,1
+352,1
+460,1
+1144,1
+1143,0
+566,0
+539,0
+1295,1
+1655,0
+939,1
+1482,0
+9,1
+374,0
+1076,1
+1208,0
+1357,0
+1389,0
+88,0
+870,1
+2247,0
+698,1
+903,1
+915,0
+1874,0
+1292,1
+59,0
+526,0
+1263,0
+638,1
+669,1
+1026,1
+890,0
+2110,0
+668,0
+1941,0
+1105,1
+738,1
+2092,0
+931,0
+1401,0
+1345,0
+929,0
+933,0
+2165,0
+781,1
+136,0
+11,0
+2362,0
+1500,0
+802,1
+1402,0
+407,1
+445,0
+1189,1
+1675,0
+848,0
+1203,0
+785,0
+1066,0
+749,0
+1318,0
+980,1
+1234,1
+698,0
+2326,0
+1502,0
+505,0
+1840,0
+2,1
+844,0
+167,1
+528,1
+11,1
+762,1
+805,0
+1747,0
+430,1
+411,1
+2169,0
+113,1
+219,1
+1404,0
+647,0
+657,0
+798,0
+1100,1
+620,1
+1121,1
+2305,0
+1896,0
+1746,0
+1710,0
+1569,0
+740,0
+1227,1
+1089,1
+525,1
+55,1
+630,0
+562,0
+1299,0
+1611,0
+747,1
+36,0
+2330,0
+653,1
+2129,0
+765,0
+56,1
+991,1
+452,1
+1221,1
+362,0
+1083,1
+460,0
+1900,0
+2301,0
+1414,0
+1017,1
+2314,0
+308,0
+1870,0
+316,1
+518,0
+349,0
+1136,0
+217,1
+2244,0
+614,1
+1844,0
+1299,1
+343,1
+959,0
+1541,0
+930,1
+1343,0
+120,1
+2280,0
+83,0
+296,1
+841,0
+7,1
+365,0
+21,0
+1205,1
+1096,0
+115,0
+936,1
+87,0
+2026,0
+442,0
+160,1
+307,1
+1156,0
+1773,0
+534,1
+570,0
+599,0
+803,1
+1494,0
+578,0
+883,0
+950,1
+2036,0
+203,1
+2100,0
+13,1
+1702,0
+1230,0
+1349,0
+274,0
+673,1
+569,1
+248,0
+2349,0
+69,1
+1432,0
+459,0
+836,1
+385,0
+1323,1
+1302,0
+1951,0
+1341,1
+291,1
+1335,1
+161,1
+2166,0
+1114,1
+252,1
+423,0
+1452,0
+1186,0
+583,1
+1889,0
+579,1
+324,0
+139,1
+1691,0
+1034,1
+342,1
+374,1
+790,1
+147,1
+748,1
+600,0
+1098,1
+1768,0
+317,0
+81,0
+1340,0
+472,1
+934,1
+822,0
+1331,1
+1035,1
+1319,0
+1243,0
+1260,1
+0,1
+1674,0
+257,1
+2334,0
+1338,1
+1405,0
+1928,0
+1620,0
+715,0
+2182,0
+1595,0
+1216,1
+453,1
+1308,1
+624,0
+1051,0
+71,0
+22,1
+1527,0
+1867,0
+282,0
+338,1
+320,1
+10,1
+41,1
+961,1
+1728,0
+561,0
+744,1
+535,0
+1047,1
+575,1
+1303,0
+168,1
+1420,0
+1665,0
+1515,0
+1311,1
+2359,0
+1351,0
+788,1
+1274,1
+684,1
+1736,0
+674,0
+1686,0
+164,0
+1132,0
+621,0
+109,0
+6,1
+373,1
+1008,1
+1929,0
+1287,0
+568,0
+652,0
+4,0
+172,1
+312,1
+2376,0
+85,0
+300,0
+1075,1
+787,0
+14,1
+489,1
+407,0
+369,1
+275,0
+2055,0
+1103,0
+314,1
+487,1
+135,1
+134,1
+1122,0
+1286,0
+2033,0
+866,1
+478,1
+722,1
+792,0
+95,0
+454,0
+313,1
+597,0
+343,0
+541,1
+2346,0
+2212,0
+349,1
+1323,0
+104,0
+2070,0
+808,0
+815,1
+1990,0
+1583,0
+201,1
+1050,1
+834,0
+218,0
+602,1
+2194,0
+580,0
+100,1
+1488,0
+1265,1
+850,1
+46,1
+231,0
+2076,0
+2151,0
+125,1
+1071,0
+419,1
+1334,0
+1344,1
+775,1
+1152,0
+709,1
+1204,0
+1164,0
+609,1
+419,0
+944,0
+2303,0
+1297,0
+1799,0
+526,1
+353,0
+2302,0
+889,1
+1980,0
+1238,0
+225,1
+468,1
+1060,1
+287,1
+1765,0
+1629,0
+559,0
+642,0
+2001,0
+740,1
+463,1
+1198,1
+1680,0
+210,0
+1117,0
+1992,0
+1189,0
+1269,0
+728,1
+297,1
+239,1
+2174,0
+159,1
+879,1
+2127,0
+1384,0
+1800,0
+72,1
+975,1
+272,0
+339,1
+1372,0
+1307,1
+1133,0
+691,1
+731,0
+74,1
+2266,0
+158,0
+790,0
+938,1
+1559,0
+491,1
+473,1
+1045,1
+732,1
+1091,1
+536,0
+1301,0
+2019,0
+1182,0
+661,1
+1071,1
+189,0
+501,0
+745,1
+1125,1
+1704,0
+341,1
+815,0
+233,1
+840,1
+809,0
+1120,1
+589,1
+1192,0
+1673,0
+2197,0
+369,0
+767,0
+687,0
+241,0
+402,0
+651,1
+618,0
+963,1
+551,0
+671,1
+915,1
+597,1
+1314,1
+587,0
+1203,1
+152,1
+994,1
+767,1
+2243,0
+295,1
+364,1
+2297,0
+1995,0
+702,1
+1832,0
+317,1
+1120,0
+1086,0
+673,0
+777,0
+1423,0
+1473,0
+1137,1
+1376,0
+754,1
+284,1
+1312,1
+846,0
+669,0
+832,0
+884,1
+1316,1
+34,0
+1249,0
+2210,0
+321,1
+1251,1
+268,0
+41,0
+464,0
+69,0
+992,0
+306,1
+213,0
+108,0
+215,0
+512,1
+874,1
+846,1
+28,1
+1604,0
+1520,0
+499,0
+449,0
+116,0
+1040,1
+1092,1
+1761,0
+1484,0
+760,0
+1140,1
+395,1
+1142,0
+1586,0
+1092,0
+300,1
+2192,0
+830,1
+1748,0
+269,1
+1294,1
+1231,0
+701,1
+326,1
+1439,0
+928,1
+398,0
+145,0
+1670,0
+428,1
+2253,0
+1009,1
+986,0
+270,0
+1382,0
+924,0
+204,1
+540,0
+1491,0
+1522,0
+1113,1
+199,1
+298,0
+838,0
+2366,0
+302,0
+982,0
+1123,0
+1183,1
+60,0
+1133,1
+1061,0
+664,1
+712,1
+1813,0
+820,0
+1743,0
+977,0
+909,0
+689,1
+759,0
+2375,0
+761,0
+2128,0
+309,1
+120,0
+1314,0
+988,0
+1669,0
+1162,1
+1272,1
+1007,0
+1959,0
+1057,1
+1707,0
+580,1
+1387,0
+1066,1
+557,1
+499,1
+1440,0
+854,1
+252,0
+1135,1
+372,0
+2189,0
+1184,1
+794,0
+198,1
+1296,0
+1821,0
+330,1
+1218,1
+826,0
+1336,0
+123,1
+2010,0
+1270,0
+1413,0
+358,0
+665,1
+396,0
+367,1
+752,1
+1214,0
+416,1
+1789,0
+2067,0
+1193,1
+322,0
+1016,1
+2154,0
+1767,0
+494,1
+19,1
+332,1
+35,0
+1632,0
+376,1
+66,1
+144,1
+72,0
+959,1
+649,1
+658,0
+263,0
+2261,0
+946,1
+1330,0
+1042,1
+1588,0
+1518,0
+2248,0
+1230,1
+1849,0
+1817,0
+1699,0
+2358,0
+1185,1
+769,0
+1232,1
+484,1
+40,1
+1277,0
+829,1
+637,1
+1190,0
+483,1
+1667,0
+9,0
+1759,0
+368,0
+2288,0
+2118,0
+721,1
+138,0
+117,1
+103,1
+1035,0
+1852,0
+373,0
+2381,0
+2204,0
+1477,0
+48,0
+533,0
+1194,0
+1055,0
+124,0
+1288,1
+1829,0
+400,0
+1197,0
+8,0
+59,1
+1319,1
+717,1
+994,0
+410,1
+255,0
+1860,0
+506,0
+2152,0
+1769,0
+598,0
+5,0
+2245,0
+755,0
+1331,0
+188,1
+1447,0
+2201,0
+1069,1
+289,1
+1339,0
+206,1
+1890,0
+1078,1
+504,1
+337,1
+1348,0
+1049,1
+603,1
+1912,0
+979,0
+477,1
+222,0
+972,1
+981,1
+795,1
+171,0
+16,1
+594,1
+121,1
+202,1
+2336,0
+1070,1
+1265,0
+604,1
+646,1
+2347,0
+2094,0
+560,1
+496,0
+239,0
+1456,0
+242,0
+897,0
+448,0
+2061,0
+679,0
+532,0
+1727,0
+1286,1
+653,0
+1850,0
+1193,0
+854,0
+771,0
+606,1
+1396,0
+641,0
+493,1
+960,0
+2034,0
+651,0
+451,0
+2383,0
+1263,1
+1165,1
+1153,1
+84,0
+996,0
+1236,1
+945,1
+976,0
+1094,0
+877,0
+2025,0
+1039,1
+857,0
+1267,1
+1280,1
+1088,0
+1116,0
+157,1
+555,1
+962,0
+285,0
+1197,1
+246,0
+263,1
+983,1
+619,1
+968,1
+1309,0
+484,0
+869,0
+643,0
+1322,1
+979,1
+756,1
+1252,0
+734,0
+870,0
+133,0
+1010,0
+223,1
+1671,0
+93,1
+1942,0
+969,0
+687,1
+1099,1
+1548,0
+860,0
+511,1
+554,1
+447,1
+620,0
+640,1
+587,1
+642,1
+1169,1
+258,0
+1471,0
+882,1
+930,0
+2185,0
+843,1
+413,1
+632,0
+640,0
+1906,0
+8,1
+1327,1
+1199,1
+50,1
+234,0
+584,1
+1213,0
+328,0
+1076,0
+1991,0
+794,1
+568,1
+565,0
+1555,0
+92,1
+920,1
+1805,0
+923,1
+324,1
+346,0
+856,0
+12,0
+1079,1
+274,1
+2208,0
+814,1
+1792,0
+1054,1
+2155,0
+2339,0
+881,1
+1033,0
+363,0
+393,0
+151,0
+1621,0
+82,0
+440,1
+871,1
+1244,0
+787,1
+452,0
+2126,0
+1700,0
+743,0
+100,0
+1315,0
+381,1
+954,1
+395,0
+1790,0
+19,0
+1139,1
+861,1
+294,1
+1780,0
+119,0
+1154,1
+137,1
+2254,0
+1172,1
+207,1
+563,1
+43,0
+1140,0
+112,0
+2279,0
+1178,1
+881,0
+1597,0
+596,0
+283,1
+680,0
+322,1
+537,0
+1097,1
+564,1
+558,1
+146,0
+210,1
+968,0
+282,1
+281,1
+29,1
+525,0
+538,1
+339,0
+2145,0
+758,0
+518,1
+1062,0
+591,1
+177,1
+1723,0
+152,0
+1001,1
+2158,0
+345,1
+659,1
+81,1
+325,0
+628,1
+402,1
+1356,0
+98,0
+1056,1
+1210,1
+1148,0
+3,1
+686,1
+1782,0
+56,0
+522,1
+1030,0
+1791,0
+971,1
+32,0
+978,1
+588,1
+678,1
+2292,0
+386,0
+737,0
+1925,0
+609,0
+364,0
+763,0
+645,0
+1847,0
+1104,1
+1461,0
+137,0
+240,0
+674,1
+601,0
+2088,0
+155,1
+938,0
+907,0
+74,0
+911,1
+974,0
+742,1
+983,0
+940,1
+347,1
+492,1
+80,0
+1231,1
+1724,0
+782,1
+1495,0
+2200,0
+51,0
+751,1
+191,0
+1422,0
+496,1
+974,1
+805,1
+1657,0
+433,0
+438,0
+285,1
+1897,0
+1175,1
+1320,0
+2331,0
+1546,0
+780,1
+820,1
+1266,0
+977,1
+1496,0
+1763,0
+1842,0
+1144,0
+1547,0
+228,0
+1708,0
+577,0
+401,1
+418,1
+511,0
+590,1
+1013,1
+567,0
+470,1
+227,1
+436,1
+1047,0
+822,1
+117,0
+29,0
+997,0
+1093,0
+1090,1
+624,1
+164,1
+2378,0
+1084,1
+75,1
+793,1
+910,0
+676,0
+908,0
+2029,0
+92,0
+38,0
+505,1
+409,1
+1388,0
+2122,0
+1997,0
+441,0
+134,0
+107,0
+1026,0
+236,1
+261,0
+1602,0
+2195,0
+1937,0
+2121,0
+627,0
+697,0
+1340,1
+516,1
+467,1
+1173,0
+1448,0
+2065,0
+456,0
+1474,0
+2322,0
+1274,0
+1784,0
+773,1
+818,1
+153,1
+53,0
+1109,1
+2350,0
+1578,0
+1858,0
+321,0
+986,1
+180,0
+1115,0
+1873,0
+1869,0
+416,0
+187,0
+819,1
+220,0
+156,0
+1827,0
+685,0
+1255,1
+952,1
+2341,0
+595,1
+509,0
+251,0
+2172,0
+1934,0
+118,1
+521,0
+1237,1
+807,0
+345,0
+457,1
+955,1
+633,0
+330,0
+423,1
+21,1
+875,1
+280,1
+1165,0
+70,0
+404,0
+389,1
+1012,1
+929,1
+1378,0
+1025,1
+1453,0
+730,0
+1342,0
+486,1
+1540,0
+528,0
+1146,0
+1291,1
+148,1
+778,0
+1151,0
+675,0
+823,0
+2187,0
+498,0
+2134,0
+1004,0
+1248,1
+1273,1
+912,0
+984,1
+615,1
+1013,0
+1760,0
+469,1
+2325,0
+1311,0
+958,0
+194,0
+811,1
+736,0
+1734,0
+842,0
+426,1
+1163,0
+845,0
+1660,0
+1866,0
+629,1
+1238,1
+849,1
+467,0
+741,0
+1324,1
+749,1
+247,1
+1668,0
+25,1
+185,0
+1584,0
+1338,0
+223,0
+657,1
+440,0
+558,0
+1063,0
+792,1
+1772,0
+121,0
+727,0
+527,1
+1164,1
+764,0
+278,0
+773,0
+1168,1
+1859,0
+1046,1
+926,0
+1019,0
+52,0
+1044,1
+1136,1
+1910,0
+277,0
+305,0
+523,0
+922,1
+2304,0
+63,0
+211,0
+1549,0
+2020,0
+1394,0
+396,1
+919,0
+50,0
+1450,0
+831,1
+240,1
+1310,0
+958,1
+58,0
+856,1
+478,0
+636,0
+1137,0
+357,1
+1081,0
+953,1
+1696,0
+1806,0
+279,1
+713,1
+2364,0
+114,0
+190,0
+371,0
+221,0
+720,1
+1508,0
+310,0
+903,0
+795,0
+1102,0
+2272,0
+479,1
+1186,1
+585,1
+64,1
+967,1
+1434,0
+1579,0
+1111,1
+616,1
+1777,0
+993,0
+873,0
+774,1
+1157,0
+824,1
+1919,0
+608,0
+917,0
+1037,0
+1324,0
+379,0
+1368,0
+635,1
+30,1
+1718,0
+231,1
+1103,1
+4,1
+417,1
+2043,0
+68,1
+1347,1
+1836,0
+481,1
+1596,0
+1468,0
+807,1
+315,1
+1807,0
+599,1
+1257,1
+797,1
+412,0
+246,1
+943,1
+1262,0
+124,1
+724,1
+1463,0
+260,0
+562,1
+799,0
+768,1
+487,0
+637,0
+193,0
+1018,1
+1865,0
+273,0
+579,0
+209,0
+704,0
+1064,0
+808,1
+1083,0
+226,0
+398,1
+524,1
+2337,0
+891,0
+1272,0
+894,0
+517,1
+1293,0
+705,0
+630,1
+862,0
+1162,0
+33,1
+183,1
+2176,0
+277,1
+1705,0
+1141,0
+1616,0
+914,1
+1221,0
+1936,0
+435,1
+865,0
+276,1
+1891,0
+1268,1
+1226,1
+304,0
+40,0
+377,1
+1091,0
+57,0
+200,0
+1267,0
+789,1
+206,0
+34,1
+972,0
+2353,0
+232,1
+1161,1
+535,1
+1637,0
+1127,0
+2223,0
+242,1
+46,0
+1253,1
+22,0
+357,0
+439,0
+1030,1
+876,0
+1462,0
+153,0
+235,0
+1253,0
+2214,0
+251,1
+1561,0
+714,0
+677,1
+311,0
+1256,0
+922,0
+1087,1
+353,1
+519,0
+1489,0
+1015,1
+1334,1
+1278,1
+1721,0
+2098,0
+221,1
+931,1
+1557,0
+2258,0
+1893,0
+905,1
+504,0
+1181,0
+770,1
+80,1
+2096,0
+1725,0
+1022,1
+1127,1
+1379,0
+508,1
+1598,0
+2109,0
+1720,0
+1730,0
+531,0
+15,1
+1418,0
+273,1
+1861,0
+135,0
+1475,0
+699,0
+553,0
+852,1
+725,1
+500,1
+1828,0
+692,0
+1261,1
+524,0
+1179,0
+1940,0
+540,1
+1275,0
+2227,0
+90,1
+320,0
+706,1
+362,1
+173,1
+634,0
+1932,0
+965,1
+365,1
+302,1
+458,0
+105,1
+98,1
+1571,0
+127,0
+2183,0
+1694,0
+682,0
+387,1
+264,1
+482,1
+551,1
+1108,0
+745,0
+937,0
+2116,0
+313,0
+168,0
+719,0
+1753,0
+804,1
+1947,0
+1000,0
+1131,1
+935,0
+1863,0
+1337,0
+530,0
+20,1
+1313,1
+660,0
+6,0
+114,1
+443,0
+2354,0
+1085,0
+1717,0
+692,1
+1303,1
+906,0
+1261,0
+552,1
+394,0
+1171,0
+2072,0
+613,1
+832,1
+1962,0
+1531,0
+1179,1
+1888,0
+663,0
+1871,0
+267,0
+329,0
+1430,0
+2164,0
+1733,0
+1069,0
+465,1
+1095,1
+2269,0
+658,1
+1110,0
+1642,0
+813,0
+106,1
+111,0
+471,0
+292,1
+358,1
+189,1
+655,1
+2310,0
+1300,0
+414,1
+1098,0
+403,1
+1572,0
+1317,0
+35,1
+876,1
+1880,0
+1970,0
+896,1
+2015,0
+160,0
+1243,1
+1714,0
+2294,0
+694,0
+1228,1
+97,0
+165,0
+722,0
+521,1
+1222,0
+469,0
+811,0
+971,0
+115,1
+248,1
+363,1
+79,0
+1259,0
+1505,0
+835,1
+512,0
+243,0
+564,0
+196,1
+830,0
+1235,0
+1894,0
+508,0
+15,0
+89,1
+5,1
+1344,0
+1169,0
+1226,0
+948,1
+1438,0
+383,0
+2379,0
+1345,1
+1271,0
+1538,0
+2267,0
+729,1
+185,1
+1729,0
+763,1
+701,0
+633,1
+1381,0
+932,1
+1576,0
+1166,0
+162,1
+234,1
+949,0
+610,0
+2097,0
+1499,0
+1464,0
+1028,0
+389,0
+904,1
+1467,0
+1406,0
+1073,0
+1511,0
+1436,0
+999,1
+503,1
+574,1
+706,0
+182,1
+1138,1
+708,1
+503,0
+946,0
+2309,0
+372,1
+224,1
+872,1
+566,1
+1021,1
+814,0
+571,1
+785,1
+73,1
+719,1
+1843,0
+288,1
+1614,0
+1112,0
+1017,0
+1093,1
+2251,0
+54,0
+631,0
+662,0
+385,1
+742,0
+178,1
+796,1
+1681,0
+449,1
+2233,0
+1795,0
+2114,0
+801,0
+887,0
+142,1
+906,1
+66,0
+406,1
+1690,0
+829,0
+73,0
+238,1
+1856,0
+176,0
+149,0
+1216,0
+908,1
+951,1
+927,0
+1513,0
+1712,0
+702,0
+1129,0
+1256,1
+919,1
+1884,0
+311,1
+36,1
+1794,0
+2093,0
+82,1
+1018,0
+2027,0
+1065,1
+1639,0
+809,1
+1318,1
+90,0
+1074,0
+2342,0
+1305,1
+947,1
+1168,0
+67,1
+270,1
+388,0
+1306,1
+1040,0
+352,0
+453,0
+1545,0
+2207,0
+645,1
+1046,0
+857,1
+644,1
+1519,0
+513,0
+2173,0
+268,1
+718,1
+109,1
+693,0
+759,1
+997,1
+1285,1
+1068,1
+2217,0
+1289,0
+1079,0
+789,0
+45,1
+867,0
+1446,0
+1581,0
+529,1
+2106,0
+1735,0
+2078,0
+879,0
+501,1
+1240,0
+947,0
+1052,0
+2278,0
+2368,0
+1787,0
+1202,0
+746,0
+1039,0
+781,0
+303,0
+1214,1
+573,1
+391,0
+735,0
+625,0
+1215,1
+1042,0
+147,0
+14,0
+1294,0
+1628,0
+256,0
+1080,0
+1312,0
+694,1
+1213,1
+78,0
+286,0
+827,1
+894,1
+141,0
+1283,0
+348,0
+1115,1
+1052,1
+557,0
+1770,0
+1259,1
+447,0
+53,1
+862,1
+1037,1
+775,0
+1268,0
+704,1
+1347,0
+860,1
+2017,0
+233,0
+1781,0
+1329,1
+86,1
+354,0
+824,0
+1683,0
+31,1
+1063,1
+388,1
+1886,0
+331,1
+359,0
+37,1
+1134,1
+279,0
+1745,0
+375,1
+962,1
+422,1
+51,1
+2328,0
+1269,1
+1421,0
+171,1
+895,0
+126,0
+1804,0
+1709,0
+1689,0
+309,0
+425,0
+1528,0
+1341,0
+237,1
+399,0
+1001,0
+838,1
+1779,0
+1650,0
+755,1
+27,1
+179,0
+1000,1
+2246,0
+939,0
+514,1
+1219,1
+868,0
+1219,0
+444,0
+667,0
+2240,0
+538,0
+738,0
+774,0
+1333,1
+1192,1
+1132,1
+378,1
+817,1
+1108,1
+243,1
+2,0
+709,0
+626,0
+1191,0
+654,0
+641,1
+988,1
+783,0
+101,0
+3,0
+991,0
+415,0
+128,1
+2360,0
+96,1
+476,0
+1336,1
+2198,0
+1460,0
+207,0
+1215,0
+1062,1
+399,1
+519,1
+1776,0
+966,1
+253,1
+1251,0
+1156,1
+1532,0
+1139,0
+1756,0
+765,1
+458,1
+2073,0
+1935,0
+2235,0
+208,1
+514,0
+436,0
+607,0
+199,0
+379,1
+43,1
+756,0
+57,1
+583,0
+184,1
+885,0
+88,1
+1183,0
+656,0
+1147,1
+52,1
+638,0
+733,0
+1523,0
+438,1
+1,0
+727,1
+1411,0
+1876,0
+750,1
+2079,0
+826,1
+1652,0
+1786,0
+970,1
+1301,1
+520,0
+654,1
+864,0
+368,1
+1400,0
+1403,0
+2150,0
+1255,0
+1367,0
+837,0
+485,0
+1043,1
+1118,1
+1254,1
+490,0
+575,0
+1808,0
+695,0
+712,0
+2119,0
+735,1
+2148,0
+264,0
+1107,1
+901,1
+2124,0
+893,1
+646,0
+611,0
+779,1
+1170,0
+816,1
+1067,1
+31,0
+523,1
+648,1
+315,0
+639,1
+1600,0
+42,0
+1217,1
+71,1
+1841,0
+278,1
+1155,1
+148,0
+1485,0
+1248,0
+544,1
+1200,0
+176,1
+1056,0
+682,1
+1355,0
+629,0
+1053,0
+739,0
+162,0
+713,0
+1350,0
+672,1
+230,1
+2335,0
+1239,1
+2216,0
+1247,1
+119,1
+1809,0
+1166,1
+37,0
+1280,0
+1361,0
+726,0
+26,0
+782,0
+361,1
+209,1
+186,1
+107,1
+841,1
+429,1
+1195,0
+1167,0
+301,0
+1911,0
+340,1
+1304,0
+819,0
+850,0
+1601,0
+625,1
+1101,0
+89,0
+2081,0
+235,1
+1201,0
+75,0
+710,0
+434,1
+334,0
+1737,0
+1065,0
+1284,0
+276,0
+1239,0
+138,1
+1176,0
+392,0
+250,0
+836,0
+260,1
+443,1
+827,0
+725,0
+409,0
+605,1
+1854,0
+1364,0
+78,1
+717,0
+329,1
+105,0
+262,1
+912,1
+173,0
+1752,0
+96,0
+1146,1
+427,1
+350,1
+2307,0
+1481,0
+1058,0
+547,1
+731,1
+1321,1
+943,0
+1624,0
+259,0
+1209,0
+1130,1
+1302,1
+1117,1
+1875,0
+555,0
+1478,0
+422,0
+1145,1
+1740,0
+556,0
+424,0
+1433,0
+2048,0
+2051,0
+421,1
+1282,1
+1966,0
+1058,1
+1233,1
+474,0
+54,1
+169,0
+498,1
+1196,0
+1617,0
+143,0
+1089,0
+847,1
+1566,0
+681,1
+737,1
+383,1
+1293,1
+703,1
+1878,0
+350,0
+1237,0
+247,0
+966,0
+129,1
+459,1
+699,1
+617,1
+573,0
+188,0
+1469,0
+952,0
+99,0
+869,1
+1125,0
+356,1
+1426,0
+1731,0
+810,1
+1090,0
+981,0
+921,1
+289,0
+284,0
+690,0
+1006,1
+955,0
+1114,0
+2084,0
+408,0
+1258,1
+1086,1
+825,0
+492,0
+615,0
+1831,0
+1882,0
+1172,0
+961,0
+581,1
+554,0
+1839,0
+1174,0
+772,1
+1619,0
+2086,0
+754,0
+2140,0
+1176,1
+244,1
+26,1
+729,0
+1138,0
+985,0
+382,1
+1417,0
+1641,0
+1585,0
+960,1
+411,0
+437,0
+1848,0
+351,1
+217,0
+2108,0
+1134,0
+710,1
+695,1
+1283,1
+2149,0
+441,1
+1130,0
+1457,0
+578,1
+655,0
+1281,0
+561,1
+506,1
+1322,0
+527,0
+1562,0
+927,1
+1143,1
+256,1
+1676,0
+500,0
+2181,0
+1739,0
+970,0
+360,0
+76,1
+898,0
+1325,0
+157,0
+741,1
+616,0
+166,0
+925,1
+2060,0
+226,1
+1006,0
+1150,1
+1225,0
+1023,1
+1419,0
+12,1
+1410,0
+904,0
+1498,0
+1741,0
+408,1
+1029,1
+1846,0
+1524,0
+224,0
+225,0
+366,1
+454,1
+333,0
+1020,0
+1041,0
+723,0
+1654,0
+254,0
+49,1
+139,0
+63,1
+415,1
+128,0
+384,1
+1360,0
+1775,0
+543,1
+204,0
+351,0
+1033,1
+1480,0
+2117,0
+1917,0
+920,0
+236,0
+464,1
+998,1
+1210,0
+1504,0
+708,0
+1291,0
+804,0
+1466,0
+786,1
+461,0
+1635,0
+913,0
+1160,0
+750,0
+611,1
+861,0
+1270,1
+2050,0
+186,0
+547,0
+911,0
+859,1
+297,0
+45,0
+1266,1
+1003,0
+158,1
+1246,1
+269,0
+874,0
+1264,1
+2080,0
+934,0
+1112,1
+1067,0
+1822,0
+213,1
+67,0
+1693,0
+1014,0
+1002,1
+439,1
+588,0
+532,1
+784,1
+1354,0
+839,0
+442,1
+413,0
+892,1
+1346,0
+883,1
+319,0
+1587,0
+259,1
+2091,0
+602,0
+791,1
+184,0
+1399,0
+178,0
+707,1
+375,0
+1565,0
+461,1
+99,1
+1184,0
+2270,0
+1327,0
+676,1
+212,0
+437,1
+858,1
+1281,1
+1016,0
+1223,1
+610,1
+49,0
+1313,0
+574,0
+928,0
+1973,0
+665,0
+502,0
+1296,1
+1887,0
+1149,1
+866,0
+65,0
+342,0
+967,0
+412,1
+1061,1
+1803,0
+666,1
+325,1
+1024,0
+596,1
+1816,0
+1814,0
+1020,1
+2357,0
+1287,1
+1881,0
+163,1
+743,1
+2058,0
+1393,0
+668,1
+693,1
+837,1
+1397,0
+294,0
+1658,0
+902,1
+747,0
+884,0
+296,0
+859,0
+1021,0
+481,0
+1634,0
+2011,0
+1252,1
+989,1
+258,1
+916,1
+2168,0
+1246,0
+1392,0
+788,0
+465,0
+288,0
+969,1
+776,1
+319,1
+1837,0
+1362,0
+875,0
+593,1
+2024,0
+1537,0
+323,0
+995,1
+567,1
+1235,1
+531,1
+2008,0
+146,1
+331,0
+215,1
+218,1
+1938,0
+1698,0
+993,1
+1202,1
+1309,1
+266,1
+390,0
+1697,0
+1153,0
+863,1
+716,0
+403,0
+125,0
+1845,0
+1152,1
+1212,1
+446,1
+2037,0
+1194,1
+1236,0
+1004,1
+2298,0
+797,0
+1486,0
+1353,0
+1359,0
+1335,0
+667,1
+1182,1
+1329,0
+1068,0
+594,0
+1109,0
+87,1
+1492,0
+873,1
+507,0
+650,0
+286,1
+1104,0
+39,0
+858,0
+1663,0
+298,1
+777,1
+703,0
+769,1
+318,0
+1245,1
+1412,0
+590,0
+618,1
+576,1
+428,0
+1190,1
+2206,0
+1369,0
+340,0
+1483,0
+61,1
+1325,1
+572,0
+1187,1
+2056,0
+1968,0
+1225,1
+851,0
+1825,0
+591,0
+810,0
+845,1
+1099,0
+744,0
+380,0
+2157,0
+1507,0
+1173,1
+670,0
+154,1
+212,1
+1931,0
+1732,0
+7,0
+990,0
+1028,1
+1087,0
+84,1
+1793,0
+1129,1
+1126,1
+220,1
+733,1
+384,0
+1429,0
+122,1
+394,1
+2103,0
+1771,0
+174,0
+33,0
+1200,1
+310,1
+1036,1
+688,0
+2133,0
+902,0
+779,0
+1879,0
+1332,0
+17,0
+1170,1
+1295,0
+159,0
+900,1
+1883,0
+434,0
+878,1
+2035,0
+2377,0
+1643,0
+2315,0
+686,0
+448,1
+62,0
+102,1
+68,0
+2277,0
+0,0
+238,0
+1188,0
+431,0
+194,1
+202,0
+44,1
+421,0
+466,1
+1493,0
+546,0
+1754,0
+1627,0
+64,0
+482,0
+253,0
+2123,0
+1470,0
+720,0
+2219,0
+649,0
+1180,1
+305,1
+516,0
+132,0
+864,1
+140,1
+1774,0
+2159,0
+165,1
+13,0
+1196,1
+451,1
+980,0
+954,0
+1778,0
+480,0
+1012,0
+280,0
+205,1
+366,0
+529,0
+1029,0
+560,0
+1045,0
+1038,1
+1218,0
+1284,1
+292,0
+1749,0
+1811,0
+914,0
+2064,0
+232,0
+1085,1
+480,1
+1135,0
+656,1
+307,0
+1536,0
+728,0
+711,0
+897,1
+513,1
+1229,0
+1111,0
+291,0
+1431,0
+118,0
+172,0
+1161,0
+497,1
+2271,0
+257,0
+1258,0
+998,0
+1097,0
+1206,1
+1550,0
+445,1
+509,1
+287,0
+79,1
+833,0
+360,1
+101,1
+23,1
+1370,0
+143,1
+1027,1
+871,0
+1785,0
+55,0
+944,1
+880,1
+1158,1
+942,0
+2178,0
+1320,1
+515,0
+1088,1
+1371,0
+957,1
+1191,1
+1407,0
+835,0
+155,0
+1049,0
+987,1
+696,0
+2313,0
+468,0
+382,0
+821,1
+455,1
+1124,1
+1250,0
+949,1
+798,1
+957,0
+1041,1
+1706,0
+267,1
+1444,0
+1118,0
+1408,0
+77,0
+1757,0
+2132,0
+495,0
+1510,0
+853,1
+1222,1
+586,0
+614,0
+1742,0
+734,1
+2142,0
+245,0
+877,1
+888,0
+1257,0
+700,0
+249,0
+2370,0
+1497,0
+1240,1
+1015,0
+127,1
+848,1
+1167,1
+1206,0
+1864,0
+941,0
+2141,0
+2283,0

dataset/task1_data.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:046121ecaa796a0d453ce75820b5b6d53d468a03b7352074029504c9f96e3c32
+size 4611306

src/deployment_utils.py ADDED Viewed

	@@ -0,0 +1,607 @@

+# global
+import os
+import re
+from typing import Tuple, List
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from tensorflow import keras
+from keras.utils import pad_sequences
+from keras.preprocessing.text import Tokenizer
+from gensim.models.doc2vec import Doc2Vec
+import transformers
+from transformers import pipeline, BertTokenizer
+import fasttext
+# local
+from preprocessing import Preprocessor
+from utils import read_data
+# read data
+X_train, X_test, y_train, y_test = read_data()
+# instantiate preprocessor object
+preprocessor = Preprocessor()
+# load models
+doc2vec_model_embeddings = Doc2Vec.load(
+    "F:/Graduation Project/Project/models/best_doc2vec_embeddings")
+doc2vec_model = keras.models.load_model(
+    "F:/Graduation Project/Project/models/best_doc2vec_model.h5")
+tfidf_model = keras.models.load_model(
+    "F:/Graduation Project/Project/models/best_tfidf_model.h5")
+cnn_model = keras.models.load_model(
+    "F:/Graduation Project/Project/models/best_cnn_model.h5")
+glove_model = keras.models.load_model(
+    "F:/Graduation Project/Project/models/best_glove_model.h5")
+lstm_model = keras.models.load_model(
+    "F:/Graduation Project/Project/models/best_lstm_model.h5")
+bert_model = keras.models.load_model(
+    "F:/Graduation Project/Project/models/best_bert_model.h5", custom_objects={"TFBertModel": transformers.TFBertModel})
+fasttext_model = fasttext.load_model(
+    "F:/Graduation Project/Project/models/best_fasttext_model.bin")
+summarization_model = pipeline(
+    "summarization", model="facebook/bart-large-cnn")
+# TODO: Add Docstrings
+def extract_case_information(case_content: str):
+    content_list = case_content.split("\n")
+    petitioner = re.findall(r"petitioner:(.+)", content_list[0])[0]
+    respondent = re.findall(r"respondent:(.+)", content_list[1])[0]
+    facts = re.findall(r"facts:(.+)", content_list[2])[0]
+    return petitioner, respondent, facts
+def generate_random_sample() -> Tuple[str, str, str, int]:
+    """
+    Randomly fetch a random case from `X_test` to test it.
+    Returns:
+    --------
+    A tuple contains the following:
+        - petitioner : str
+            Contains petitioner name.
+        - respondent : str
+            Contains respondent name.
+        - facts : str
+            Contains case facts.
+        - label : int
+            Represents the winning index(0 = petitioner, 1 = respondent).
+    """
+    random_idx = np.random.randint(low=0, high=len(X_test))
+    petitioner = X_test["first_party"].iloc[random_idx]
+    respondent = X_test["second_party"].iloc[random_idx]
+    facts = X_test["Facts"].iloc[random_idx]
+    label = y_test.iloc[random_idx][0]
+    return petitioner, respondent, facts, label
+def generate_highlighted_words(facts: str, petitioner_words: List[str], respondent_words: List[str]):
+    """
+    Highlight `petitioner_words` and `respondent_words` for model
+    interpretation.
+    Parameters:
+    -----------
+        - facts : str
+            Facts of a specific case.
+        - petitioner_words : List[str]
+            Contains all words that model pays attention
+            to be a petetioner words.
+        - respondent_words : List[str]
+            Contains all words that model pays attention
+            to be a respondent words.
+    Returns:
+    --------
+        - rendered_text : str
+            Contains the `facts` but with adding
+            highlighting mechanism to visualize it using CSS in HTML format.
+    Example:
+    --------
+        >>> facts_ = 'Mohammed shot Aly after a hot negotiation happened  between
+        ... them about the profits of their company'
+        >>> petitioner_words_ = ['shot', 'hot']
+        >>> respondent_words_ = ['profits']
+        >>> generate_highlighted_words(facts, petitioner_words_, respondent_words_)
+        >>> output:
+        <div class='text-facts'> Mohammed <span class='highlight-petitioner'>shot</span>
+        Aly after a <span class='highlight-petitioner'>hot</span> negotiation happened
+        between them about <span class='highlight-respondent'>profits</span> of their
+        company </div>
+    """
+    rendered_text = '<div class="text-facts"> '
+    for word in facts.split():
+        if word in petitioner_words:
+            highlight_word = ' <span class="highlight-petitioner"> ' + word + " </span> "
+            rendered_text += highlight_word
+        elif word in respondent_words:
+            highlight_word = ' <span class="highlight-respondent"> ' + word + " </span> "
+            rendered_text += highlight_word
+        else:
+            rendered_text += " " + word
+    rendered_text += " </div>"
+    return rendered_text
+class VectorizerGenerator:
+    """Responsible for creation and generation of tokenizers and text
+    vectorizers for JudgerAIs' models.
+    Apart from the BERT tokenizer, every generator refits on the module-level
+    training data (`X_train`, `y_train`) through the module-level
+    `preprocessor` each time it is called, so callers that predict repeatedly
+    should keep the returned object instead of regenerating it.
+    """
+    def __init__(self) -> None:
+        """The generator keeps no state of its own."""
+    def _balanced_training_facts(self):
+        """Return the "Facts" column of the class-balanced training data."""
+        balanced_df = preprocessor.balance_data(X_train["Facts"], y_train)
+        return balanced_df["Facts"]
+    def generate_tf_idf_vectorizer(self) -> keras.layers.TextVectorization:
+        """
+        Generating best text vectorizer of the tf-idf model (3rd combination).
+        The training facts are anonymized with their party names before the
+        vectorizer is fitted on them.
+        Returns:
+        -------
+        - text_vectorizer : keras.layers.TextVectorization
+            Represents the case facts' vectorizer that converts case facts to
+            numerical tensors.
+        """
+        first_party_names = X_train["first_party"]
+        second_party_names = X_train["second_party"]
+        facts = X_train["Facts"]
+        anonymized_facts = preprocessor.anonymize_data(
+            first_party_names, second_party_names, facts)
+        # the training vectors returned alongside the vectorizer are not needed
+        text_vectorizer, _ = preprocessor.convert_text_to_vectors_tf_idf(
+            anonymized_facts)
+        return text_vectorizer
+    def generate_cnn_vectorizer(self) -> keras.layers.TextVectorization:
+        """
+        Generating best text vectorizer of the cnn model (2nd combination).
+        The vectorizer is fitted on the class-balanced training facts.
+        Returns:
+        -------
+        - text_vectorizer : keras.layers.TextVectorization
+            Represents the case facts' vectorizer that converts case facts to
+            numerical tensors.
+        """
+        text_vectorizer, _ = preprocessor.convert_text_to_vectors_cnn(
+            self._balanced_training_facts())
+        return text_vectorizer
+    def generate_glove_tokenizer(self) -> keras.preprocessing.text.Tokenizer:
+        """
+        Generating best glove tokenizer of the GloVe model (2nd combination).
+        The tokenizer is fitted on the class-balanced training facts.
+        Returns:
+        -------
+        - glove_tokenizer : keras.preprocessing.text.Tokenizer
+            Represents the case facts' tokenizer that converts case facts to
+            numerical tensors.
+        """
+        glove_tokenizer, _ = preprocessor.convert_text_to_vectors_glove(
+            self._balanced_training_facts())
+        return glove_tokenizer
+    def generate_lstm_tokenizer(self) -> keras.preprocessing.text.Tokenizer:
+        """
+        Generating best text tokenizer of the LSTM model (1st combination).
+        Returns:
+        -------
+        - lstm_tokenizer : keras.preprocessing.text.Tokenizer
+            Represents the case facts' tokenizer that converts case facts to
+            numerical tensors.
+        """
+        lstm_tokenizer = Tokenizer(num_words=18430)
+        # `fit_on_texts` iterates its argument; iterating the `X_train`
+        # DataFrame yields its column labels, so the tokenizer has to be
+        # fitted on the facts column itself.
+        # NOTE(review): assumes the LSTM model was trained on the raw "Facts"
+        # text -- confirm against the training notebook.
+        lstm_tokenizer.fit_on_texts(X_train["Facts"])
+        return lstm_tokenizer
+    def generate_bert_tokenizer(self) -> transformers.BertTokenizer:
+        """
+        Generating best bert tokenizer of the BERT model (1st combination).
+        Returns:
+        -------
+        - bert_tokenizer : transformers.BertTokenizer
+            Represents the case facts' tokenizer that converts case facts to
+            input ids tensors.
+        """
+        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        return bert_tokenizer
+class DataPreparator:
+    """Responsible for preparing the case facts aka converting case facts to
+    numerical vectors using `VectorizerGenerator` object.
+    Fitted vectorizers/tokenizers are built lazily and cached per instance, so
+    the training data is only processed once per model instead of once per
+    prediction."""
+    def __init__(self) -> None:
+        self.vectorizer_generator = VectorizerGenerator()
+        # fitted vectorizers/tokenizers keyed by model name, filled on first use
+        self._fitted_cache = {}
+    def _get_fitted(self, key, factory):
+        """Return the cached fitted object for `key`, calling `factory` on first use."""
+        # `setdefault` keeps this working for instances created without `__init__`
+        cache = self.__dict__.setdefault("_fitted_cache", {})
+        if key not in cache:
+            cache[key] = factory()
+        return cache[key]
+    def prepare_doc2vec(self, facts: str) -> pd.DataFrame:
+        """
+        Responsible for converting `facts` string to numerical vector
+        using `doc2vec_model_embeddings`.
+        Parameters:
+        ----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        -------
+        - facts_vector : pd.DataFrame
+            A one-row DataFrame holding the vector inferred for `facts`.
+        """
+        facts = pd.Series(facts)
+        facts_processed = preprocessor.preprocess_data(facts)
+        facts_vectors = preprocessor.convert_text_to_vectors_doc2vec(
+            facts_processed, train=False, embeddings_doc2vec=doc2vec_model_embeddings)
+        return facts_vectors
+    def _anonymize_facts(self, first_party_name: str, second_party_name: str, facts: str) -> str:
+        """
+        Anonymize case `facts` by replacing `first_party_name` & `second_party_name` with
+        generic tag "_PARTY_".
+        Parameters:
+        -----------
+        - first_party_name : str
+            Represents the petitioner name.
+        - second_party_name : str
+            Represents the respondent name.
+        - facts : str
+            Represents the case facts.
+        Returns:
+        -------
+        - anonymized_facts : str
+            Represents `facts` after anonymization.
+        """
+        anonymized_facts = preprocessor._anonymize_case_facts(
+            first_party_name, second_party_name, facts)
+        return anonymized_facts
+    def prepare_tf_idf(self, anonymized_facts: str) -> tf.Tensor:
+        """
+        Responsible for converting `facts` string to numerical vector
+        using tf-idf `vectorizer_generator` in the 3rd combination.
+        Parameters:
+        -----------
+        - anonymized_facts : str
+            Represents the case facts after anonymization.
+        Returns:
+        -------
+        - facts_vector : tf.Tensor
+            A Tensor of 10000-d represents `facts`.
+        """
+        anonymized_facts = pd.Series(anonymized_facts)
+        tf_idf_vectorizer = self._get_fitted(
+            "tf_idf", self.vectorizer_generator.generate_tf_idf_vectorizer)
+        facts_vector = preprocessor.convert_text_to_vectors_tf_idf(
+            anonymized_facts, train=False, text_vectorizer=tf_idf_vectorizer)
+        return facts_vector
+    def prepare_cnn(self, facts: str) -> tf.Tensor:
+        """
+        Responsible for converting `facts` string to numerical vector
+        using cnn `vectorizer_generator` in the 2nd combination.
+        Parameters:
+        -----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        -------
+        - facts_vector : tf.Tensor
+            A Tensor of token ids representing `facts`; its length is the
+            vectorizer's output sequence length (500 with the preprocessor
+            defaults, over a 2000-token vocabulary).
+        """
+        facts = pd.Series(facts)
+        cnn_vectorizer = self._get_fitted(
+            "cnn", self.vectorizer_generator.generate_cnn_vectorizer)
+        facts_vector = preprocessor.convert_text_to_vectors_cnn(
+            facts, train=False, text_vectorizer=cnn_vectorizer)
+        return facts_vector
+    def prepare_glove(self, facts: str) -> np.ndarray:
+        """
+        Responsible for converting `facts` string to numerical vector
+        using glove `vectorizer_generator` in the 2nd combination.
+        Parameters:
+        -----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        -------
+        - facts_vector : np.ndarray
+            A np.ndarray represents `facts`.
+        """
+        facts = pd.Series(facts)
+        glove_tokenizer = self._get_fitted(
+            "glove", self.vectorizer_generator.generate_glove_tokenizer)
+        facts_vector = preprocessor.convert_text_to_vectors_glove(
+            facts, train=False, glove_tokenizer=glove_tokenizer)
+        return facts_vector
+    def prepare_lstm(self, facts: str) -> np.ndarray:
+        """
+        Responsible for converting `facts` string to numerical vector
+        using lstm `vectorizer_generator` in the 1st combination.
+        Parameters:
+        -----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        -------
+        - facts_vector_padded : np.ndarray
+            A np.ndarray of 974-d represents `facts`.
+        """
+        facts = pd.Series(facts)
+        lstm_tokenizer = self._get_fitted(
+            "lstm", self.vectorizer_generator.generate_lstm_tokenizer)
+        facts_vector = lstm_tokenizer.texts_to_sequences(facts)
+        # pad/truncate every sequence to the fixed length of 974 tokens
+        facts_vector_padded = pad_sequences(facts_vector, 974)
+        return facts_vector_padded
+    def prepare_bert(self, facts: str) -> tf.Tensor:
+        """
+        Responsible for converting `facts` string to numerical vector
+        using bert `vectorizer_generator` in the 1st combination.
+        Parameters:
+        -----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        -------
+        - tf.Tensor
+            A tf.Tensor of 256-d represents `facts` input ids.
+        """
+        bert_tokenizer = self._get_fitted(
+            "bert", self.vectorizer_generator.generate_bert_tokenizer)
+        facts_vector_dict = bert_tokenizer.encode_plus(
+            facts,
+            max_length=256,
+            truncation=True,
+            padding='max_length',
+            add_special_tokens=True,
+            return_tensors='tf'
+        )
+        # only the input ids are returned; the other encoded fields are dropped
+        return facts_vector_dict["input_ids"]
+class Predictor:
+    """Responsible for get predictions of JudgerAIs' models.
+    Each `predict_*` method prepares the facts with `DataPreparator` and feeds
+    them to the matching module-level model."""
+    def __init__(self) -> None:
+        self.data_preparator = DataPreparator()
+    @staticmethod
+    def _to_pet_res_scores(predictions) -> np.ndarray:
+        """
+        Expand single-score predictions to `[1 - score, score]` pairs.
+        Parameters:
+        ----------
+        - predictions : iterable of indexable rows
+            Each row's first element is the score produced by a model.
+        Returns:
+        --------
+        - pet_res_scores : np.ndarray
+            One `[1 - score, score]` row per prediction.
+        """
+        return np.array([np.array([1 - row[0], row[0]]) for row in predictions])
+    def predict_doc2vec(self, facts: str) -> np.ndarray:
+        """
+        Get prediction of `facts` using `doc2vec_model`.
+        Parameters:
+        ----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        --------
+        - pet_res_scores : np.ndarray
+            An array contains 2 elements, one for probability of petitioner winning
+            and the second for the probability of respondent winning.
+        """
+        facts_vector = self.data_preparator.prepare_doc2vec(facts)
+        return self._to_pet_res_scores(doc2vec_model.predict(facts_vector))
+    def predict_tf_idf(self, anonymized_facts: str) -> np.ndarray:
+        """
+        Get prediction of `facts` using `tfidf_model`.
+        Parameters:
+        -----------
+        - anonymized_facts : str
+            Represents the case facts after anonymization.
+        Returns:
+        --------
+        - pet_res_scores : np.ndarray
+            An array contains 2 elements, one for probability of petitioner winning
+            and the second for the probability of respondent winning.
+        """
+        facts_vector = self.data_preparator.prepare_tf_idf(anonymized_facts)
+        return self._to_pet_res_scores(tfidf_model.predict(facts_vector))
+    def predict_cnn(self, facts: str) -> np.ndarray:
+        """
+        Get prediction of `facts` using `cnn_model`.
+        Parameters:
+        ----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        --------
+        - pet_res_scores : np.ndarray
+            An array contains 2 elements, one for probability of petitioner winning
+            and the second for the probability of respondent winning.
+        """
+        facts_vector = self.data_preparator.prepare_cnn(facts)
+        return self._to_pet_res_scores(cnn_model.predict(facts_vector))
+    def predict_glove(self, facts: str) -> np.ndarray:
+        """
+        Get prediction of `facts` using `glove_model`.
+        Parameters:
+        ----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        --------
+        - pet_res_scores : np.ndarray
+            An array contains 2 elements, one for probability of petitioner winning
+            and the second for the probability of respondent winning.
+        """
+        facts_vector = self.data_preparator.prepare_glove(facts)
+        return self._to_pet_res_scores(glove_model.predict(facts_vector))
+    def predict_lstm(self, facts: str) -> np.ndarray:
+        """
+        Get prediction of `facts` using `lstm_model`.
+        Parameters:
+        ----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        --------
+        - pet_res_scores : np.ndarray
+            An array contains 2 elements, one for probability of petitioner winning
+            and the second for the probability of respondent winning.
+        """
+        facts_vector = self.data_preparator.prepare_lstm(facts)
+        return self._to_pet_res_scores(lstm_model.predict(facts_vector))
+    def predict_bert(self, facts: str) -> np.ndarray:
+        """
+        Get prediction of `facts` using `bert_model`.
+        Unlike the other models, the raw model output is returned unchanged.
+        Parameters:
+        ----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        --------
+        - predictions : np.ndarray
+            An array contains 2 elements, one for probability of petitioner winning
+            and the second for the probability of respondent winning.
+        """
+        facts_vector = self.data_preparator.prepare_bert(facts)
+        predictions = bert_model.predict(facts_vector)
+        return predictions
+    def predict_fasttext(self, facts: str) -> np.ndarray:
+        """
+        Get prediction of `facts` using `fasttext`.
+        Parameters:
+        ----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        --------
+        - pet_res_scores : np.ndarray
+            An array contains 2 elements, one for probability of petitioner winning
+            and the second for the probability of respondent winning.
+        """
+        # element [1] of the fastText result holds the scores; the labels in
+        # element [0] are ignored.
+        # NOTE(review): the score is therefore always treated as the second
+        # (respondent) probability, whichever label fastText predicted --
+        # confirm this is intended.
+        prediction = fasttext_model.predict(facts)[1]
+        # wrap the scores so they form a single prediction row
+        return self._to_pet_res_scores(np.array([prediction]))
+    def summarize_facts(self, facts: str) -> str:
+        """
+        Summarize `facts` using `summarization_model`.
+        Parameters:
+        ----------
+        - facts : str
+            Represents the case facts.
+        Returns:
+        --------
+        - summarized_case_facts : str
+            The 'summary_text' of the first result returned by the model.
+        """
+        summarized_case_facts = summarization_model(facts)[0]['summary_text']
+        return summarized_case_facts

src/plotting.py ADDED Viewed

	@@ -0,0 +1,230 @@

+from typing import List
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sn
+from sklearn.metrics import auc
+from sklearn.metrics import roc_curve
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+from tensorflow import keras
+class PlottingManager:
+    """Responsible for providing plots & visualization for the models.
+    Per-fold figures use a two-column grid: it stays 2x2 for up to four folds
+    and gains one row for every two additional folds."""
+    def __init__(self) -> None:
+        """Define style for visualizations."""
+        # matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*"
+        # and later releases dropped the old names, so prefer the new name
+        # whenever this matplotlib provides it.
+        style_name = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
+        plt.style.use(style_name)
+    @staticmethod
+    def _grid_rows(n_plots: int) -> int:
+        """Rows needed to place `n_plots` subplots on a two-column grid (never fewer than 2)."""
+        return max(2, (n_plots + 1) // 2)
+    def plot_subplots_curve(
+        self,
+        training_measure: List[List[float]],
+        validation_measure: List[List[float]],
+        title: str,
+        train_color: str = "orangered",
+        validation_color: str = "dodgerblue",
+    ) -> None:
+        """
+        Plotting subplots of the elements of `training_measure` vs. `validation_measure`.
+        Parameters:
+        ------------
+        - training_measure : List[List[float]]
+            A `k` by `num_epochs` list contains the trained measure whether it's loss or
+            accuracy for each fold.
+        - validation_measure : List[List[float]]
+            A `k` by `num_epochs` list contains the validation measure whether it's loss
+            or accuracy for each fold.
+        - title : str
+            Represents the title of the plot.
+        - train_color : str, optional
+            Represents the graph color for the `training_measure`. (Default is "orangered").
+        - validation_color : str, optional
+            Represents the graph color for the `validation_measure`. (Default is "dodgerblue").
+        """
+        n_rows = self._grid_rows(len(training_measure))
+        # 4 inches per row keeps the original 12x8 figure for the 2x2 case
+        plt.figure(figsize=(12, 4 * n_rows))
+        for i, fold_training in enumerate(training_measure):
+            plt.subplot(n_rows, 2, i + 1)
+            plt.plot(fold_training, c=train_color)
+            plt.plot(validation_measure[i], c=validation_color)
+            plt.title("Fold " + str(i + 1))
+        plt.suptitle(title)
+        plt.show()
+    def plot_heatmap(
+        self, measure: List[List[float]], title: str, cmap: str = "coolwarm"
+    ) -> None:
+        """
+        Plotting a heatmap of the values in `measure`.
+        Parameters:
+        ------------
+        - measure : List[List[float]]
+            A `k` by `num_epochs` list contains the measure whether it's loss
+            or accuracy for each fold.
+        - title : str
+            Title of the plot.
+        - cmap : str, optional
+            Color map of the plot (default is "coolwarm").
+        """
+        # transpose the array to make it `num_epochs` by `k`
+        values_array = np.array(measure).T
+        num_epochs, num_folds = values_array.shape
+        df_cm = pd.DataFrame(
+            values_array,
+            range(1, num_epochs + 1),
+            # one column label per fold actually present in `measure`
+            ["fold " + str(i + 1) for i in range(num_folds)],
+        )
+        plt.figure(figsize=(10, 8))
+        plt.title(
+            title + " Throughout " + str(num_folds) + " Folds", pad=20
+        )
+        sn.heatmap(df_cm, annot=True, cmap=cmap, annot_kws={"size": 10})
+        plt.show()
+    def plot_average_curves(
+        self,
+        title: str,
+        x: List[float],
+        y: List[float],
+        x_label: str,
+        y_label: str,
+        train_color: str = "orangered",
+        validation_color: str = "dodgerblue",
+    ) -> None:
+        """
+        Plotting the curves of `x` and `y` on the same axes, where x and y are
+        training and validation measures (loss or accuracy).
+        Parameters:
+        ------------
+        - title : str
+            Title of the plot.
+        - x : List[float]
+            Training measure of the models (loss or accuracy).
+        - y : List[float]
+            Validation measure of the models (loss or accuracy).
+        - x_label : str
+            Label of the training measure to put it in plot legend.
+        - y_label : str
+            Label of the validation measure to put it in plot legend.
+        - train_color : str, optional
+            Color of the training plot (default is "orangered").
+        - validation_color : str, optional
+            Color of the validation plot (default is "dodgerblue").
+        """
+        plt.title(title, pad=20)
+        plt.plot(x, c=train_color, label=x_label)
+        plt.plot(y, c=validation_color, label=y_label)
+        plt.legend()
+        plt.show()
+    def plot_roc_curve(
+        self,
+        all_models: List[keras.models.Sequential],
+        X_test: pd.DataFrame,
+        y_test: pd.Series,
+    ) -> None:
+        """
+        Plotting the AUC-ROC curve of all the passed models in `all_models`.
+        Parameters:
+        ------------
+        - all_models : List[keras.models.Sequential]
+            Contains all trained models, number of models equals number of
+             `k` fold cross-validation.
+        - X_test : pd.DataFrame
+            Contains the testing vectors.
+        - y_test : pd.Series
+            Contains the testing labels.
+        """
+        n_rows = self._grid_rows(len(all_models))
+        plt.figure(figsize=(12, 4 * n_rows))
+        for i, model in enumerate(all_models):
+            y_pred = model.predict(X_test).ravel()
+            fpr, tpr, _ = roc_curve(y_test, y_pred)
+            auc_curve = auc(fpr, tpr)
+            plt.subplot(n_rows, 2, i + 1)
+            # dashed diagonal: the ROC curve of a random classifier
+            plt.plot([0, 1], [0, 1], color="dodgerblue", linestyle="--")
+            plt.plot(
+                fpr,
+                tpr,
+                color="orangered",
+                label=f"Fold {str(i+1)} (area = {auc_curve:.3f})",
+            )
+            plt.legend(loc="best")
+            plt.title(f"Fold {str(i+1)}")
+        plt.suptitle("AUC-ROC curves")
+        plt.show()
+    def plot_classification_report(
+        self, model: keras.models.Sequential, X_test: pd.DataFrame, y_test: pd.Series
+    ) -> str | dict:
+        """
+        Building the classification report of the passed `model`.
+        Predictions above 0.5 are counted as class 1, the rest as class 0.
+        Parameters:
+        ------------
+        - model : keras.models.Sequential
+            The trained model that will be evaluated.
+        - X_test : pd.DataFrame
+            Contains the testing vectors.
+        - y_test : pd.Series
+            Contains the testing labels.
+        Returns:
+        --------
+        - cls_report : str
+            The text classification report for the given model and testing
+            data (the report is always requested in its default string form).
+        """
+        y_pred = model.predict(X_test).ravel()
+        preds = np.where(y_pred > 0.5, 1, 0)
+        cls_report = classification_report(y_test, preds)
+        return cls_report
+    def plot_confusion_matrix(
+        self,
+        all_models: List[keras.models.Sequential],
+        X_test: pd.DataFrame,
+        y_test: pd.Series,
+    ) -> None:
+        """
+        Plotting the confusion matrix of each model in `all_models`.
+        Predictions above 0.5 are counted as class 1, the rest as class 0.
+        Parameters:
+        ------------
+        - all_models: list[keras.models.Sequential]
+            Contains all trained models, number of models equals
+            number of `k` fold cross-validation.
+        - X_test: pd.DataFrame
+            Contains the testing vectors.
+        - y_test: pd.Series
+            Contains the testing labels.
+        """
+        n_rows = self._grid_rows(len(all_models))
+        _, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(12, 4 * n_rows))
+        for i, (model, ax) in enumerate(zip(all_models, axes.flatten())):
+            y_pred = model.predict(X_test).ravel()
+            preds = np.where(y_pred > 0.5, 1, 0)
+            conf_matrix = confusion_matrix(y_test, preds)
+            sn.heatmap(conf_matrix, annot=True, ax=ax)
+            ax.set_title(f"Fold {i+1}")
+        plt.suptitle("Confusion Matrices")
+        plt.tight_layout()
+        plt.show()

src/preprocessing.py ADDED Viewed

	@@ -0,0 +1,591 @@

+# global
+import string
+from typing import List, Tuple
+import numpy as np
+import pandas as pd
+import re
+import nltk
+from sklearn.utils import resample
+from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+from nltk.tokenize import RegexpTokenizer
+import tensorflow as tf
+from keras.layers import TextVectorization
+from keras.preprocessing.text import Tokenizer
+from keras.utils import pad_sequences
+# local
+from utils import Doc2VecModel
+# punctuation characters stripped by `Preprocessor._preprocess_text` ('_' is kept there)
+punct = string.punctuation
+# stemmer shared by every `Preprocessor._preprocess_text` call
+stemmer = nltk.stem.PorterStemmer()
+# English stop-word list filtered out in `Preprocessor._preprocess_text`
+# NOTE(review): presumably needs the nltk "stopwords" corpus to be downloaded -- confirm
+eng_stopwords = nltk.corpus.stopwords.words("english")
+class Preprocessor:
+    """Responsible for preprocessing case facts."""
+    def __init__(self) -> None:
+        """The preprocessor keeps no state of its own."""
+    def _nltk_tokenizer(self, text: str) -> List[str]:
+        """
+        Split `text` into tokens with nltk's RegexpTokenizer, keeping every
+        run of word characters as one token.
+        Parameters:
+        -----------
+        - text : str
+            The text to be tokenized.
+        Returns:
+        --------
+        - tokens : List[str]
+            The tokens found in `text`, in order.
+        """
+        return RegexpTokenizer(r"\w+").tokenize(text)
+    def _tokenize_text(self, text_column: pd.Series) -> pd.Series:
+        """Tokenize every entry of `text_column` with `_nltk_tokenizer`.
+        Parameters:
+        ------------
+        - text_column : pd.Series
+            Holds the texts to be tokenized.
+        Returns:
+        --------
+        - tokenized_text : pd.Series
+            Holds the token list of each entry of `text_column`.
+        """
+        return text_column.apply(self._nltk_tokenizer)
+    def _convert_to_tagged_document(
+        self, text_column: pd.Series
+    ) -> Tuple[List[str], List[TaggedDocument]]:
+        """
+        Wrap every entry of `text_column` in a TaggedDocument.
+        Each document is tagged with its position in the column, as a string.
+        Parameters:
+        ------------
+        - text_column : pd.Series
+            Contains the list of tokens of each fact.
+        Returns:
+        --------
+        A tuple containing the following items:
+            - tokens_list : list
+                The entries of `text_column` as a plain list.
+            - tagged_docs : list[TaggedDocument]
+                One TaggedDocument object per case.
+        """
+        tokens_list = text_column.to_list()
+        tagged_docs = []
+        for position, tokens in enumerate(tokens_list):
+            tagged_docs.append(TaggedDocument(tokens, [str(position)]))
+        return tokens_list, tagged_docs
+    def _vectorize_text(
+        self, doc2vec_model: Doc2Vec, df: pd.Series, tokens_list: List[str]
+    ) -> pd.DataFrame:
+        """
+        Infer one vector per entry of `tokens_list` with `doc2vec_model`.
+        Parameters:
+        -----------
+        - doc2vec_model : Doc2Vec
+            Trained Doc2Vec model.
+        - df : pd.Series
+            Only its index is used; it becomes the index of the result.
+        - tokens_list : List[str]
+            Contains all tokens of each case.
+        Returns:
+        --------
+        - text_vectors_df : pd.DataFrame
+            Contains the vector representation of each case, one per row.
+        """
+        inferred_vectors = list(map(doc2vec_model.infer_vector, tokens_list))
+        return pd.DataFrame(inferred_vectors, index=df.index)
+    def _anonymize_case_facts(
+        self, first_party_name: str, second_party_name: str, facts: str
+    ) -> str:
+        """
+        Anonymize case facts by replacing its party names with "_PARTY_" tag.
+        Every word of both party names is replaced wherever it occurs in
+        `facts`; runs of tags are then merged and whitespace is collapsed.
+        Parameters:
+        ------------
+        - first_party_name : str
+            Represents first party name or petitioner name.
+        - second_party_name : str
+            Represents second party name or respondent name.
+        - facts : str
+            Represents case facts.
+        Returns:
+        --------
+        - anonymized_facts : str
+            An anonymized version of `facts`.
+        """
+        # strip commas and every non-alphabetic character from both names
+        cleaned_names = []
+        for raw_name in (first_party_name, second_party_name):
+            without_commas = re.sub(r"[\,+]", " ", raw_name)
+            cleaned_names.append(re.sub(r"[^a-zA-Z]", " ", without_commas))
+        # tag each remaining name word, petitioner words first
+        for cleaned_name in cleaned_names:
+            for name in cleaned_name.split():
+                facts = re.sub(name, " _PARTY_ ", facts)
+        # merge consecutive _PARTY_ tags into a single tag
+        merged_tags = re.sub(r"(_PARTY_\s+){2,}", " _PARTY_ ", facts)
+        # collapse any run of whitespace into one space
+        return re.sub(r"\s+", " ", merged_tags)
+    def _preprocess_text(self, text: str) -> str:
+        """
+        Clean `text` through the following steps, in order:
+        - lowercasing
+        - removing quotation marks
+        - removing digits
+        - removing punctuation (the '_' character is kept)
+        - removing brackets, braces, and parentheses
+        - removing stopwords
+        - stemming tokens
+        Parameters:
+        ------------
+        - text : str
+            Text that needs to be processed (cleaned).
+        Returns:
+        --------
+        - processed_text : str
+            The stemmed tokens of `text` joined with single spaces.
+        """
+        lowered = text.lower()
+        without_quotes = re.sub(r"\'", "", lowered)
+        without_digits = re.sub(r"\d+", "", without_quotes)
+        # '_' is kept so tags such as _PARTY_ survive the punctuation filter
+        without_punct = "".join(
+            ch for ch in without_digits if ch == "_" or ch not in punct
+        )
+        without_brackets = re.sub(r"[\[\]\(\)\{\}]+", " ", without_punct)
+        stems = []
+        for token in nltk.word_tokenize(without_brackets):
+            if token in eng_stopwords:
+                continue
+            stems.append(stemmer.stem(token))
+        return " ".join(stems)
+    def convert_text_to_vectors_doc2vec(
+        self,
+        text_column: pd.Series,
+        train: bool = True,
+        embeddings_doc2vec: Doc2Vec = None,
+    ) -> Tuple[Doc2Vec, pd.DataFrame] | pd.DataFrame:
+        """
+        Convert `text_column` to vectors with a `Doc2Vec` model.
+        Parameters:
+        ------------
+        - text_column : pd.Series
+            Contains the case facts.
+        - train : bool, optional
+            If True a new Doc2Vec model is trained on `text_column`; otherwise
+            the passed `embeddings_doc2vec` is used. (Default is True).
+        - embeddings_doc2vec : Doc2Vec, optional
+            Trained Doc2Vec model used to infer the vectors of `text_column`
+            when `train` is False. (Default is None).
+        Returns:
+        --------
+        1. If `train` is True, a tuple containing:
+            - embeddings_doc2vec : Doc2Vec
+                The newly trained Doc2Vec model.
+            - text_vectors_df : pd.DataFrame
+                A DataFrame containing the `text_column` vectors.
+        2. If `train` is False:
+            - text_vectors_df : pd.DataFrame
+                A DataFrame containing the `text_column` vectors.
+        Raises:
+        -------
+        - AssertionError
+            If train is False and `embeddings_doc2vec` is None.
+        - AssertionError
+            If train is False and `embeddings_doc2vec` is not an instance of Doc2Vec.
+        """
+        tokens_list, tagged_docs = self._convert_to_tagged_document(
+            self._tokenize_text(text_column)
+        )
+        if not train:
+            # inference only: a ready Doc2Vec model must have been passed in
+            assert (
+                embeddings_doc2vec is not None
+            ), "`embedding_doc2vec` argument must be not None."
+            assert isinstance(
+                embeddings_doc2vec, Doc2Vec
+            ), "`embedding_doc2vec` argument must be an instance of Doc2Vec to infer vectors."
+            return self._vectorize_text(embeddings_doc2vec, text_column, tokens_list)
+        trained_embeddings = Doc2VecModel().train_doc2vec_embeddings_model(
+            tagged_docs
+        )
+        # `text_column` is passed only so the result reuses its index
+        training_vectors = self._vectorize_text(
+            trained_embeddings, text_column, tokens_list
+        )
+        return trained_embeddings, training_vectors
+    def convert_text_to_vectors_tf_idf(
+        self,
+        text_column: pd.Series,
+        ngrams: int = 2,
+        max_tokens: int = 10000,
+        output_mode: str = "tf-idf",
+        train: bool = True,
+        text_vectorizer: TextVectorization = None,
+    ) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
+        """
+        Convert `text_column` to vectors with a `TextVectorization` layer.
+        Parameters:
+        ------------
+        - text_column : pd.Series
+            Contains the case facts.
+        - ngrams : int, optional
+            Passed as `ngrams` to the new layer (Default is 2).
+        - max_tokens : int, optional
+            Passed as `max_tokens` to the new layer (Default is 10,000).
+        - output_mode : str, optional
+            Represents the output vectors type whether it is "tf-idf" or "binary" or "count"
+            (Default is "tf-idf").
+        - train : bool, optional
+            If True a new TextVectorization layer is created and adapted to
+            `text_column`; otherwise the passed `text_vectorizer` is used, and
+            `ngrams`, `max_tokens` and `output_mode` are ignored.
+            (Default is True).
+        - text_vectorizer : TextVectorization, optional
+            Trained TextVectorization layer used to vectorize `text_column`
+            when `train` is False. (Default is None).
+        Returns:
+        --------
+        - if `train` == True:
+            A tuple contains the following:
+                - text_vectorizer : TextVectorization
+                    The newly adapted TextVectorization layer.
+                - text_vectors : tf.Tensor
+                    A Tensor contains `text_column` training vectors.
+        - otherwise:
+            text_vectors : tf.Tensor
+                A Tensor contains `text_column` testing vectors.
+        Raises:
+        -------
+        - AssertionError
+            If train is False and `text_vectorizer` is None.
+        - AssertionError
+            If train is False and `text_vectorizer` is not an instance of TextVectorization.
+        """
+        if not train:
+            # inference only: an adapted layer must have been passed in
+            assert (
+                text_vectorizer is not None
+            ), "`text_vectorizer` argument must be not None."
+            assert isinstance(
+                text_vectorizer, TextVectorization
+            ), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
+            return text_vectorizer(text_column)
+        fitted_vectorizer = TextVectorization(
+            ngrams=ngrams, max_tokens=max_tokens, output_mode=output_mode
+        )
+        fitted_vectorizer.adapt(text_column)
+        return fitted_vectorizer, fitted_vectorizer(text_column)
+    def convert_text_to_vectors_cnn(
+        self,
+        text_column: pd.Series,
+        max_tokens: int = 2000,
+        output_sequence_length: int = 500,
+        output_mode: str = "int",
+        train: bool = True,
+        text_vectorizer: TextVectorization = None,
+    ) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
+        """
+        Converting `text_column` to vectors using `TextVectorization` layer.
+        Parameters:
+        ------------
+        - text_column : pd.Series
+            Contains the case facts.
+        - max_tokens : int, optional
+            Defines the number of max_tokens of `text_vectorizer` (Default is 2000).
+        - output_sequence_length : int, optional
+            Represents the dimensions of the output vector (Default is 500).
+        - output_mode : str, optional
+            Represents the output vectors type whether it is "int" or "binary" or "tfi-df".
+        - train : bool, optional
+            Defines whether the model will be trained or not. (if True,
+            TextVectorization will be trained | else, TextVectorization will used the
+            passed `text_vectorizer`). (Default is True).
+        - text_vectorizer : TextVectorization, optional
+            Trained TextVectorization layer will be used for generating embeddings of
+             `text_column` if `train` is False. (Default is None).
+        Returns:
+        --------
+        - if `train` == True:
+            A tuple contains the following:
+                - text_vectorizer : TextVectorization
+                    Trained TextVectorization layer.
+                - text_vectors : tf.Tensor
+                    A Tensor contains `text_column` training vectors.
+        - otherwise:
+            text_vectors : tf.Tensor
+                A Tensor contains `text_column` testing vectors.
+        Raises:
+        -------
+        - AssertionError
+            If train is False and `text_vectorizer` is None.
+        - AssertionError
+            If train is False and `text_vectorizer` is not an instance of TextVectorization.
+        """
+        if train:
+            text_vectorizer = TextVectorization(
+                max_tokens=max_tokens,
+                output_mode=output_mode,
+                output_sequence_length=output_sequence_length,
+            )
+            text_vectorizer.adapt(text_column)
+            text_vectors = text_vectorizer(text_column)
+            return text_vectorizer, text_vectors
+        assert (
+            text_vectorizer is not None
+        ), "`text_vectorizer` argument must be not None."
+        assert isinstance(
+            text_vectorizer, TextVectorization
+        ), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
+        text_vectors = text_vectorizer(text_column)
+        return text_vectors
+    def convert_text_to_vectors_glove(
+        self,
+        text_column: pd.Series,
+        train: bool = True,
+        glove_tokenizer: Tokenizer = None,
+        vocab_size: int = 1000,
+        oov_token: str = "<OOV>",
+        max_length: int = 50,
+        padding_type: str = "post",
+        truncation_type: str = "post",
+    ) -> Tuple[Tokenizer, np.ndarray] | np.ndarray:
+        """
+        Converting `text_column` to vectors using `glove_tokenizer`.
+        Parameters:
+        ------------
+        - text_column : pd.Series
+            Contains the case facts.
+        - train : bool, optional
+            Defines whether the model will be trained or not. (if True,
+            Tokenizer will be trained | else, Tokenizer will used the
+            passed `glove_tokenizer`). (Default is True).
+        - glove_tokenizer : Tokenizer, optional
+            Trained Tokenizer layer will be used for generating embeddings of
+             `text_column` if `train` is False. (Default is None).
+        - vocab_size : int, optional
+            Represents the number of supported vocabulary of the Tokenizer,
+            any token not in this vocabulary will be treated as an out-of-vocabulary
+            token(OOV). (Default is 1000).
+        - oov_tokens : str, optional
+            Represents the token of an out-of-vocabulary token (Default is "<OOV>").
+        - max_length : int, optional
+            Defins the output vector's dimension. (Default is 50).
+        - padding_type : str, optional
+            Defines the padding type of the vectors, if the vector size is less than
+            `max_length`, the rest of the `max_length` will be padded with 0 (Default is "post").
+        - truncation_type : str, optional
+            Defines the truncation type of the vectors, if the vector size is more than
+            `max_length`, the extra of the `max_length` will be truncated (Default is "post").
+        Returns:
+        --------
+        - if `train` == True:
+            A tuple contains the following:
+                - glove_tokenizer : Tokenizer
+                    Trained Tokenizer layer.
+                - text_padded : np.ndarray
+                    An array contains `text_column` vectors.
+        - otherwise:
+            text_padded : np.ndarray
+                An array contains `text_column` vectors.
+        Raises:
+        -------
+        - AssertionError
+            If train is False and `glove_tokenizer` is None.
+        - AssertionError
+            If train is False and `glove_tokenizer` is not instance of Tokenizer.
+        """
+        if train:
+            glove_tokenizer = Tokenizer(
+                num_words=vocab_size, oov_token=oov_token)
+            glove_tokenizer.fit_on_texts(text_column)
+            text_sequences = glove_tokenizer.texts_to_sequences(text_column)
+            text_padded = pad_sequences(
+                text_sequences,
+                maxlen=max_length,
+                padding=padding_type,
+                truncating=truncation_type,
+            )
+            return glove_tokenizer, text_padded
+        assert (
+            glove_tokenizer is not None
+        ), "`glove_tokenizer` argument must be not None."
+        assert isinstance(
+            glove_tokenizer, Tokenizer
+        ), "`glove_tokenizer` argument must be an instance of Tokenizer."
+        text_sequences = glove_tokenizer.texts_to_sequences(text_column)
+        text_padded = pad_sequences(
+            text_sequences,
+            maxlen=max_length,
+            padding=padding_type,
+            truncating=truncation_type,
+        )
+        return text_padded
+    def balance_data(self, X_train: pd.Series, y_train: pd.Series) -> pd.DataFrame:
+        """
+        Balancing `X_train` and `y_train` to distribute the targets in `y_train` equally.
+        Parameters:
+        ------------
+        - text_column : pd.Series
+             Contains the case facts.
+         - y_train : pd.Series
+             Contains the training targets.
+         Returns:
+         --------
+         -  shuffled_balanced_df : pd.DataFrame
+             Contains the new balanced dataframe with shuffling indicies.
+        """
+        df = pd.concat([X_train, y_train], axis=1)
+        first_party = df[df["winner_index"] == 0]
+        second_party = df[df["winner_index"] == 1]
+        upsample_second_party = resample(
+            second_party, replace=True, n_samples=len(first_party), random_state=42
+        )
+        upsample_df = pd.concat([upsample_second_party, first_party])
+        shuffled_indices = np.arange(upsample_df.shape[0])
+        np.random.shuffle(shuffled_indices)
+        shuffled_balanced_df = upsample_df.iloc[shuffled_indices, :]
+        return shuffled_balanced_df
+    def anonymize_data(
+        self,
+        first_party_names: pd.Series,
+        second_party_names: pd.Series,
+        text_column: pd.Series,
+    ) -> pd.Series:
+        """
+        Anonymize `text_column` by replacing `first_party_names` and
+        `second_party_names` wit "_PARTY_" tag.
+        Parameters:
+        ------------
+        - first_party_names : pd.Series
+            Contains all first party names needed to be anonymized.
+        - second_party_names : pd.Series
+            Contains all second party names needed to be anonymized.
+        - text_column : pd.Series
+            Contains all texts needed to be anonymized.
+        Returns:
+        --------
+        - all_anonyimzed_facts : pd.Series
+            Contains anonymized version of `text_column`.
+        """
+        all_anonymized_facts = []
+        for i in range(text_column.shape[0]):
+            facts = text_column.iloc[i]
+            first_party_name = first_party_names.iloc[i]
+            second_party_name = second_party_names.iloc[i]
+            anonymized_facts = self._anonymize_case_facts(
+                first_party_name, second_party_name, facts
+            )
+            all_anonymized_facts.append(anonymized_facts)
+        return pd.Series(all_anonymized_facts)
+    def preprocess_data(self, text_column: pd.Series) -> pd.Series:
+        """
+        Preprocessing & cleaning all texts in `text_column`.
+        Parameters:
+        ------------
+        - text_column : pd.Series
+            Contains all case facts.
+        Returns:
+        --------
+        - preprocessed_text : pd.Series
+            Contains all texts after being processed.
+        """
+        preprocessed_text = text_column.apply(self._preprocess_text)
+        return preprocessed_text

src/style.css ADDED Viewed

	@@ -0,0 +1,94 @@

+/*
+ * Custom stylesheet for the app's UI.
+ * NOTE(review): selectors such as .e16nr0p30, .edgvbvh10, .edgvbvh5, .e17lx80j1,
+ * .css-* and .st-bf look like auto-generated class names from the UI framework;
+ * they may change between framework versions — confirm against the rendered page.
+ */
+/* Load the Cairo font family (weights 300-800) from Google Fonts. */
+@import url('https://fonts.googleapis.com/css2?family=Cairo:wght@300;400;500;600;700;800&display=swap');
+/* Apply Cairo to every element; !important overrides fonts set elsewhere. */
+* {
+    font-family: 'Cairo', sans-serif !important;
+}
+/* title: bold, 30px heading */
+.e16nr0p30 {
+    font-weight: 700;
+    font-size: 30px;
+}
+/* buttons: full-width, 40px high, blue background, white text, no border */
+.edgvbvh10,
+.edgvbvh5 {
+    width: 100%;
+    height: 40px;
+    background-color: #4756ff;
+    color: #fff;
+    transition: 0.4s;
+    border: none;
+}
+/* buttons, hover state: slightly different blue */
+.edgvbvh10:hover,
+.edgvbvh5:hover {
+    background-color: #3747fd;
+    color: #fff;
+    border: none;
+}
+/* buttons, focus state: same colours as hover, without the focus box-shadow */
+.edgvbvh10:focus,
+.edgvbvh5:focus {
+    background-color: #3747fd;
+    color: #fff !important;
+    box-shadow: none;
+    border: none;
+}
+/* header: font size of elements with the row_heading class */
+.row_heading {
+    font-size: 14px;
+}
+/* spinner: green top border with translucent grey side/bottom borders; fixed size in flex layouts */
+.css-1y04v0k.e17lx80j1,
+.css-p6380s.e17lx80j1 {
+    margin: 0px;
+    border-color: #34e27f #b3b3b333 #cacaca33 !important;
+    -webkit-box-flex: 0;
+    flex-grow: 0;
+    flex-shrink: 0;
+}
+/* inputs styling: borderless, with an animated grey outline on hover */
+.st-bf {
+    transition: 0.8s;
+    border: none !important;
+}
+.st-bf:hover {
+    box-shadow: 0 0 0 4px #dbdbdb !important;
+}
+/* text stylings: yellow highlight tag for the petitioner */
+.highlight-petitioner {
+    border-radius: 0.4rem;
+    background-color: rgba(253, 231, 142, 0.4);
+    color: #ffd061;
+    padding: 1px 5px;
+    margin-top: 10px;
+    margin-right: 5px;
+}
+/* text stylings: blue highlight tag for the respondent */
+.highlight-respondent {
+    border-radius: 0.4rem;
+    background-color: rgba(78, 170, 255, 0.2);
+    color: #6195ff;
+    padding: 1px 5px;
+    margin-top: 10px;
+    margin-right: 5px;
+}
+/* text stylings: utility class forcing bold text */
+.bold-text {
+    font-weight: 700 !important;
+}
+/* text stylings: line height of elements with the text-facts class */
+.text-facts {
+    line-height: 40px;
+}
+/* footer: hide the page footer entirely */
+footer {
+    display: none !important;
+}

src/utils.py ADDED Viewed

	@@ -0,0 +1,389 @@

+from typing import Callable, List, Tuple
+import numpy as np
+import pandas as pd
+from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+import tensorflow as tf
+from tensorflow import keras
+from keras.preprocessing.text import Tokenizer
+def read_data(filepath="../csvs/"):
+    """
+    Reading CSV files of the dataset.
+    Parameters:
+    ----------
+    - filepath : str
+        Defines the path that contains the CSV files.
+    Returns:
+    --------
+    A tuple contains the following:
+        - X_train : pd.DataFrame
+        - y_train : pd.Series
+        - X_test : pd.DataFrame
+        - y_test : pd.Series
+    """
+    X_train = pd.read_csv(filepath + "X_train.csv")
+    X_train = X_train.iloc[:, 1:]
+    X_test = pd.read_csv(filepath + "X_test.csv")
+    X_test = X_test.iloc[:, 1:]
+    y_train = pd.read_csv(filepath + "y_train.csv")
+    y_train = y_train.iloc[:, 1:]
+    y_test = pd.read_csv(filepath + "y_test.csv")
+    y_test = y_test.iloc[:, 1:]
+    return X_train, X_test, y_train, y_test
+def train_model(
+    model_building_func: Callable[[], keras.models.Sequential],
+    X_train_vectors: pd.DataFrame | np.ndarray | tf.Tensor,
+    y_train: pd.Series,
+    k: int = 4,
+    num_epochs: int = 30,
+    batch_size: int = 64,
+) -> Tuple[
+    List[keras.models.Sequential],
+    List[List[float]],
+    List[List[float]],
+    List[List[float]],
+    List[List[float]],
+]:
+    """
+    Trains a model on `X_train_vectors` and `y_train` using k-fold cross-validation.
+    Parameters:
+    -----------
+    - model_building_func : Callable[[], tf.keras.models.Sequential]
+        A function that builds and compiles a Keras Sequential model.
+    - X_train_vectors : pd.DataFrame
+        The training input data.
+    - y_train : pd.Series
+        The training target data.
+    - k : int, optional
+        The number of folds for cross-validation (default is 4).
+    - num_epochs : int, optional
+        The number of epochs to train for (default is 30).
+    - batch_size : int, optional
+        The batch size to use during training (default is 64).
+    Returns:
+    --------
+    A tuple containing the following items:
+        - all_models : List[keras.models.Sequential]
+            A list of `k` trained models.
+        - all_losses : List[List[float]]
+            A `k` by `num_epochs` list containing the training losses for each fold.
+        - all_val_losses : List[List[float]]
+            A `k` by `num_epochs` list containing the validation losses for each fold.
+        - all_acc : List[List[float]]
+            A `k` by `num_epochs` list containing the training accuracies for each fold.
+        - all_val_acc : List[List[float]]
+            A `k` by `num_epochs` list containing the validation accuracies for each fold.
+    """
+    num_validation_samples = len(X_train_vectors) // k
+    all_models = []
+    all_losses = []
+    all_val_losses = []
+    all_accuracies = []
+    all_val_accuracies = []
+    for fold in range(k):
+        print(f"fold: {fold+1}")
+        validation_data = X_train_vectors[
+            num_validation_samples * fold : num_validation_samples * (fold + 1)
+        ]
+        validation_targets = y_train[
+            num_validation_samples * fold : num_validation_samples * (fold + 1)
+        ]
+        training_data = np.concatenate(
+            [
+                X_train_vectors[: num_validation_samples * fold],
+                X_train_vectors[num_validation_samples * (fold + 1) :],
+            ]
+        )
+        training_targets = np.concatenate(
+            [
+                y_train[: num_validation_samples * fold],
+                y_train[num_validation_samples * (fold + 1) :],
+            ]
+        )
+        model = model_building_func()
+        history = model.fit(
+            training_data,
+            training_targets,
+            validation_data=(validation_data, validation_targets),
+            epochs=num_epochs,
+            batch_size=batch_size,
+        )
+        all_models.append(model)
+        all_losses.append(history.history["loss"])
+        all_val_losses.append(history.history["val_loss"])
+        all_accuracies.append(history.history["accuracy"])
+        all_val_accuracies.append(history.history["val_accuracy"])
+    return (all_models, all_losses, all_val_losses, all_accuracies, all_val_accuracies)
+def print_testing_loss_accuracy(
+    all_models: List[keras.models.Sequential],
+    X_test_vectors: pd.DataFrame | np.ndarray | tf.Tensor,
+    y_test: pd.Series,
+) -> None:
+    """
+    Displaying testing loss and testing accuracy of each model in `all_models`,
+    and displaying their average.
+    Parameters:
+    ------------
+    - all_models : List[keras.models.Sequential]
+        A list of size `k` contains trained models.
+    - X_test_vectors : pd.DataFrame
+        Contains testing vectors.
+    - y_test : pd.Series
+        Contains testing labels.
+    """
+    sum_testing_losses = 0.0
+    sum_testing_accuracies = 0.0
+    for i, model in enumerate(all_models):
+        print(f"model: {i+1}")
+        loss_accuracy = model.evaluate(X_test_vectors, y_test, verbose=1)
+        sum_testing_losses += loss_accuracy[0]
+        sum_testing_accuracies += loss_accuracy[1]
+        print("====" * 20)
+    num_models = len(all_models)
+    avg_testing_loss = sum_testing_losses / num_models
+    avg_testing_acc = sum_testing_accuracies / num_models
+    print(f"average testing loss: {avg_testing_loss:.3f}")
+    print(f"average testing accuracy: {avg_testing_acc:.3f}")
+def calculate_average_measures(
+    all_losses: list[list[float]],
+    all_val_losses: list[list[float]],
+    all_accuracies: list[list[float]],
+    all_val_accuracies: list[list[float]],
+) -> Tuple[
+    List[keras.models.Sequential],
+    List[List[float]],
+    List[List[float]],
+    List[List[float]],
+    List[List[float]],
+]:
+    """
+    Calculate the average measures of cross-validated results.
+    Parameters:
+    ------------
+    - all_losses : List[List[float]]
+        A `k` by `num_epochs` list contains the values of training losses.
+    - all_val_losses : List[List[float]]
+        A `k` by `num_epochs` list contains the values of validation losses.
+    - all_accuracies : List[List[float]]
+        A `k` by `num_epochs` list contains the values of training accuracies.
+    - all_val_accuracies : List[List[float]]
+        A `k` by `num_epochs` list contains the values of validation accuracies.
+    Returns:
+    --------
+    A tuple containing the following items:
+        - avg_loss_hist : List[float]
+            A list of length `num_epochs` contains the average of training losses.
+        - avg_val_loss_hist : List[float]
+            A list of length `num_epochs` contains the average of validaton losses.
+        - avg_acc_hist : List[float]
+            A list of length `num_epochs` contains the average of training accuracies.
+        - avg_val_acc_hist : List[float]
+            A list of length `num_epochs` contains the average of validation accuracies.
+    """
+    num_epochs = len(all_losses[0])
+    avg_loss_hist = [np.mean([x[i] for x in all_losses]) for i in range(num_epochs)]
+    avg_val_loss_hist = [
+        np.mean([x[i] for x in all_val_losses]) for i in range(num_epochs)
+    ]
+    avg_acc_hist = [np.mean([x[i] for x in all_accuracies]) for i in range(num_epochs)]
+    avg_val_acc_hist = [
+        np.mean([x[i] for x in all_val_accuracies]) for i in range(num_epochs)
+    ]
+    return (avg_loss_hist, avg_val_loss_hist, avg_acc_hist, avg_val_acc_hist)
+class Doc2VecModel:
+    """Responsible of creating, initializing, and training Doc2Vec embeddings model."""
+    def __init__(self, vector_size=50, min_count=2, epochs=100, dm=1, window=5) -> None:
+        """
+        Initalize a Doc2Vec model.
+        Parameters:
+        ------------
+        - vector_size : int, optional
+            Dimensionality of the feature vectors (Default is 50).
+        - min_count : int, optional
+            Ignores all words with total frequency lower than this (Default is 2).
+        - epochs : int, optional
+            Represents the number of training epochs (Default is 100).
+        - dm : int, optional
+            Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
+            Otherwise, `distributed bag of words` (PV-DBOW) is employed (Default is 1).
+        - window : int, optional
+            The maximum distance between the current and predicted word within a
+            sentence (Default is 5).
+        """
+        self.doc2vec_model = Doc2Vec(
+            vector_size=vector_size,
+            min_count=min_count,
+            epochs=epochs,
+            dm=dm,
+            seed=865,
+            window=window,
+        )
+    def train_doc2vec_embeddings_model(
+        self, tagged_docs_train: List[TaggedDocument]
+    ) -> Doc2Vec:
+        """
+        Train Doc2Vec model on `tagged_docs_train`.
+        Parameters:
+        ------------
+        - tagged_docs_train : list[TaggedDocument]
+            Contains the required format of training Doc2Vec model.
+        Returns:
+        --------
+        - doc2vec_model : Doc2Vec
+            The trained Doc2Vec model.
+        """
+        self.doc2vec_model.build_vocab(tagged_docs_train)
+        self.doc2vec_model.train(
+            tagged_docs_train,
+            total_examples=self.doc2vec_model.corpus_count,
+            epochs=self.doc2vec_model.epochs,
+        )
+        return self.doc2vec_model
+class GloveModel:
+    """Responsible for creating and generating the glove embedding layer"""
+    def __init__(self) -> None:
+        pass
+    def _generate_glove_embedding_index(
+        self, glove_file_path: str = "GloVe/glove.6B.50d.txt"
+    ) -> dict:
+        """
+        Responsible for generating glove embedding index.
+        Parameters:
+        ------------
+        - glove_file_path : str
+            Defines the path of the pretrained GloVe embeddings text file
+            (Default is "GloVe/glove.6B.50d.txt").
+        Returns:
+        --------
+        - embedding_index : dict
+            Contains each word as a key, and its co-effeicents as a value.
+        """
+        embeddings_index = {}
+        with open(glove_file_path, encoding="utf8") as f:
+            for line in f:
+                values = line.split()
+                word = values[0]
+                coefs = np.asarray(values[1:], dtype="float32")
+                embeddings_index[word] = coefs
+        return embeddings_index
+    def _generate_glove_embedding_matrix(
+        self, word_index: dict, embedding_index: dict, max_length: int
+    ) -> np.ndarray:
+        """
+        Generating embedding matrix of each word in `word_index`.
+        Parameters:
+        -----------
+        - word_index : dict
+            Contains words as keys with there indicies as values.
+        - embedding_index : dict
+            Contains each word as a key, and its co-effeicents as a value.
+        - max_length : int
+            Defines the size of the embedding vector of each word in the
+            embedding matrix.
+        Returns:
+        --------
+        - embedding_matrix : np.ndarray
+            Contains all embedding vectors for each word in`word_index`.
+        """
+        embedding_matrix = np.zeros((len(word_index) + 1, max_length))
+        for word, i in word_index.items():
+            embedding_vector = embedding_index.get(word)
+            if embedding_vector is not None:
+                embedding_matrix[i] = embedding_vector
+        return embedding_matrix
+    def generate_glove_embedding_layer(
+        self, glove_tokenizer: Tokenizer, max_length: int = 50
+    ) -> keras.layers.Embedding:
+        """
+        Create GloVe embedding layer for later usage in the neural network.
+        Paramters:
+        ----------
+        - glove_tokenizer : Tokenizer
+            Trained tokenizer on training data to extract word index from it.
+        - max_length : int, optional
+            Defines the maximum length of the output embedding vector for
+            each word. (Default is 50).
+        Returns:
+        --------
+        - embedding_layer : keras.layers.Embedding
+            An embedding layer of size `word index + 1` by `max_length` with
+            trained weights that can be used a vectorizer of case facts.
+        """
+        word_index = glove_tokenizer.word_index
+        embedding_index = self._generate_glove_embedding_index()
+        embedding_matrix = self._generate_glove_embedding_matrix(
+            word_index, embedding_index, max_length
+        )
+        embedding_layer = keras.layers.Embedding(
+            len(word_index) + 1,
+            max_length,
+            weights=[embedding_matrix],
+            input_length=max_length,
+            trainable=False,
+        )
+        return embedding_layer