kimic committed on
Commit
c5cd586
1 Parent(s): 85431a0

Initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
data_1/Fake.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bebf8bcfe95678bf2c732bf413a2ce5f621af0102c82bf08083b2e5d3c693d0c
3
+ size 62789876
data_1/True.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba0844414a65dc6ae7402b8eee5306da24b6b56488d6767135af466c7dcb2775
3
+ size 53582940
data_2/WELFake_Dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:665331424230fc452e9482c3547a6a199a2c29745ade8d236950d1d105223773
3
+ size 245086152
data_loader.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Dataset, DataLoader
2
+ import torch
3
+
4
+
5
+ class NewsDataset(Dataset):
6
+ def __init__(self, titles, texts, labels=None):
7
+ self.titles = titles
8
+ self.texts = texts
9
+ self.labels = labels
10
+
11
+ def __len__(self):
12
+ return len(self.titles)
13
+
14
+ def __getitem__(self, idx):
15
+ if self.labels is not None:
16
+ return self.titles[idx], self.texts[idx], self.labels[idx]
17
+ return self.titles[idx], self.texts[idx]
18
+
19
+
20
+ def create_data_loader(titles, texts, labels=None, batch_size=32, shuffle=False, num_workers=6):
21
+ dataset = NewsDataset(titles, texts, labels)
22
+ return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=True, persistent_workers=True)
inference.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pandas as pd
3
+ from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
4
+ from model import LSTMModel
5
+
6
+
7
def load_model(model_path, vocab_size, map_location="cpu"):
    """Load a trained LSTMModel checkpoint and put it in eval mode.

    Bug fix: ``torch.load`` without ``map_location`` tries to restore
    tensors onto the device they were saved from, so a GPU-trained
    checkpoint crashed on a CPU-only host. Defaulting to "cpu" is safe
    because every caller moves the model with ``.to(device)`` afterwards.

    Args:
        model_path: path to the ``state_dict`` checkpoint file.
        vocab_size: vocabulary size used to rebuild the embedding layer.
        map_location: device to map storage onto (new, backward-compatible
            keyword; pass a device to override).

    Returns:
        The model in eval mode.
    """
    model = LSTMModel(vocab_size)
    model.load_state_dict(torch.load(model_path, map_location=map_location))
    model.eval()
    return model
12
+
13
+
14
def predict(model, titles, texts, device):
    """Run one gradient-free forward pass and return the squeezed outputs.

    Moves the model and both input batches to ``device`` before calling
    ``model(titles, texts)``.
    """
    model.to(device)
    titles = titles.to(device)
    texts = texts.to(device)
    with torch.no_grad():
        return model(titles, texts).squeeze()
20
+
21
+
22
def evaluate_model(model, data_loader, device, labels):
    """Score ``model`` over an unlabeled ``data_loader`` against ``labels``.

    Bug fix: ``predict`` squeezes the outputs, so a final batch of size 1
    yields a 0-d tensor and ``list.extend`` on its numpy view raises
    "TypeError: iteration over a 0-d array". ``reshape(-1)`` always
    produces a 1-d array.

    Also removes the pointless ``labels`` → device tensor → ``.cpu()``
    round-trip: the sklearn metric functions accept the array directly.

    Args:
        model: trained model taking (titles, texts) batches.
        data_loader: iterable of (titles, texts) batches (no labels).
        device: torch device to run inference on.
        labels: ground-truth 0/1 labels aligned with the loader's samples.

    Returns:
        Tuple of (accuracy, f1, auc_roc).
    """
    model.to(device)
    model.eval()
    predictions = []
    for titles, texts in data_loader:
        titles, texts = titles.to(device), texts.to(device)
        outputs = predict(model, titles, texts, device)
        # reshape(-1) guards the batch-size-1 case (0-d after squeeze()).
        predictions.extend(outputs.cpu().numpy().reshape(-1))

    # Binarize at 0.5 for accuracy/F1; AUC-ROC uses the raw scores.
    predicted_labels = [1 if p > 0.5 else 0 for p in predictions]
    accuracy = accuracy_score(labels, predicted_labels)
    f1 = f1_score(labels, predicted_labels)
    auc_roc = roc_auc_score(labels, predictions)

    return accuracy, f1, auc_roc
inference_main.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pandas as pd
3
+ from preprocessing import preprocess_text, load_tokenizer, prepare_data
4
+ from data_loader import create_data_loader
5
+ from inference import load_model, evaluate_model
6
+
7
+ version = 7
8
+
9
+
10
def run_evaluation(model_path, tokenizer_path, device):
    """Load (or build and cache) cleaned inference data, then evaluate.

    Reads the cached cleaned CSV for the current ``version`` if present;
    otherwise rebuilds it from data_1/True.csv and data_1/Fake.csv.

    Bug fix: the original used a bare ``except:``, which also swallowed
    KeyboardInterrupt/SystemExit and genuine CSV parse errors — only a
    missing or empty cache file should trigger the rebuild path.

    Args:
        model_path: checkpoint path for ``load_model``.
        tokenizer_path: pickled tokenizer path for ``load_tokenizer``.
        device: torch device used for evaluation.

    Returns:
        Tuple of (accuracy, f1, auc_roc) from ``evaluate_model``.
    """
    # NOTE(review): assumes ./output/version_<version>/ already exists
    # when the cache is written — confirm the training step creates it.
    cleaned_path = f'./output/version_{version}/cleaned_inference_data_{version}.csv'
    # Load data
    try:
        df = pd.read_csv(cleaned_path)
        df.dropna(inplace=True)
        print("Cleaned data found.")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print("No cleaned data found. Cleaning data now...")
        # Load the datasets
        true_news = pd.read_csv('data_1/True.csv')
        fake_news = pd.read_csv('data_1/Fake.csv')

        # Add labels: 1 = real news, 0 = fake news.
        true_news['label'] = 1
        fake_news['label'] = 0

        # Combine the datasets
        df = pd.concat([true_news, fake_news], ignore_index=True)

        # Drop columns the model never consumes.
        df.drop(columns=['subject', 'date'], inplace=True)

        df['title'] = df['title'].apply(preprocess_text)
        df['text'] = df['text'].apply(preprocess_text)

        df.to_csv(cleaned_path, index=False)
        df.dropna(inplace=True)
        print("Cleaned data saved.")

    labels = df['label'].values

    # Load tokenizer and model
    tokenizer = load_tokenizer(tokenizer_path)
    model = load_model(model_path, len(tokenizer.word_index) + 1)

    # Tokenize and pad both text fields.
    titles = prepare_data(df['title'], tokenizer)
    texts = prepare_data(df['text'], tokenizer)

    # Create DataLoader (no labels: inference-mode dataset).
    data_loader = create_data_loader(
        titles, texts, batch_size=32, shuffle=False)

    # Evaluate
    accuracy, f1, auc_roc = evaluate_model(model, data_loader, device, labels)
    return accuracy, f1, auc_roc
57
+
58
+
59
if __name__ == "__main__":
    # Pick CUDA when available, otherwise fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Artifact paths follow the ./output/version_<n>/ layout used elsewhere.
    model_path = f'./output/version_{version}/best_model_{version}.pth'
    tokenizer_path = f'./output/version_{version}/tokenizer_{version}.pickle'

    accuracy, f1, auc_roc = run_evaluation(model_path, tokenizer_path, device)
    print(
        f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')
inference_more.ipynb ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "GPU is available: True\n"
13
+ ]
14
+ }
15
+ ],
16
+ "source": [
17
+ "import torch\n",
18
+ "import torch.nn as nn\n",
19
+ "import pandas as pd\n",
20
+ "from model import LSTMModel\n",
21
+ "from data_loader import create_data_loader\n",
22
+ "from sklearn.model_selection import train_test_split\n",
23
+ "from sklearn.metrics import f1_score, roc_auc_score\n",
24
+ "from keras_preprocessing.sequence import pad_sequences\n",
25
+ "from torch.utils.data import DataLoader\n",
26
+ "from data_loader import NewsDataset"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 1,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "version = 7"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 2,
41
+ "metadata": {},
42
+ "outputs": [
43
+ {
44
+ "name": "stdout",
45
+ "output_type": "stream",
46
+ "text": [
47
+ "Cleaned data found.\n"
48
+ ]
49
+ }
50
+ ],
51
+ "source": [
52
+ "data_path = './data_2/WELFake_Dataset.csv'\n",
53
+ "cleaned_path = f'./output/version_{version}/cleaned_news_data_{version}.csv'\n",
54
+ "\n",
55
+ "# Load data\n",
56
+ "df = pd.read_csv(cleaned_path)\n",
57
+ "df.dropna(inplace=True)\n",
58
+ "print(\"Cleaned data found.\")"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 3,
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
+ "from preprocessing import preprocess_text, load_tokenizer, prepare_data\n",
68
+ "tokenizer = load_tokenizer(f'./output/version_{version}/tokenizer_{version}.pickle')"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 4,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "train_val, test = train_test_split(df, test_size=0.2, random_state=42)\n",
78
+ "train, val = train_test_split(\n",
79
+ "\ttrain_val, test_size=0.25, random_state=42) # 0.25 * 0.8 = 0.2"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 5,
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "# Tokenize the data\n",
89
+ "X_train_title = tokenizer.texts_to_sequences(train['title'])\n",
90
+ "X_train_text = tokenizer.texts_to_sequences(train['text'])\n",
91
+ "X_val_title = tokenizer.texts_to_sequences(val['title'])\n",
92
+ "X_val_text = tokenizer.texts_to_sequences(val['text'])\n",
93
+ "X_test_title = tokenizer.texts_to_sequences(test['title'])\n",
94
+ "X_test_text = tokenizer.texts_to_sequences(test['text'])\n",
95
+ "\n",
96
+ "# Padding sequences\n",
97
+ "max_length = 500\n",
98
+ "X_train_title = pad_sequences(X_train_title, maxlen=max_length)\n",
99
+ "X_train_text = pad_sequences(X_train_text, maxlen=max_length)\n",
100
+ "X_val_title = pad_sequences(X_val_title, maxlen=max_length)\n",
101
+ "X_val_text = pad_sequences(X_val_text, maxlen=max_length)\n",
102
+ "X_test_title = pad_sequences(X_test_title, maxlen=max_length)\n",
103
+ "X_test_text = pad_sequences(X_test_text, maxlen=max_length)"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 6,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 7,
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "model = LSTMModel(len(tokenizer.word_index) + 1).to(device)\n",
122
+ "\n",
123
+ "# Convert data to PyTorch tensors\n",
124
+ "train_data = NewsDataset(torch.tensor(X_train_title), torch.tensor(\n",
125
+ "\tX_train_text), torch.tensor(train['label'].values))\n",
126
+ "val_data = NewsDataset(torch.tensor(X_val_title), torch.tensor(\n",
127
+ "\tX_val_text), torch.tensor(val['label'].values))\n",
128
+ "test_data = NewsDataset(torch.tensor(X_test_title), torch.tensor(\n",
129
+ "\tX_test_text), torch.tensor(test['label'].values))\n",
130
+ "\n",
131
+ "train_loader = DataLoader(train_data, batch_size=32,\n",
132
+ "\t\t\t\t\t\t\tshuffle=True, num_workers=6, pin_memory=True, persistent_workers=True)\n",
133
+ "val_loader = DataLoader(val_data, batch_size=32,\n",
134
+ "\t\t\t\t\t\tshuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)\n",
135
+ "test_loader = DataLoader(test_data, batch_size=32,\n",
136
+ "\t\t\t\t\t\t\tshuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)\n",
137
+ "\n",
138
+ "criterion = nn.BCELoss()\n",
139
+ "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 8,
145
+ "metadata": {},
146
+ "outputs": [
147
+ {
148
+ "name": "stdout",
149
+ "output_type": "stream",
150
+ "text": [
151
+ "Test Accuracy: 98.70%, F1 Score: 0.9868, AUC-ROC: 0.9984\n"
152
+ ]
153
+ }
154
+ ],
155
+ "source": [
156
+ "model.load_state_dict(torch.load(f\"./output/version_{version}/best_model_{version}.pth\", map_location=device))\n",
157
+ "\n",
158
+ "# Testing\n",
159
+ "model.eval()\n",
160
+ "true_labels = []\n",
161
+ "predicted_labels = []\n",
162
+ "predicted_probs = []\n",
163
+ "\n",
164
+ "with torch.no_grad():\n",
165
+ "\tcorrect = 0\n",
166
+ "\ttotal = 0\n",
167
+ "\tfor titles, texts, labels in test_loader:\n",
168
+ "\t\ttitles, texts, labels = titles.to(device), texts.to(\n",
169
+ "\t\t\tdevice), labels.to(device).float()\n",
170
+ "\t\toutputs = model(titles, texts).squeeze()\n",
171
+ "\n",
172
+ "\t\tpredicted = (outputs > 0.5).float()\n",
173
+ "\t\ttotal += labels.size(0)\n",
174
+ "\t\tcorrect += (predicted == labels).sum().item()\n",
175
+ "\t\ttrue_labels.extend(labels.cpu().numpy())\n",
176
+ "\t\tpredicted_labels.extend(predicted.cpu().numpy())\n",
177
+ "\t\tpredicted_probs.extend(outputs.cpu().numpy())\n",
178
+ "\n",
179
+ "test_accuracy = 100 * correct / total\n",
180
+ "f1 = f1_score(true_labels, predicted_labels)\n",
181
+ "auc_roc = roc_auc_score(true_labels, predicted_probs)\n",
182
+ "\n",
183
+ "print(\n",
184
+ "\tf'Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')\n",
185
+ "\n",
186
+ "# Create DataFrame and Save to CSV\n",
187
+ "confusion_data = pd.DataFrame(\n",
188
+ "\t{'True': true_labels, 'Predicted': predicted_labels})\n",
189
+ "confusion_data.to_csv('confusion_matrix_data.csv', index=False)"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 36,
195
+ "metadata": {},
196
+ "outputs": [
197
+ {
198
+ "name": "stdout",
199
+ "output_type": "stream",
200
+ "text": [
201
+ " title \\\n",
202
+ "0 Trump’s creating just the kind of legal chaos ... \n",
203
+ "\n",
204
+ " text \n",
205
+ "0 Donald Trump’s request to the Supreme Court o... \n",
206
+ "outputs: 0.5209237933158875\n"
207
+ ]
208
+ },
209
+ {
210
+ "ename": "TypeError",
211
+ "evalue": "iteration over a 0-d array",
212
+ "output_type": "error",
213
+ "traceback": [
214
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
215
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
216
+ "Cell \u001b[1;32mIn[36], line 30\u001b[0m\n\u001b[0;32m 28\u001b[0m outputs \u001b[38;5;241m=\u001b[39m predict(model, titles, texts, device)\n\u001b[0;32m 29\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutputs: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00moutputs\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 30\u001b[0m \u001b[43mpredictions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mextend\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutputs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcpu\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnumpy\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 31\u001b[0m predicted_labels \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m1\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m p \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0.5\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;241m0\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m predictions]\n\u001b[0;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(predictions)\n",
217
+ "\u001b[1;31mTypeError\u001b[0m: iteration over a 0-d array"
218
+ ]
219
+ }
220
+ ],
221
+ "source": [
222
+ "import numpy as np\n",
223
+ "from inference import predict, load_model\n",
224
+ "predictions = []\n",
225
+ "\n",
226
+ "user_title = input(\"Enter title: \")\n",
227
+ "user_text = input(\"Enter text: \")\n",
228
+ "\n",
229
+ "\n",
230
+ "# Creating the DataFrame with the user's input\n",
231
+ "df = pd.DataFrame({'title': [user_title], 'text': [user_text]})\n",
232
+ "print(df.head())\n",
233
+ "\n",
234
+ "df['title'] = df['title'].apply(preprocess_text)\n",
235
+ "df['text'] = df['text'].apply(preprocess_text)\n",
236
+ "\n",
237
+ "tokenizer = load_tokenizer(f\"./output/version_{version}/tokenizer_{version}.pickle\")\n",
238
+ "model = load_model(f\"./output/version_{version}/best_model_{version}.pth\", len(tokenizer.word_index) + 1)\n",
239
+ "title = prepare_data(df[\"title\"], tokenizer)\n",
240
+ "text = prepare_data(df[\"text\"], tokenizer)\n",
241
+ "\n",
242
+ "# Create DataLoader\n",
243
+ "data_loader = create_data_loader(\n",
244
+ " title, text, batch_size=32, shuffle=False)\n",
245
+ "model.eval()\n",
246
+ "model.to(device)\n",
247
+ "for titles, texts in data_loader:\n",
248
+ " titles, texts = titles.to(device), texts.to(device)\n",
249
+ " outputs = predict(model, titles, texts, device)\n",
250
+ " print(f\"outputs: {outputs}\")\n",
251
+ " # predictions.extend(outputs.cpu().numpy())\n",
252
+ "predicted_labels = [1 if p > 0.5 else 0 for p in predictions]\n",
253
+ "\n",
254
+ "print(predictions)\n",
255
+ "print(predicted_labels)"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": 18,
261
+ "metadata": {},
262
+ "outputs": [
263
+ {
264
+ "name": "stdout",
265
+ "output_type": "stream",
266
+ "text": [
267
+ " title text\n",
268
+ "0 hello title hello this is text\n"
269
+ ]
270
+ }
271
+ ],
272
+ "source": [
273
+ "user_title = input(\"Enter title: \")\n",
274
+ "user_text = input(\"Enter text: \")\n",
275
+ "\n",
276
+ "# Creating the DataFrame with the user's input\n",
277
+ "df = pd.DataFrame({'title': [user_title], 'text': [user_text]})\n",
278
+ "print(df.head())"
279
+ ]
280
+ }
281
+ ],
282
+ "metadata": {
283
+ "kernelspec": {
284
+ "display_name": "torch",
285
+ "language": "python",
286
+ "name": "python3"
287
+ },
288
+ "language_info": {
289
+ "codemirror_mode": {
290
+ "name": "ipython",
291
+ "version": 3
292
+ },
293
+ "file_extension": ".py",
294
+ "mimetype": "text/x-python",
295
+ "name": "python",
296
+ "nbconvert_exporter": "python",
297
+ "pygments_lexer": "ipython3",
298
+ "version": "3.10.11"
299
+ }
300
+ },
301
+ "nbformat": 4,
302
+ "nbformat_minor": 2
303
+ }
model.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class LSTMModel(nn.Module):
    """Binary fake-news classifier: shared embedding -> LSTM -> sigmoid head."""

    def __init__(self, vocab_size, embedding_dim=128, hidden_size=256, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        # Layers are created in the same order as before so that saved
        # checkpoints and seeded initialization still line up.
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True,
                            dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, title, text):
        """Return a probability per sample, shape (batch, 1)."""
        # Embed both fields and feed them to the LSTM as one sequence:
        # title tokens first, then text tokens (concatenated on dim 1).
        sequence = torch.cat((self.embedding(title), self.embedding(text)), dim=1)
        _, (final_hidden, _) = self.lstm(sequence)
        # Final hidden state of the last LSTM layer drives the 1-unit head;
        # sigmoid maps the logit into (0, 1).
        return torch.sigmoid(self.fc(final_hidden[-1]))
output/version_7/best_model_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5b5750829a8f672dcbd297143eddbf0621a024930055dd7f7363db34ac6e374
3
+ size 101492472
output/version_7/cleaned_inference_data_7.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13c4f9369494fb6a24ca2c1415027736e0814b576854b0187a4dd93c5f6f344b
3
+ size 74695505
output/version_7/cleaned_news_data_7.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7b95458a091aeaede51ebb58a6d039e21dbc8d50a78439563d8d2b6149c1150
3
+ size 154624396
output/version_7/confusion_matrix_data_7.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ca5e75060519cb2c81af94130fd48d3db06689baf6f32eb434625ab22aa168f
3
+ size 127312
output/version_7/tokenizer_7.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e129ef007dd8405eefe6ed17a5737e368f2066bef28e933c469a180499994a56
3
+ size 8812251
output/version_7/training_metrics_7.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:273d33e9f9cdd11a57149d86f8ded1ee222cab35508628fa05dbb3f31fae20cb
3
+ size 1252
preprocessing.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import spacy
3
+ from keras.preprocessing.text import Tokenizer
4
+ from keras_preprocessing.sequence import pad_sequences
5
+ import pickle
6
+
7
+ spacy.prefer_gpu()
8
+ print("GPU is available:", spacy.prefer_gpu())
9
+
10
+ # Load spaCy's English model
11
+ nlp = spacy.load('en_core_web_sm')
12
+
13
+
14
def preprocess_text(text):
    """Strip news boilerplate from ``text`` and lemmatize it with spaCy.

    Named entities keep their original surface form; other alphabetic,
    non-stopword tokens become lowercased lemmas. All remaining tokens
    (punctuation, numbers, stopwords) are dropped.
    """
    # Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
    text = re.sub(r'\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-', '', text)
    # Remove patterns like "Featured image via author name / image place"
    text = re.sub(r'Featured image via .+ / .+', '', text)

    doc = nlp(text)

    # Entities pass through verbatim; everything else is kept only if it is
    # an alphabetic non-stopword, as its lowercased lemma.
    kept = [
        tok.text if tok.ent_type_ else tok.lemma_.lower()
        for tok in doc
        if tok.ent_type_ or (tok.is_alpha and not tok.is_stop)
    ]
    return ' '.join(kept)
35
+
36
+
37
def load_tokenizer(tokenizer_path):
    """Unpickle and return the tokenizer saved at ``tokenizer_path``.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file —
    only load tokenizer pickles produced by this project's own training run.
    """
    with open(tokenizer_path, 'rb') as handle:
        return pickle.load(handle)
41
+
42
+
43
def prepare_data(texts, tokenizer, max_length=500):
    """Tokenize ``texts`` and pad/truncate every sequence to ``max_length``.

    Returns the padded integer-id matrix produced by ``pad_sequences``.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length)
test.ipynb ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "GPU is available: True\n"
13
+ ]
14
+ },
15
+ {
16
+ "name": "stderr",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "c:\\Users\\kimi\\anaconda3\\envs\\torch\\lib\\site-packages\\spacy\\util.py:910: UserWarning: [W095] Model 'en_core_web_sm' (3.5.0) was trained with spaCy v3.5.0 and may not be 100% compatible with the current version (3.7.2). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n",
20
+ " warnings.warn(warn_msg)\n"
21
+ ]
22
+ }
23
+ ],
24
+ "source": [
25
+ "import torch\n",
26
+ "import torch.nn as nn\n",
27
+ "import pandas as pd\n",
28
+ "from model import LSTMModel\n",
29
+ "from preprocessing import preprocess_text\n",
30
+ "from data_loader import create_data_loader\n",
31
+ "from sklearn.model_selection import train_test_split\n",
32
+ "from sklearn.metrics import f1_score, roc_auc_score\n",
33
+ "from keras.preprocessing.text import Tokenizer\n",
34
+ "from keras_preprocessing.sequence import pad_sequences\n",
35
+ "import pickle\n",
36
+ "import train as tr\n",
37
+ "from torch.utils.data import Dataset, DataLoader\n",
38
+ "from data_loader import NewsDataset"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "fake_path = './data_1/Fake.csv'\n",
48
+ "true_path = './data_1/True.csv'\n",
49
+ "\n",
50
+ "print(\"No cleaned data found. Cleaning data now...\")\n",
51
+ "# Load the datasets\n",
52
+ "true_news = pd.read_csv('data_1/True.csv')\n",
53
+ "fake_news = pd.read_csv('data_1/Fake.csv')\n",
54
+ "\n",
55
+ "# Add labels\n",
56
+ "true_news['label'] = 1\n",
57
+ "fake_news['label'] = 0\n",
58
+ "\n",
59
+ "# Combine the datasets\n",
60
+ "df = pd.concat([true_news, fake_news], ignore_index=True)\n",
61
+ "\n",
62
+ "# Drop unnecessary columns\n",
63
+ "df.drop(columns=['subject', 'date'], inplace=True)\n",
64
+ "\n",
65
+ "df['title'] = df[0]['title'].apply(preprocess_text)\n",
66
+ "df['text'] = df[0]['text'].apply(preprocess_text)\n",
67
+ "\n",
68
+ "df.to_csv('test.csv', index=False)"
69
+ ]
70
+ }
71
+ ],
72
+ "metadata": {
73
+ "kernelspec": {
74
+ "display_name": "torch",
75
+ "language": "python",
76
+ "name": "python3"
77
+ },
78
+ "language_info": {
79
+ "codemirror_mode": {
80
+ "name": "ipython",
81
+ "version": 3
82
+ },
83
+ "file_extension": ".py",
84
+ "mimetype": "text/x-python",
85
+ "name": "python",
86
+ "nbconvert_exporter": "python",
87
+ "pygments_lexer": "ipython3",
88
+ "version": "3.10.11"
89
+ }
90
+ },
91
+ "nbformat": 4,
92
+ "nbformat_minor": 2
93
+ }
train.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pandas as pd
3
+ import time
4
+ from torch.nn.utils import clip_grad_norm_
5
+
6
+
7
def train(model, train_loader, val_loader, criterion, optimizer, epochs, device, version, max_grad_norm=1.0, early_stopping_patience=5, early_stopping_delta=0.001):
    """Train ``model`` with per-epoch validation and accuracy-based early stopping.

    Saves the best checkpoint (by validation accuracy) to
    ./output/version_<version>/best_model_<version>.pth and writes per-epoch
    metrics to training_metrics_<version>.csv in the same directory.
    NOTE(review): assumes that directory already exists — confirm the caller
    creates it before training.

    Args:
        model: network taking (titles, texts) batches and returning scores.
        train_loader: yields (titles, texts, labels) training batches.
        val_loader: yields (titles, texts, labels) validation batches.
        criterion: loss on (outputs, float labels).
        optimizer: optimizer over ``model.parameters()``.
        epochs: maximum number of epochs.
        device: torch device for model and batches.
        version: run identifier used in output paths.
        max_grad_norm: gradient-clipping norm; falsy value disables clipping.
        early_stopping_patience: epochs without improvement before stopping.
        early_stopping_delta: minimum accuracy gain counted as improvement.

    Returns:
        Tuple of (model, best_accuracy, best_epoch).
    """
    best_accuracy = 0.0
    best_model_path = f'./output/version_{version}/best_model_{version}.pth'
    best_epoch = 0
    early_stopping_counter = 0
    total_batches = len(train_loader)
    # One list per metric, appended once per completed epoch.
    metrics = {
        'epoch': [], 'train_loss': [], 'val_loss': [], 'train_accuracy': [], 'val_accuracy': []
    }

    for epoch in range(epochs):
        model.train()
        total_loss, train_correct, train_total = 0, 0, 0
        for batch_idx, (titles, texts, labels) in enumerate(train_loader):
            start_time = time.time()  # Start time for the batch

            titles, texts, labels = titles.to(device), texts.to(
                device), labels.to(device).float()

            # Forward pass
            outputs = model(titles, texts).squeeze()
            loss = criterion(outputs, labels)

            # Backward and optimize (clipping only if max_grad_norm is truthy)
            optimizer.zero_grad()
            loss.backward()
            if max_grad_norm:
                clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
            optimizer.step()

            total_loss += loss.item()
            # Threshold at 0.5 for training accuracy bookkeeping.
            train_pred = (outputs > 0.5).float()
            train_correct += (train_pred == labels).sum().item()
            train_total += labels.size(0)

            # Calculate and print batch processing time
            batch_time = time.time() - start_time
            print(
                f'Epoch: {epoch+1}, Batch: {batch_idx+1}/{total_batches}, Batch Processing Time: {batch_time:.4f} seconds')

        train_accuracy = 100 * train_correct / train_total
        metrics['train_loss'].append(total_loss / len(train_loader))
        metrics['train_accuracy'].append(train_accuracy)

        # Validation (no gradients; model in eval mode)
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for titles, texts, labels in val_loader:
                titles, texts, labels = titles.to(device), texts.to(
                    device), labels.to(device).float()
                outputs = model(titles, texts).squeeze()
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                predicted = (outputs > 0.5).float()
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_accuracy = 100 * val_correct / val_total
        metrics['val_loss'].append(val_loss / len(val_loader))
        metrics['val_accuracy'].append(val_accuracy)
        metrics['epoch'].append(epoch + 1)

        # Early stopping logic: only an improvement larger than
        # early_stopping_delta resets the counter and saves a checkpoint.
        if val_accuracy > best_accuracy + early_stopping_delta:
            best_accuracy = val_accuracy
            early_stopping_counter = 0
            best_epoch = epoch + 1
            torch.save(model.state_dict(), best_model_path)
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= early_stopping_patience:
            print(f"Early stopping triggered at epoch {epoch + 1}")
            break

        # Note: this epoch summary is skipped when early stopping breaks above.
        print(
            f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(train_loader):.4f}, Validation Accuracy: {val_accuracy:.2f}%')

    # Persist the per-epoch metrics even when stopping early.
    pd.DataFrame(metrics).to_csv(
        f'./output/version_{version}/training_metrics_{version}.csv', index=False)

    return model, best_accuracy, best_epoch
train_main.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import pandas as pd
4
+ from model import LSTMModel
5
+ from preprocessing import preprocess_text
6
+ from data_loader import create_data_loader
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.metrics import f1_score, roc_auc_score
9
+ from keras.preprocessing.text import Tokenizer
10
+ from keras_preprocessing.sequence import pad_sequences
11
+ import pickle
12
+ import train as tr
13
+ from torch.utils.data import Dataset, DataLoader
14
+ from data_loader import NewsDataset
15
+
16
# Model/run version; namespaces every artifact under ./output/version_{version}/.
version = 7

if __name__ == "__main__":
    import os

    data_path = './data_2/WELFake_Dataset.csv'
    cleaned_path = f'./output/version_{version}/cleaned_news_data_{version}.csv'

    # Create the output directory up front: the cleaned-data CSV, tokenizer
    # pickle, best-model checkpoint and metrics CSVs below all write into it
    # and would otherwise fail with FileNotFoundError on a fresh checkout.
    os.makedirs(f'./output/version_{version}', exist_ok=True)

    # Load the cached cleaned dataset if present; otherwise build it from the
    # raw WELFake CSV. The handler is narrowed from a bare `except:` so that
    # real failures (parse errors, KeyboardInterrupt, ...) are not swallowed.
    try:
        df = pd.read_csv(cleaned_path)
        df.dropna(inplace=True)
        print("Cleaned data found.")
    except FileNotFoundError:
        print("No cleaned data found. Cleaning data now...")
        df = pd.read_csv(data_path)

        # Drop the dataset's leading index column and incomplete rows.
        df.drop(df.columns[0], axis=1, inplace=True)
        df.dropna(inplace=True)

        # Swapping labels around since it originally is the opposite.
        df['label'] = df['label'].map({0: 1, 1: 0})

        df['title'] = df['title'].apply(preprocess_text)
        df['text'] = df['text'].apply(preprocess_text)

        df.to_csv(cleaned_path, index=False)
        print("Cleaned data saved.")

    # 60/20/20 train/val/test split: 0.25 of the remaining 0.8 == 0.2.
    train_val, test = train_test_split(df, test_size=0.2, random_state=42)
    train, val = train_test_split(
        train_val, test_size=0.25, random_state=42)

    # Fit the tokenizer on the training split only (title + text concatenated
    # per row) to avoid leaking validation/test vocabulary, then persist it so
    # inference can reproduce the exact same word->index mapping.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train['title'] + train['text'])

    with open(f'./output/version_{version}/tokenizer_{version}.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    max_length = 500

    def _encode(series):
        # Text series -> integer sequences -> fixed-length padded 2-D array.
        return pad_sequences(tokenizer.texts_to_sequences(series), maxlen=max_length)

    X_train_title = _encode(train['title'])
    X_train_text = _encode(train['text'])
    X_val_title = _encode(val['title'])
    X_val_text = _encode(val['text'])
    X_test_title = _encode(test['title'])
    X_test_text = _encode(test['text'])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # +1 because Keras tokenizer indices start at 1 (0 is reserved for padding).
    model = LSTMModel(len(tokenizer.word_index) + 1).to(device)

    # Wrap each split as (title, text, label) tensor datasets.
    train_data = NewsDataset(torch.tensor(X_train_title),
                             torch.tensor(X_train_text),
                             torch.tensor(train['label'].values))
    val_data = NewsDataset(torch.tensor(X_val_title),
                           torch.tensor(X_val_text),
                           torch.tensor(val['label'].values))
    test_data = NewsDataset(torch.tensor(X_test_title),
                            torch.tensor(X_test_text),
                            torch.tensor(test['label'].values))

    # Shared loader settings; only training data is shuffled.
    loader_kwargs = dict(batch_size=32, num_workers=6,
                         pin_memory=True, persistent_workers=True)
    train_loader = DataLoader(train_data, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_data, shuffle=False, **loader_kwargs)
    test_loader = DataLoader(test_data, shuffle=False, **loader_kwargs)

    # BCELoss assumes the model emits sigmoid probabilities in [0, 1].
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    trained_model, best_accuracy, best_epoch = tr.train(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        version=version,
        epochs=50,
        device=device,
        max_grad_norm=1.0,
        early_stopping_patience=3,
        early_stopping_delta=0.001
    )

    print(f'Best model was saved at epoch: {best_epoch}')

    # Evaluate the best checkpoint (not the final-epoch weights) on the test set.
    best_model_path = f'./output/version_{version}/best_model_{version}.pth'
    model.load_state_dict(torch.load(best_model_path, map_location=device))

    model.eval()
    true_labels = []
    predicted_labels = []
    predicted_probs = []

    with torch.no_grad():
        correct = 0
        total = 0
        for titles, texts, labels in test_loader:
            titles, texts, labels = titles.to(device), texts.to(
                device), labels.to(device).float()
            # reshape(-1) instead of squeeze(): a bare squeeze() collapses a
            # final batch of size 1 to a 0-d tensor, which breaks the
            # .extend() calls below with a TypeError.
            outputs = model(titles, texts).reshape(-1)

            # 0.5 probability threshold for the binary real/fake decision.
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())
            predicted_probs.extend(outputs.cpu().numpy())

    test_accuracy = 100 * correct / total
    f1 = f1_score(true_labels, predicted_labels)
    # AUC-ROC is computed from raw probabilities, not thresholded labels.
    auc_roc = roc_auc_score(true_labels, predicted_probs)

    print(
        f'Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')

    # Persist per-sample true/predicted labels for confusion-matrix plotting.
    confusion_data = pd.DataFrame(
        {'True': true_labels, 'Predicted': predicted_labels})
    confusion_data.to_csv(
        f'./output/version_{version}/confusion_matrix_data_{version}.csv', index=False)