Initial commit
- .gitattributes +1 -0
- .gitignore +160 -0
- analysis.ipynb +0 -0
- data_1/Fake.csv +3 -0
- data_1/True.csv +3 -0
- data_2/WELFake_Dataset.csv +3 -0
- data_loader.py +22 -0
- inference.py +39 -0
- inference_main.py +67 -0
- inference_more.ipynb +303 -0
- model.py +20 -0
- output/version_7/best_model_7.pth +3 -0
- output/version_7/cleaned_inference_data_7.csv +3 -0
- output/version_7/cleaned_news_data_7.csv +3 -0
- output/version_7/confusion_matrix_data_7.csv +3 -0
- output/version_7/tokenizer_7.pickle +3 -0
- output/version_7/training_metrics_7.csv +3 -0
- preprocessing.py +46 -0
- test.ipynb +93 -0
- train.py +89 -0
- train_main.py +180 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
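The added rule routes every CSV through Git LFS, matching the zip/zst/tfevents patterns already in the file. Files tracked this way are committed as small pointer stubs rather than raw bytes; each pointer follows the LFS spec's three-line layout (the concrete pointers appear in the data files later in this commit):

version https://git-lfs.github.com/spec/v1
oid sha256:<SHA-256 of the actual file contents>
size <file size in bytes>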
.gitignore
ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
analysis.ipynb
ADDED
The diff for this file is too large to render.
data_1/Fake.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bebf8bcfe95678bf2c732bf413a2ce5f621af0102c82bf08083b2e5d3c693d0c
size 62789876
data_1/True.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba0844414a65dc6ae7402b8eee5306da24b6b56488d6767135af466c7dcb2775
size 53582940
data_2/WELFake_Dataset.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:665331424230fc452e9482c3547a6a199a2c29745ade8d236950d1d105223773
size 245086152
data_loader.py
ADDED
@@ -0,0 +1,22 @@
from torch.utils.data import Dataset, DataLoader
import torch


class NewsDataset(Dataset):
    def __init__(self, titles, texts, labels=None):
        self.titles = titles
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.titles[idx], self.texts[idx], self.labels[idx]
        return self.titles[idx], self.texts[idx]


def create_data_loader(titles, texts, labels=None, batch_size=32, shuffle=False, num_workers=6):
    dataset = NewsDataset(titles, texts, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=True, persistent_workers=True)
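A minimal usage sketch for create_data_loader (illustrative, not part of the commit); the random tensors stand in for padded token-id matrices, and num_workers is lowered to 1 because persistent_workers=True requires at least one worker process:

import torch
from data_loader import create_data_loader

# 8 fake articles, each padded to the project's max_length of 500 tokens
titles = torch.randint(0, 1000, (8, 500))
texts = torch.randint(0, 1000, (8, 500))
labels = torch.randint(0, 2, (8,))

loader = create_data_loader(titles, texts, labels,
                            batch_size=4, shuffle=True, num_workers=1)
for batch_titles, batch_texts, batch_labels in loader:
    print(batch_titles.shape, batch_labels.shape)
    # torch.Size([4, 500]) torch.Size([4])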
inference.py
ADDED
@@ -0,0 +1,39 @@
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from model import LSTMModel


def load_model(model_path, vocab_size):
    model = LSTMModel(vocab_size)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model


def predict(model, titles, texts, device):
    titles, texts = titles.to(device), texts.to(device)
    model.to(device)
    with torch.no_grad():
        outputs = model(titles, texts).squeeze()
    return outputs


def evaluate_model(model, data_loader, device, labels):
    model.to(device)
    model.eval()
    predictions = []
    labels = torch.tensor(labels).to(device)
    for titles, texts in data_loader:
        titles, texts = titles.to(device), texts.to(device)
        outputs = predict(model, titles, texts, device)
        predictions.extend(outputs.cpu().numpy())

    labels = labels.cpu()
    # Calculate metrics
    predicted_labels = [1 if p > 0.5 else 0 for p in predictions]
    accuracy = accuracy_score(labels, predicted_labels)
    f1 = f1_score(labels, predicted_labels)
    auc_roc = roc_auc_score(labels, predictions)

    return accuracy, f1, auc_roc
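One caveat in predict: .squeeze() with no arguments drops every size-1 dimension, so for a batch of exactly one sample the output becomes a 0-d tensor, and outputs.cpu().numpy() then yields a 0-d array that cannot be iterated. This is exactly the "TypeError: iteration over a 0-d array" recorded in inference_more.ipynb below. A hedged sketch of two possible guards, not part of the commit:

# Option 1: squeeze only the trailing feature dimension, preserving the batch dim
outputs = model(titles, texts).squeeze(-1)   # shape (batch,) even when batch == 1

# Option 2: coerce the result to at least 1-d before extending a list
import numpy as np
predictions.extend(np.atleast_1d(outputs.cpu().numpy()))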
inference_main.py
ADDED
@@ -0,0 +1,67 @@
import torch
import pandas as pd
from preprocessing import preprocess_text, load_tokenizer, prepare_data
from data_loader import create_data_loader
from inference import load_model, evaluate_model

version = 7


def run_evaluation(model_path, tokenizer_path, device):
    cleaned_path = f'./output/version_{version}/cleaned_inference_data_{version}.csv'
    # Load data
    try:
        df = pd.read_csv(cleaned_path)
        df.dropna(inplace=True)
        print("Cleaned data found.")
    except FileNotFoundError:  # was a bare `except:`; only a missing cache should trigger re-cleaning
        print("No cleaned data found. Cleaning data now...")
        # Load the datasets
        true_news = pd.read_csv('data_1/True.csv')
        fake_news = pd.read_csv('data_1/Fake.csv')

        # Add labels
        true_news['label'] = 1
        fake_news['label'] = 0

        # Combine the datasets
        df = pd.concat([true_news, fake_news], ignore_index=True)

        # Drop unnecessary columns
        df.drop(columns=['subject', 'date'], inplace=True)

        df['title'] = df['title'].apply(preprocess_text)
        df['text'] = df['text'].apply(preprocess_text)

        df.dropna(inplace=True)  # drop before saving, so the cached CSV holds only clean rows
        df.to_csv(cleaned_path, index=False)
        print("Cleaned data saved.")

    labels = df['label'].values

    # Load tokenizer and model
    tokenizer = load_tokenizer(tokenizer_path)
    model = load_model(model_path, len(tokenizer.word_index) + 1)

    # Prepare data
    titles = prepare_data(df['title'], tokenizer)
    texts = prepare_data(df['text'], tokenizer)

    # Create DataLoader
    data_loader = create_data_loader(
        titles, texts, batch_size=32, shuffle=False)

    # Evaluate
    accuracy, f1, auc_roc = evaluate_model(model, data_loader, device, labels)
    return accuracy, f1, auc_roc


if __name__ == "__main__":
    model_path = f'./output/version_{version}/best_model_{version}.pth'
    tokenizer_path = f'./output/version_{version}/tokenizer_{version}.pickle'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    accuracy, f1, auc_roc = run_evaluation(model_path, tokenizer_path, device)
    print(
        f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')
inference_more.ipynb
ADDED
@@ -0,0 +1,303 @@
(Jupyter notebook, stored as 303 lines of JSON; reconstructed below cell by cell. Kernel: "torch", Python 3.10.11.)

Cell 1 (code, execution count 1):
import torch
import torch.nn as nn
import pandas as pd
from model import LSTMModel
from data_loader import create_data_loader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader
from data_loader import NewsDataset
Output: GPU is available: True

Cell 2 (code, execution count 1):
version = 7

Cell 3 (code, execution count 2):
data_path = './data_2/WELFake_Dataset.csv'
cleaned_path = f'./output/version_{version}/cleaned_news_data_{version}.csv'

# Load data
df = pd.read_csv(cleaned_path)
df.dropna(inplace=True)
print("Cleaned data found.")
Output: Cleaned data found.

Cell 4 (code, execution count 3):
from preprocessing import preprocess_text, load_tokenizer, prepare_data
tokenizer = load_tokenizer(f'./output/version_{version}/tokenizer_{version}.pickle')

Cell 5 (code, execution count 4):
train_val, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(
    train_val, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

Cell 6 (code, execution count 5):
# Tokenize the data
X_train_title = tokenizer.texts_to_sequences(train['title'])
X_train_text = tokenizer.texts_to_sequences(train['text'])
X_val_title = tokenizer.texts_to_sequences(val['title'])
X_val_text = tokenizer.texts_to_sequences(val['text'])
X_test_title = tokenizer.texts_to_sequences(test['title'])
X_test_text = tokenizer.texts_to_sequences(test['text'])

# Padding sequences
max_length = 500
X_train_title = pad_sequences(X_train_title, maxlen=max_length)
X_train_text = pad_sequences(X_train_text, maxlen=max_length)
X_val_title = pad_sequences(X_val_title, maxlen=max_length)
X_val_text = pad_sequences(X_val_text, maxlen=max_length)
X_test_title = pad_sequences(X_test_title, maxlen=max_length)
X_test_text = pad_sequences(X_test_text, maxlen=max_length)

Cell 7 (code, execution count 6):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Cell 8 (code, execution count 7):
model = LSTMModel(len(tokenizer.word_index) + 1).to(device)

# Convert data to PyTorch tensors
train_data = NewsDataset(torch.tensor(X_train_title), torch.tensor(
    X_train_text), torch.tensor(train['label'].values))
val_data = NewsDataset(torch.tensor(X_val_title), torch.tensor(
    X_val_text), torch.tensor(val['label'].values))
test_data = NewsDataset(torch.tensor(X_test_title), torch.tensor(
    X_test_text), torch.tensor(test['label'].values))

train_loader = DataLoader(train_data, batch_size=32,
                          shuffle=True, num_workers=6, pin_memory=True, persistent_workers=True)
val_loader = DataLoader(val_data, batch_size=32,
                        shuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)
test_loader = DataLoader(test_data, batch_size=32,
                         shuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Cell 9 (code, execution count 8):
model.load_state_dict(torch.load(f"./output/version_{version}/best_model_{version}.pth", map_location=device))

# Testing
model.eval()
true_labels = []
predicted_labels = []
predicted_probs = []

with torch.no_grad():
    correct = 0
    total = 0
    for titles, texts, labels in test_loader:
        titles, texts, labels = titles.to(device), texts.to(
            device), labels.to(device).float()
        outputs = model(titles, texts).squeeze()

        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())
        predicted_probs.extend(outputs.cpu().numpy())

test_accuracy = 100 * correct / total
f1 = f1_score(true_labels, predicted_labels)
auc_roc = roc_auc_score(true_labels, predicted_probs)

print(
    f'Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')

# Create DataFrame and Save to CSV
confusion_data = pd.DataFrame(
    {'True': true_labels, 'Predicted': predicted_labels})
confusion_data.to_csv('confusion_matrix_data.csv', index=False)
Output: Test Accuracy: 98.70%, F1 Score: 0.9868, AUC-ROC: 0.9984

Cell 10 (code, execution count 36):
import numpy as np
from inference import predict, load_model
predictions = []

user_title = input("Enter title: ")
user_text = input("Enter text: ")


# Creating the DataFrame with the user's input
df = pd.DataFrame({'title': [user_title], 'text': [user_text]})
print(df.head())

df['title'] = df['title'].apply(preprocess_text)
df['text'] = df['text'].apply(preprocess_text)

tokenizer = load_tokenizer(f"./output/version_{version}/tokenizer_{version}.pickle")
model = load_model(f"./output/version_{version}/best_model_{version}.pth", len(tokenizer.word_index) + 1)
title = prepare_data(df["title"], tokenizer)
text = prepare_data(df["text"], tokenizer)

# Create DataLoader
data_loader = create_data_loader(
    title, text, batch_size=32, shuffle=False)
model.eval()
model.to(device)
for titles, texts in data_loader:
    titles, texts = titles.to(device), texts.to(device)
    outputs = predict(model, titles, texts, device)
    print(f"outputs: {outputs}")
    # predictions.extend(outputs.cpu().numpy())
predicted_labels = [1 if p > 0.5 else 0 for p in predictions]

print(predictions)
print(predicted_labels)
Output (stdout): the one-row DataFrame (a Trump / Supreme Court article), then:
outputs: 0.5209237933158875
Error (recorded traceback, from a run where the predictions.extend(...) line was not yet commented out):
TypeError: iteration over a 0-d array, raised at predictions.extend(outputs.cpu().numpy())

Cell 11 (code, execution count 18):
user_title = input("Enter title: ")
user_text = input("Enter text: ")

# Creating the DataFrame with the user's input
df = pd.DataFrame({'title': [user_title], 'text': [user_text]})
print(df.head())
Output:
         title                text
0  hello title  hello this is text
model.py
ADDED
@@ -0,0 +1,20 @@
import torch
import torch.nn as nn


class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_size=256, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, title, text):
        title_emb = self.embedding(title)
        text_emb = self.embedding(text)
        combined = torch.cat((title_emb, text_emb), dim=1)
        output, (hidden, _) = self.lstm(combined)
        out = self.fc(hidden[-1])
        return torch.sigmoid(out)
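A quick smoke test (illustrative, not in the commit) clarifies the shapes: the two embedded sequences are concatenated along the time axis, so the LSTM sees one sequence of 2 x 500 steps, and hidden[-1] gives one 256-dimensional vector per sample:

import torch
from model import LSTMModel

model = LSTMModel(vocab_size=1000)
titles = torch.randint(0, 1000, (4, 500))  # batch of 4 padded titles
texts = torch.randint(0, 1000, (4, 500))   # batch of 4 padded bodies
probs = model(titles, texts)
print(probs.shape)  # torch.Size([4, 1]) -- sigmoid probabilities of label 1 ("true")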
output/version_7/best_model_7.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d5b5750829a8f672dcbd297143eddbf0621a024930055dd7f7363db34ac6e374
size 101492472
output/version_7/cleaned_inference_data_7.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:13c4f9369494fb6a24ca2c1415027736e0814b576854b0187a4dd93c5f6f344b
size 74695505
output/version_7/cleaned_news_data_7.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f7b95458a091aeaede51ebb58a6d039e21dbc8d50a78439563d8d2b6149c1150
size 154624396
output/version_7/confusion_matrix_data_7.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ca5e75060519cb2c81af94130fd48d3db06689baf6f32eb434625ab22aa168f
size 127312
output/version_7/tokenizer_7.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e129ef007dd8405eefe6ed17a5737e368f2066bef28e933c469a180499994a56
size 8812251
output/version_7/training_metrics_7.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:273d33e9f9cdd11a57149d86f8ded1ee222cab35508628fa05dbb3f31fae20cb
size 1252
preprocessing.py
ADDED
@@ -0,0 +1,46 @@
import re
import spacy
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle

spacy.prefer_gpu()
print("GPU is available:", spacy.prefer_gpu())

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')


def preprocess_text(text):
    # Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
    text = re.sub(r'\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-', '', text)

    # Remove patterns like "Featured image via author name / image place"
    text = re.sub(r'Featured image via .+ / .+', '', text)

    # Process text with spaCy
    doc = nlp(text)

    # Improved lemmatization
    lemmatized_text = []
    for token in doc:
        # Preserve named entities in their original form
        if token.ent_type_:
            lemmatized_text.append(token.text)
        # Lemmatize other tokens and exclude non-alpha tokens if necessary
        elif token.is_alpha and not token.is_stop:
            lemmatized_text.append(token.lemma_.lower())

    return ' '.join(lemmatized_text)


def load_tokenizer(tokenizer_path):
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer


def prepare_data(texts, tokenizer, max_length=500):
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length)
    return padded
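An illustrative call (not in the commit; the exact tokens depend on the installed spaCy model, so the behavior described below is the expected one rather than a recorded output):

from preprocessing import preprocess_text

s = "WASHINGTON (Reuters) - President Trump signed the order on Tuesday."
print(preprocess_text(s))
# The "WASHINGTON (Reuters) -" dateline is stripped by the first regex;
# stop words are dropped, the remaining tokens are lemmatized and lowercased,
# and named entities (e.g. "Trump", "Tuesday") are kept in their original form.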
test.ipynb
ADDED
@@ -0,0 +1,93 @@
(Jupyter notebook, stored as 93 lines of JSON; reconstructed below cell by cell. Kernel: "torch", Python 3.10.11.)

Cell 1 (code, execution count 1):
import torch
import torch.nn as nn
import pandas as pd
from model import LSTMModel
from preprocessing import preprocess_text
from data_loader import create_data_loader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle
import train as tr
from torch.utils.data import Dataset, DataLoader
from data_loader import NewsDataset
Output: GPU is available: True
Warning (stderr): spaCy [W095]: Model 'en_core_web_sm' (3.5.0) was trained with spaCy v3.5.0 and may not be 100% compatible with the current version (3.7.2). For available updates, run: python -m spacy validate

Cell 2 (code, never executed):
fake_path = './data_1/Fake.csv'
true_path = './data_1/True.csv'

print("No cleaned data found. Cleaning data now...")
# Load the datasets
true_news = pd.read_csv('data_1/True.csv')
fake_news = pd.read_csv('data_1/Fake.csv')

# Add labels
true_news['label'] = 1
fake_news['label'] = 0

# Combine the datasets
df = pd.concat([true_news, fake_news], ignore_index=True)

# Drop unnecessary columns
df.drop(columns=['subject', 'date'], inplace=True)

df['title'] = df['title'].apply(preprocess_text)  # fixed: was df[0]['title'], which would raise KeyError
df['text'] = df['text'].apply(preprocess_text)    # fixed: was df[0]['text']

df.to_csv('test.csv', index=False)
train.py
ADDED
@@ -0,0 +1,89 @@
import torch
import pandas as pd
import time
from torch.nn.utils import clip_grad_norm_


def train(model, train_loader, val_loader, criterion, optimizer, epochs, device, version, max_grad_norm=1.0, early_stopping_patience=5, early_stopping_delta=0.001):
    best_accuracy = 0.0
    best_model_path = f'./output/version_{version}/best_model_{version}.pth'
    best_epoch = 0
    early_stopping_counter = 0
    total_batches = len(train_loader)
    metrics = {
        'epoch': [], 'train_loss': [], 'val_loss': [], 'train_accuracy': [], 'val_accuracy': []
    }

    for epoch in range(epochs):
        model.train()
        total_loss, train_correct, train_total = 0, 0, 0
        for batch_idx, (titles, texts, labels) in enumerate(train_loader):
            start_time = time.time()  # Start time for the batch

            titles, texts, labels = titles.to(device), texts.to(
                device), labels.to(device).float()

            # Forward pass
            outputs = model(titles, texts).squeeze()
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            if max_grad_norm:
                clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
            optimizer.step()

            total_loss += loss.item()
            train_pred = (outputs > 0.5).float()
            train_correct += (train_pred == labels).sum().item()
            train_total += labels.size(0)

            # Calculate and print batch processing time
            batch_time = time.time() - start_time
            print(
                f'Epoch: {epoch+1}, Batch: {batch_idx+1}/{total_batches}, Batch Processing Time: {batch_time:.4f} seconds')

        train_accuracy = 100 * train_correct / train_total
        metrics['train_loss'].append(total_loss / len(train_loader))
        metrics['train_accuracy'].append(train_accuracy)

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for titles, texts, labels in val_loader:
                titles, texts, labels = titles.to(device), texts.to(
                    device), labels.to(device).float()
                outputs = model(titles, texts).squeeze()
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                predicted = (outputs > 0.5).float()
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_accuracy = 100 * val_correct / val_total
        metrics['val_loss'].append(val_loss / len(val_loader))
        metrics['val_accuracy'].append(val_accuracy)
        metrics['epoch'].append(epoch + 1)

        # Early stopping logic
        if val_accuracy > best_accuracy + early_stopping_delta:
            best_accuracy = val_accuracy
            early_stopping_counter = 0
            best_epoch = epoch + 1
            torch.save(model.state_dict(), best_model_path)
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= early_stopping_patience:
            print(f"Early stopping triggered at epoch {epoch + 1}")
            break

        print(
            f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(train_loader):.4f}, Validation Accuracy: {val_accuracy:.2f}%')

    pd.DataFrame(metrics).to_csv(
        f'./output/version_{version}/training_metrics_{version}.csv', index=False)

    return model, best_accuracy, best_epoch
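The metrics dict written at the end of train() fixes the CSV schema, so reading the log back is straightforward; a small sketch (assuming a completed run for this version):

import pandas as pd

m = pd.read_csv('./output/version_7/training_metrics_7.csv')
print(m.columns.tolist())
# ['epoch', 'train_loss', 'val_loss', 'train_accuracy', 'val_accuracy']
print(m.tail(1))  # final epoch's losses and accuracies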
train_main.py
ADDED
@@ -0,0 +1,180 @@
import torch
import torch.nn as nn
import pandas as pd
from model import LSTMModel
from preprocessing import preprocess_text
from data_loader import create_data_loader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle
import train as tr
from torch.utils.data import Dataset, DataLoader
from data_loader import NewsDataset

version = 7

if __name__ == "__main__":

    # fake_path = './data_1/Fake.csv'
    # true_path = './data_1/True.csv'
    # cleaned_path = './cleaned_news_data.csv'
    # # Load data
    # try:
    #     df = pd.read_csv(cleaned_path)
    #     df.dropna(inplace=True)
    #     print("Cleaned data found.")
    # except:
    #     print("No cleaned data found. Cleaning data now...")
    #     # Load the datasets
    #     true_news = pd.read_csv('data_1/True.csv')
    #     fake_news = pd.read_csv('data_1/Fake.csv')

    #     # Add labels
    #     true_news['label'] = 1
    #     fake_news['label'] = 0

    #     # Combine the datasets
    #     df = pd.concat([true_news, fake_news], ignore_index=True)

    #     # Drop unnecessary columns
    #     df.drop(columns=['subject', 'date'], inplace=True)

    #     df['title'] = df['title'].apply(preprocess_text)
    #     df['text'] = df['text'].apply(preprocess_text)

    #     df.to_csv('cleaned_news_data.csv', index=False)
    #     df.dropna(inplace=True)

    data_path = './data_2/WELFake_Dataset.csv'
    cleaned_path = f'./output/version_{version}/cleaned_news_data_{version}.csv'
    # Load data
    try:
        df = pd.read_csv(cleaned_path)
        df.dropna(inplace=True)
        print("Cleaned data found.")
    except FileNotFoundError:  # was a bare `except:`; only a missing cache should trigger re-cleaning
        print("No cleaned data found. Cleaning data now...")
        df = pd.read_csv(data_path)

        # Drop index
        df.drop(df.columns[0], axis=1, inplace=True)
        df.dropna(inplace=True)

        # Swapping labels around since it originally is the opposite
        df['label'] = df['label'].map({0: 1, 1: 0})

        df['title'] = df['title'].apply(preprocess_text)
        df['text'] = df['text'].apply(preprocess_text)

        df.to_csv(cleaned_path, index=False)
        print("Cleaned data saved.")

    # Splitting the data
    train_val, test = train_test_split(df, test_size=0.2, random_state=42)
    train, val = train_test_split(
        train_val, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

    # Initialize the tokenizer
    tokenizer = Tokenizer()

    # Fit the tokenizer on the training data
    # (note: adding the two Series concatenates title and text row-wise,
    #  so the tokenizer is fitted on one combined string per article)
    tokenizer.fit_on_texts(train['title'] + train['text'])

    with open(f'./output/version_{version}/tokenizer_{version}.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Tokenize the data
    X_train_title = tokenizer.texts_to_sequences(train['title'])
    X_train_text = tokenizer.texts_to_sequences(train['text'])
    X_val_title = tokenizer.texts_to_sequences(val['title'])
    X_val_text = tokenizer.texts_to_sequences(val['text'])
    X_test_title = tokenizer.texts_to_sequences(test['title'])
    X_test_text = tokenizer.texts_to_sequences(test['text'])

    # Padding sequences
    max_length = 500
    X_train_title = pad_sequences(X_train_title, maxlen=max_length)
    X_train_text = pad_sequences(X_train_text, maxlen=max_length)
    X_val_title = pad_sequences(X_val_title, maxlen=max_length)
    X_val_text = pad_sequences(X_val_text, maxlen=max_length)
    X_test_title = pad_sequences(X_test_title, maxlen=max_length)
    X_test_text = pad_sequences(X_test_text, maxlen=max_length)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    model = LSTMModel(len(tokenizer.word_index) + 1).to(device)

    # Convert data to PyTorch tensors
    train_data = NewsDataset(torch.tensor(X_train_title), torch.tensor(
        X_train_text), torch.tensor(train['label'].values))
    val_data = NewsDataset(torch.tensor(X_val_title), torch.tensor(
        X_val_text), torch.tensor(val['label'].values))
    test_data = NewsDataset(torch.tensor(X_test_title), torch.tensor(
        X_test_text), torch.tensor(test['label'].values))

    train_loader = DataLoader(train_data, batch_size=32,
                              shuffle=True, num_workers=6, pin_memory=True, persistent_workers=True)
    val_loader = DataLoader(val_data, batch_size=32,
                            shuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)
    test_loader = DataLoader(test_data, batch_size=32,
                             shuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    trained_model, best_accuracy, best_epoch = tr.train(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        version=version,
        epochs=50,
        device=device,
        max_grad_norm=1.0,
        early_stopping_patience=3,
        early_stopping_delta=0.001
    )

    print(f'Best model was saved at epoch: {best_epoch}')

    # Load the best model before testing
    best_model_path = f'./output/version_{version}/best_model_{version}.pth'
    model.load_state_dict(torch.load(best_model_path, map_location=device))

    # Testing
    model.eval()
    true_labels = []
    predicted_labels = []
    predicted_probs = []

    with torch.no_grad():
        correct = 0
        total = 0
        for titles, texts, labels in test_loader:
            titles, texts, labels = titles.to(device), texts.to(
                device), labels.to(device).float()
            outputs = model(titles, texts).squeeze()

            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())
            predicted_probs.extend(outputs.cpu().numpy())

    test_accuracy = 100 * correct / total
    f1 = f1_score(true_labels, predicted_labels)
    auc_roc = roc_auc_score(true_labels, predicted_probs)

    print(
        f'Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')

    # Create DataFrame and Save to CSV
    confusion_data = pd.DataFrame(
        {'True': true_labels, 'Predicted': predicted_labels})
    confusion_data.to_csv(
        f'./output/version_{version}/confusion_matrix_data_{version}.csv', index=False)
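A note on the loss pairing in train_main.py: LSTMModel.forward already applies torch.sigmoid, so nn.BCELoss() is the matching criterion. A numerically steadier variant (an assumption about an alternative design, not what this commit does) would return raw logits from the model and swap in the fused loss:

import torch.nn as nn

# only valid if LSTMModel.forward is changed to `return out` (no sigmoid)
criterion = nn.BCEWithLogitsLoss()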