Updated preprocessing and ran inference on data_3
Files changed:
- analysis.ipynb +0 -0
- data_3/news_articles.csv +3 -0
- inference_main.py +53 -28
- output/version_8/best_model_8.pth +3 -0
- output/version_8/cleaned_news_data_8.csv +3 -0
- output/version_8/confusion_matrix_data_8.csv +3 -0
- output/version_8/tokenizer_8.pickle +3 -0
- output/version_8/training_metrics_8.csv +3 -0
- output/version_9/best_model_9.pth +3 -0
- output/version_9/cleaned_inference_data_9.csv +3 -0
- output/version_9/cleaned_news_data_9.csv +3 -0
- output/version_9/confusion_matrix_data_9.csv +3 -0
- output/version_9/tokenizer_9.pickle +3 -0
- output/version_9/training_metrics_9.csv +3 -0
- preprocessing.py +8 -9
- train_main.py +73 -39
analysis.ipynb (CHANGED)

The diff for this file is too large to render; see the raw diff.
data_3/news_articles.csv (ADDED, Git LFS pointer)

version https://git-lfs.github.com/spec/v1
oid sha256:53855240e9036a7d6c204e72bd0fa9d37a10f8e1bd2b2fdf34b962569ef271c6
size 10969548
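The article CSV is stored through Git LFS, so the commit adds only this small pointer; the roughly 10.9 MB of data has to be fetched (for example with git lfs pull) before inference_main.py can read it. Below is a minimal, hypothetical guard, not part of the repository, that fails loudly if the pointer was never resolved:

from pathlib import Path

import pandas as pd


def load_lfs_csv(path):
    # An unresolved Git LFS pointer starts with the spec URL instead of CSV data.
    head = Path(path).read_bytes()[:80]
    if head.startswith(b"version https://git-lfs.github.com/spec/v1"):
        raise RuntimeError(f"{path} is still an LFS pointer; run 'git lfs pull' first.")
    return pd.read_csv(path)


df = load_lfs_csv("./data_3/news_articles.csv")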
inference_main.py (CHANGED, +53 -28; new side of each hunk shown below)

@@ -3,53 +3,79 @@ import pandas as pd
from preprocessing import preprocess_text, load_tokenizer, prepare_data
from data_loader import create_data_loader
from inference import load_model, evaluate_model
import os

version = 9


def run_evaluation(model_path, tokenizer_path, device):
    cleaned_path = f"./output/version_{version}/cleaned_inference_data_{version}.csv"
    # Load data
    if os.path.exists(cleaned_path):
        df = pd.read_csv(cleaned_path)
        df.dropna(inplace=True)
        print("Cleaned data found.")
    else:
        print("No cleaned data found. Cleaning data now...")
        # # Load the datasets
        # true_news = pd.read_csv("data_1/True.csv")
        # fake_news = pd.read_csv("data_1/Fake.csv")

        # # Add labels
        # true_news["label"] = 1
        # fake_news["label"] = 0

        # # Combine the datasets
        # df = pd.concat([true_news, fake_news], ignore_index=True)

        # # Drop unnecessary columns
        # df.drop(columns=["subject", "date"], inplace=True)

        df = pd.read_csv("./data_3/news_articles.csv")
        df.drop(
            columns=[
                "author",
                "published",
                "site_url",
                "main_img_url",
                "type",
                "text_without_stopwords",
                "title_without_stopwords",
                "hasImage",
            ],
            inplace=True,
        )
        # Map Real to 1 and Fake to 0
        df["label"] = df["label"].map({"Real": 1, "Fake": 0})
        df = df[df["label"].isin([1, 0])]

        # Drop rows where the language is not 'english'
        df = df[df["language"] == "english"]
        df.drop(columns=["language"], inplace=True)

        # Convert "no title" to empty string
        df["title"] = df["title"].apply(lambda x: "" if x == "no title" else x)

        df.dropna(inplace=True)
        df["title"] = df["title"].apply(preprocess_text)
        df["text"] = df["text"].apply(preprocess_text)

        df.to_csv(cleaned_path, index=False)
        df.dropna(inplace=True)
        print("Cleaned data saved.")

    labels = df["label"].values

    # Load tokenizer and model
    tokenizer = load_tokenizer(tokenizer_path)
    model = load_model(model_path, len(tokenizer.word_index) + 1)

    # Prepare data
    titles = prepare_data(df["title"], tokenizer)
    texts = prepare_data(df["text"], tokenizer)

    # Create DataLoader
    data_loader = create_data_loader(titles, texts, batch_size=32, shuffle=False)

    # Evaluate
    accuracy, f1, auc_roc = evaluate_model(model, data_loader, device, labels)

@@ -57,11 +83,10 @@ def run_evaluation(model_path, tokenizer_path, device):
if __name__ == "__main__":
    model_path = f"./output/version_{version}/best_model_{version}.pth"
    tokenizer_path = f"./output/version_{version}/tokenizer_{version}.pickle"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    accuracy, f1, auc_roc = run_evaluation(model_path, tokenizer_path, device)
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}")
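prepare_data and create_data_loader come from preprocessing.py and data_loader.py, which this commit does not touch, so their bodies are not part of the diff. A minimal sketch of what the calls above are assumed to do, reusing the max_length of 500 that train_main.py pads to; the actual implementations in the repository may differ:

import torch
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset


def prepare_data(texts, tokenizer, max_length=500):
    # Turn raw strings into fixed-length integer sequences for the LSTM.
    sequences = tokenizer.texts_to_sequences(list(texts))
    padded = pad_sequences(sequences, maxlen=max_length)
    return torch.tensor(padded, dtype=torch.long)


def create_data_loader(titles, texts, batch_size=32, shuffle=False):
    # Batch title and body tensors together so each iteration yields (titles, texts).
    dataset = TensorDataset(titles, texts)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)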
output/version_8/best_model_8.pth (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:addb4e8467fca9234e566636574ad2bd0544e91f629c5cf27ec88b173eb6df69
size 101405944

output/version_8/cleaned_news_data_8.csv (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:c0cae611f708ed033cb431b4ff525901cdfbc27e81eeacc872087a4efd6e8310
size 154593478

output/version_8/confusion_matrix_data_8.csv (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:37c139bea7aa1bc4d747f5ad9cfbb93fafa113689865bfb329d480a818367134
size 127312

output/version_8/tokenizer_8.pickle (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:420dc8ff87e3271e6a2f4e12f503f0a746a17bf3284bcf8e450808894109d0e4
size 8809775

output/version_8/training_metrics_8.csv (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:06c222696c315a66c1bfa0433472784fc560e11384ddc9055d724a5a13921f1f
size 2296

output/version_9/best_model_9.pth (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:50c1dc4473380483255e98c19d58f34d2aab132ab17be2c7cc31d6eb88551dc8
size 101405944

output/version_9/cleaned_inference_data_9.csv (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:29cd7b40d7e925e4613e986b5e68420c0ca252544aa3fa6a435723b11d2a0a01
size 3873531

output/version_9/cleaned_news_data_9.csv (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:c0cae611f708ed033cb431b4ff525901cdfbc27e81eeacc872087a4efd6e8310
size 154593478

output/version_9/confusion_matrix_data_9.csv (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:5c07e7992962a370e4e35680f5dc5ba6ee49b1c8c9ce056fa292f296ceb424a5
size 127312

output/version_9/tokenizer_9.pickle (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:16950eb0e8dc9cb8d1eadeda4f15387ce0fed66b5b3349229aa5e371a918602d
size 8809775

output/version_9/training_metrics_9.csv (ADDED, Git LFS pointer)
version https://git-lfs.github.com/spec/v1
oid sha256:fd2fc4ee2c7804cbb0a2f6a74bab16af6ecacb7b05ce7efcedbdaae7757352eb
size 843
preprocessing.py (CHANGED, +8 -9; new side of each hunk shown below)

@@ -4,24 +4,23 @@ from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle


# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    # Remove patterns like "COUNTRY or STATE NAME (Reuters) -" or just "(Reuters)"
    text = re.sub(
        r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
    )

    # Remove patterns like "Featured image via author name / image place"
    text = re.sub(r"Featured image via .+?\.($|\s)", "", text)

    # Process text with spaCy
    doc = nlp(text)

    lemmatized_text = []
    for token in doc:
        # Preserve named entities in their original form

@@ -31,11 +30,11 @@ def preprocess_text(text):
        elif token.is_alpha and not token.is_stop:
            lemmatized_text.append(token.lemma_.lower())

    return " ".join(lemmatized_text)


def load_tokenizer(tokenizer_path):
    with open(tokenizer_path, "rb") as handle:
        tokenizer = pickle.load(handle)
    return tokenizer
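The first re.sub in preprocess_text strips the Reuters dateline that opens many of the source articles; the second drops "Featured image via ..." credits. A small, self-contained example of the dateline pattern on a made-up headline (the spaCy pass that follows would then lemmatize and lowercase the remaining non-entity tokens and drop stop words):

import re

REUTERS_PATTERN = r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))"

# Hypothetical sample headline; the "WASHINGTON (Reuters) -" prefix is removed.
sample = "WASHINGTON (Reuters) - Lawmakers passed the budget bill on Tuesday."
print(re.sub(REUTERS_PATTERN, "", sample).strip())
# Lawmakers passed the budget bill on Tuesday.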
train_main.py (CHANGED, +73 -39; new side of each hunk shown below)

@@ -12,11 +12,11 @@ import pickle
import train as tr
from torch.utils.data import Dataset, DataLoader
from data_loader import NewsDataset
import os

version = 9

if __name__ == "__main__":
    # fake_path = './data_1/Fake.csv'
    # true_path = './data_1/True.csv'
    # cleaned_path = './cleaned_news_data.csv'

@@ -47,8 +47,8 @@ if __name__ == "__main__":
    # df.to_csv('cleaned_news_data.csv', index=False)
    # df.dropna(inplace=True)

    data_path = "./data_2/WELFake_Dataset.csv"
    cleaned_path = f"./output/version_{version}/cleaned_news_data_{version}.csv"
    # Load data
    try:
        df = pd.read_csv(cleaned_path)

@@ -63,35 +63,38 @@ if __name__ == "__main__":
        df.dropna(inplace=True)

        # Swapping labels around since it originally is the opposite
        df["label"] = df["label"].map({0: 1, 1: 0})

        df["title"] = df["title"].apply(preprocess_text)
        df["text"] = df["text"].apply(preprocess_text)

        # Create the directory if it does not exist
        os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
        df.to_csv(cleaned_path, index=False)
        print("Cleaned data saved.")

    # Splitting the data
    train_val, test = train_test_split(df, test_size=0.2, random_state=42)
    train, val = train_test_split(
        train_val, test_size=0.25, random_state=42
    )  # 0.25 * 0.8 = 0.2

    # Initialize the tokenizer
    tokenizer = Tokenizer()

    # Fit the tokenizer on the training data
    tokenizer.fit_on_texts(train["title"] + train["text"])

    with open(f"./output/version_{version}/tokenizer_{version}.pickle", "wb") as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Tokenize the data
    X_train_title = tokenizer.texts_to_sequences(train["title"])
    X_train_text = tokenizer.texts_to_sequences(train["text"])
    X_val_title = tokenizer.texts_to_sequences(val["title"])
    X_val_text = tokenizer.texts_to_sequences(val["text"])
    X_test_title = tokenizer.texts_to_sequences(test["title"])
    X_test_text = tokenizer.texts_to_sequences(test["text"])

    # Padding sequences
    max_length = 500

@@ -108,19 +111,46 @@ if __name__ == "__main__":
    model = LSTMModel(len(tokenizer.word_index) + 1).to(device)

    # Convert data to PyTorch tensors
    train_data = NewsDataset(
        torch.tensor(X_train_title),
        torch.tensor(X_train_text),
        torch.tensor(train["label"].values),
    )
    val_data = NewsDataset(
        torch.tensor(X_val_title),
        torch.tensor(X_val_text),
        torch.tensor(val["label"].values),
    )
    test_data = NewsDataset(
        torch.tensor(X_test_title),
        torch.tensor(X_test_text),
        torch.tensor(test["label"].values),
    )

    train_loader = DataLoader(
        train_data,
        batch_size=32,
        shuffle=True,
        num_workers=6,
        pin_memory=True,
        persistent_workers=True,
    )
    val_loader = DataLoader(
        val_data,
        batch_size=32,
        shuffle=False,
        num_workers=6,
        pin_memory=True,
        persistent_workers=True,
    )
    test_loader = DataLoader(
        test_data,
        batch_size=32,
        shuffle=False,
        num_workers=6,
        pin_memory=True,
        persistent_workers=True,
    )

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

@@ -132,17 +162,17 @@ if __name__ == "__main__":
        criterion=criterion,
        optimizer=optimizer,
        version=version,
        epochs=10,
        device=device,
        max_grad_norm=1.0,
        early_stopping_patience=3,
        early_stopping_delta=0.01,
    )

    print(f"Best model was saved at epoch: {best_epoch}")

    # Load the best model before testing
    best_model_path = f"./output/version_{version}/best_model_{version}.pth"
    model.load_state_dict(torch.load(best_model_path, map_location=device))

@@ -155,8 +185,11 @@ if __name__ == "__main__":
    correct = 0
    total = 0
    for titles, texts, labels in test_loader:
        titles, texts, labels = (
            titles.to(device),
            texts.to(device),
            labels.to(device).float(),
        )
        outputs = model(titles, texts).squeeze()

        predicted = (outputs > 0.5).float()

@@ -171,10 +204,11 @@ if __name__ == "__main__":
    auc_roc = roc_auc_score(true_labels, predicted_probs)

    print(
        f"Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}"
    )

    # Create DataFrame and Save to CSV
    confusion_data = pd.DataFrame({"True": true_labels, "Predicted": predicted_labels})
    confusion_data.to_csv(
        f"./output/version_{version}/confusion_matrix_data_{version}.csv", index=False
    )
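NewsDataset is imported from data_loader.py, which is unchanged in this commit, so its definition is not part of the diff. A minimal sketch of the interface the script assumes, i.e. three aligned tensors in and (title, text, label) triples out, matching the unpacking in the test loop; the real class may differ:

import torch
from torch.utils.data import Dataset


class NewsDataset(Dataset):
    def __init__(self, titles, texts, labels):
        # titles/texts are padded token-id tensors, labels are 0/1 targets.
        self.titles = titles
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.titles[idx], self.texts[idx], self.labels[idx]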