kimic committed on
Commit
64c01a0
1 Parent(s): c5cd586

Updated preprocessing and ran inference on data_3

analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
data_3/news_articles.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53855240e9036a7d6c204e72bd0fa9d37a10f8e1bd2b2fdf34b962569ef271c6
+ size 10969548
inference_main.py CHANGED
@@ -3,53 +3,79 @@ import pandas as pd
  from preprocessing import preprocess_text, load_tokenizer, prepare_data
  from data_loader import create_data_loader
  from inference import load_model, evaluate_model

- version = 7


  def run_evaluation(model_path, tokenizer_path, device):
- cleaned_path = f'./output/version_{version}/cleaned_inference_data_{version}.csv'
  # Load data
- try:
  df = pd.read_csv(cleaned_path)
  df.dropna(inplace=True)
  print("Cleaned data found.")
- except:
  print("No cleaned data found. Cleaning data now...")
- # Load the datasets
- true_news = pd.read_csv('data_1/True.csv')
- fake_news = pd.read_csv('data_1/Fake.csv')

- # Add labels
- true_news['label'] = 1
- fake_news['label'] = 0
-
- # Combine the datasets
- df = pd.concat([true_news, fake_news], ignore_index=True)
-
- # Drop unnecessary columns
- df.drop(columns=['subject', 'date'], inplace=True)
-
- df['title'] = df['title'].apply(preprocess_text)
- df['text'] = df['text'].apply(preprocess_text)

  df.to_csv(cleaned_path, index=False)
  df.dropna(inplace=True)
  print("Cleaned data saved.")

- labels = df['label'].values

  # Load tokenizer and model
  tokenizer = load_tokenizer(tokenizer_path)
  model = load_model(model_path, len(tokenizer.word_index) + 1)

  # Prepare data
- titles = prepare_data(df['title'], tokenizer)
- texts = prepare_data(df['text'], tokenizer)

  # Create DataLoader
- data_loader = create_data_loader(
- titles, texts, batch_size=32, shuffle=False)

  # Evaluate
  accuracy, f1, auc_roc = evaluate_model(model, data_loader, device, labels)
@@ -57,11 +83,10 @@ def run_evaluation(model_path, tokenizer_path, device):


  if __name__ == "__main__":
- model_path = f'./output/version_{version}/best_model_{version}.pth'
- tokenizer_path = f'./output/version_{version}/tokenizer_{version}.pickle'
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Device: {device}")

  accuracy, f1, auc_roc = run_evaluation(model_path, tokenizer_path, device)
- print(
- f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')
 
  from preprocessing import preprocess_text, load_tokenizer, prepare_data
  from data_loader import create_data_loader
  from inference import load_model, evaluate_model
+ import os

+ version = 9


  def run_evaluation(model_path, tokenizer_path, device):
+ cleaned_path = f"./output/version_{version}/cleaned_inference_data_{version}.csv"
  # Load data
+ if os.path.exists(cleaned_path):
  df = pd.read_csv(cleaned_path)
  df.dropna(inplace=True)
  print("Cleaned data found.")
+ else:
  print("No cleaned data found. Cleaning data now...")
+ # # Load the datasets
+ # true_news = pd.read_csv("data_1/True.csv")
+ # fake_news = pd.read_csv("data_1/Fake.csv")
+
+ # # Add labels
+ # true_news["label"] = 1
+ # fake_news["label"] = 0
+
+ # # Combine the datasets
+ # df = pd.concat([true_news, fake_news], ignore_index=True)
+
+ # # Drop unnecessary columns
+ # df.drop(columns=["subject", "date"], inplace=True)
+
+ df = pd.read_csv("./data_3/news_articles.csv")
+ df.drop(
+ columns=[
+ "author",
+ "published",
+ "site_url",
+ "main_img_url",
+ "type",
+ "text_without_stopwords",
+ "title_without_stopwords",
+ "hasImage",
+ ],
+ inplace=True,
+ )
+ # Map Real to 1 and Fake to 0
+ df["label"] = df["label"].map({"Real": 1, "Fake": 0})
+ df = df[df["label"].isin([1, 0])]
+
+ # Drop rows where the language is not 'english'
+ df = df[df["language"] == "english"]
+ df.drop(columns=["language"], inplace=True)
+
+ # Convert "no title" to empty string
+ df["title"] = df["title"].apply(lambda x: "" if x == "no title" else x)

+ df.dropna(inplace=True)
+ df["title"] = df["title"].apply(preprocess_text)
+ df["text"] = df["text"].apply(preprocess_text)

  df.to_csv(cleaned_path, index=False)
  df.dropna(inplace=True)
  print("Cleaned data saved.")

+ labels = df["label"].values

  # Load tokenizer and model
  tokenizer = load_tokenizer(tokenizer_path)
  model = load_model(model_path, len(tokenizer.word_index) + 1)

  # Prepare data
+ titles = prepare_data(df["title"], tokenizer)
+ texts = prepare_data(df["text"], tokenizer)

  # Create DataLoader
+ data_loader = create_data_loader(titles, texts, batch_size=32, shuffle=False)

  # Evaluate
  accuracy, f1, auc_roc = evaluate_model(model, data_loader, device, labels)


  if __name__ == "__main__":
+ model_path = f"./output/version_{version}/best_model_{version}.pth"
+ tokenizer_path = f"./output/version_{version}/tokenizer_{version}.pickle"
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Device: {device}")

  accuracy, f1, auc_roc = run_evaluation(model_path, tokenizer_path, device)
+ print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}")
 
output/version_8/best_model_8.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:addb4e8467fca9234e566636574ad2bd0544e91f629c5cf27ec88b173eb6df69
+ size 101405944
output/version_8/cleaned_news_data_8.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0cae611f708ed033cb431b4ff525901cdfbc27e81eeacc872087a4efd6e8310
+ size 154593478
output/version_8/confusion_matrix_data_8.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37c139bea7aa1bc4d747f5ad9cfbb93fafa113689865bfb329d480a818367134
+ size 127312
output/version_8/tokenizer_8.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:420dc8ff87e3271e6a2f4e12f503f0a746a17bf3284bcf8e450808894109d0e4
+ size 8809775
output/version_8/training_metrics_8.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06c222696c315a66c1bfa0433472784fc560e11384ddc9055d724a5a13921f1f
+ size 2296
output/version_9/best_model_9.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50c1dc4473380483255e98c19d58f34d2aab132ab17be2c7cc31d6eb88551dc8
+ size 101405944
output/version_9/cleaned_inference_data_9.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:29cd7b40d7e925e4613e986b5e68420c0ca252544aa3fa6a435723b11d2a0a01
+ size 3873531
output/version_9/cleaned_news_data_9.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0cae611f708ed033cb431b4ff525901cdfbc27e81eeacc872087a4efd6e8310
+ size 154593478
output/version_9/confusion_matrix_data_9.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c07e7992962a370e4e35680f5dc5ba6ee49b1c8c9ce056fa292f296ceb424a5
+ size 127312
output/version_9/tokenizer_9.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16950eb0e8dc9cb8d1eadeda4f15387ce0fed66b5b3349229aa5e371a918602d
+ size 8809775
output/version_9/training_metrics_9.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fd2fc4ee2c7804cbb0a2f6a74bab16af6ecacb7b05ce7efcedbdaae7757352eb
+ size 843
preprocessing.py CHANGED
@@ -4,24 +4,23 @@ from keras.preprocessing.text import Tokenizer
  from keras_preprocessing.sequence import pad_sequences
  import pickle

- spacy.prefer_gpu()
- print("GPU is available:", spacy.prefer_gpu())

  # Load spaCy's English model
- nlp = spacy.load('en_core_web_sm')


  def preprocess_text(text):
- # Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
- text = re.sub(r'\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-', '', text)

  # Remove patterns like "Featured image via author name / image place"
- text = re.sub(r'Featured image via .+ / .+', '', text)

  # Process text with spaCy
  doc = nlp(text)

- # Improved lemmatization
  lemmatized_text = []
  for token in doc:
  # Preserve named entities in their original form
@@ -31,11 +30,11 @@ def preprocess_text(text):
  elif token.is_alpha and not token.is_stop:
  lemmatized_text.append(token.lemma_.lower())

- return ' '.join(lemmatized_text)


  def load_tokenizer(tokenizer_path):
- with open(tokenizer_path, 'rb') as handle:
  tokenizer = pickle.load(handle)
  return tokenizer
 
  from keras_preprocessing.sequence import pad_sequences
  import pickle


  # Load spaCy's English model
+ nlp = spacy.load("en_core_web_sm")


  def preprocess_text(text):
+ # Remove patterns like "COUNTRY or STATE NAME (Reuters) -" or just "(Reuters)"
+ text = re.sub(
+ r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
+ )

  # Remove patterns like "Featured image via author name / image place"
+ text = re.sub(r"Featured image via .+?\.($|\s)", "", text)

  # Process text with spaCy
  doc = nlp(text)

  lemmatized_text = []
  for token in doc:
  # Preserve named entities in their original form

  elif token.is_alpha and not token.is_stop:
  lemmatized_text.append(token.lemma_.lower())

+ return " ".join(lemmatized_text)


  def load_tokenizer(tokenizer_path):
+ with open(tokenizer_path, "rb") as handle:
  tokenizer = pickle.load(handle)
  return tokenizer
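The two regex tweaks above are easiest to see on a made-up sample string (a quick sketch; the sample text is invented):

import re

sample = (
    "WASHINGTON (Reuters) - Lawmakers met on Tuesday. "
    "Featured image via Jane Doe / Example Images. More commentary follows."
)

# New Reuters pattern: matches the dateline form and a bare "(Reuters)" tag
sample = re.sub(r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", sample)

# New credit pattern: non-greedy, stops at the first period after the credit
sample = re.sub(r"Featured image via .+?\.($|\s)", "", sample)

print(sample)  # " Lawmakers met on Tuesday. More commentary follows."

Compared with the old greedy r'Featured image via .+ / .+' pattern, the non-greedy version no longer swallows everything from the credit to the end of the text.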
 
train_main.py CHANGED
@@ -12,11 +12,11 @@ import pickle
  import train as tr
  from torch.utils.data import Dataset, DataLoader
  from data_loader import NewsDataset

- version = 7

  if __name__ == "__main__":
-
  # fake_path = './data_1/Fake.csv'
  # true_path = './data_1/True.csv'
  # cleaned_path = './cleaned_news_data.csv'
@@ -47,8 +47,8 @@ if __name__ == "__main__":
  # df.to_csv('cleaned_news_data.csv', index=False)
  # df.dropna(inplace=True)

- data_path = './data_2/WELFake_Dataset.csv'
- cleaned_path = f'./output/version_{version}/cleaned_news_data_{version}.csv'
  # Load data
  try:
  df = pd.read_csv(cleaned_path)
@@ -63,35 +63,38 @@ if __name__ == "__main__":
  df.dropna(inplace=True)

  # Swapping labels around since it originally is the opposite
- df['label'] = df['label'].map({0: 1, 1: 0})

- df['title'] = df['title'].apply(preprocess_text)
- df['text'] = df['text'].apply(preprocess_text)

  df.to_csv(cleaned_path, index=False)
  print("Cleaned data saved.")

  # Splitting the data
  train_val, test = train_test_split(df, test_size=0.2, random_state=42)
  train, val = train_test_split(
- train_val, test_size=0.25, random_state=42) # 0.25 * 0.8 = 0.2

  # Initialize the tokenizer
  tokenizer = Tokenizer()

  # Fit the tokenizer on the training data
- tokenizer.fit_on_texts(train['title'] + train['text'])

- with open(f'./output/version_{version}/tokenizer_{version}.pickle', 'wb') as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

  # Tokenize the data
- X_train_title = tokenizer.texts_to_sequences(train['title'])
- X_train_text = tokenizer.texts_to_sequences(train['text'])
- X_val_title = tokenizer.texts_to_sequences(val['title'])
- X_val_text = tokenizer.texts_to_sequences(val['text'])
- X_test_title = tokenizer.texts_to_sequences(test['title'])
- X_test_text = tokenizer.texts_to_sequences(test['text'])

  # Padding sequences
  max_length = 500
@@ -108,19 +111,46 @@ if __name__ == "__main__":
  model = LSTMModel(len(tokenizer.word_index) + 1).to(device)

  # Convert data to PyTorch tensors
- train_data = NewsDataset(torch.tensor(X_train_title), torch.tensor(
- X_train_text), torch.tensor(train['label'].values))
- val_data = NewsDataset(torch.tensor(X_val_title), torch.tensor(
- X_val_text), torch.tensor(val['label'].values))
- test_data = NewsDataset(torch.tensor(X_test_title), torch.tensor(
- X_test_text), torch.tensor(test['label'].values))
-
- train_loader = DataLoader(train_data, batch_size=32,
- shuffle=True, num_workers=6, pin_memory=True, persistent_workers=True)
- val_loader = DataLoader(val_data, batch_size=32,
- shuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)
- test_loader = DataLoader(test_data, batch_size=32,
- shuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)

  criterion = nn.BCELoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
@@ -132,17 +162,17 @@ if __name__ == "__main__":
  criterion=criterion,
  optimizer=optimizer,
  version=version,
- epochs=50,
  device=device,
  max_grad_norm=1.0,
  early_stopping_patience=3,
- early_stopping_delta=0.001
  )

- print(f'Best model was saved at epoch: {best_epoch}')

  # Load the best model before testing
- best_model_path = f'./output/version_{version}/best_model_{version}.pth'
  model.load_state_dict(torch.load(best_model_path, map_location=device))

  # Testing
@@ -155,8 +185,11 @@ if __name__ == "__main__":
  correct = 0
  total = 0
  for titles, texts, labels in test_loader:
- titles, texts, labels = titles.to(device), texts.to(
- device), labels.to(device).float()
  outputs = model(titles, texts).squeeze()

  predicted = (outputs > 0.5).float()
@@ -171,10 +204,11 @@ if __name__ == "__main__":
  auc_roc = roc_auc_score(true_labels, predicted_probs)

  print(
- f'Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')

  # Create DataFrame and Save to CSV
- confusion_data = pd.DataFrame(
- {'True': true_labels, 'Predicted': predicted_labels})
  confusion_data.to_csv(
- f'./output/version_{version}/confusion_matrix_data_{version}.csv', index=False)
 
 
  import train as tr
  from torch.utils.data import Dataset, DataLoader
  from data_loader import NewsDataset
+ import os

+ version = 9

  if __name__ == "__main__":
  # fake_path = './data_1/Fake.csv'
  # true_path = './data_1/True.csv'
  # cleaned_path = './cleaned_news_data.csv'

  # df.to_csv('cleaned_news_data.csv', index=False)
  # df.dropna(inplace=True)

+ data_path = "./data_2/WELFake_Dataset.csv"
+ cleaned_path = f"./output/version_{version}/cleaned_news_data_{version}.csv"
  # Load data
  try:
  df = pd.read_csv(cleaned_path)

  df.dropna(inplace=True)

  # Swapping labels around since it originally is the opposite
+ df["label"] = df["label"].map({0: 1, 1: 0})

+ df["title"] = df["title"].apply(preprocess_text)
+ df["text"] = df["text"].apply(preprocess_text)

+ # Create the directory if it does not exist
+ os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
  df.to_csv(cleaned_path, index=False)
  print("Cleaned data saved.")

  # Splitting the data
  train_val, test = train_test_split(df, test_size=0.2, random_state=42)
  train, val = train_test_split(
+ train_val, test_size=0.25, random_state=42
+ ) # 0.25 * 0.8 = 0.2

  # Initialize the tokenizer
  tokenizer = Tokenizer()

  # Fit the tokenizer on the training data
+ tokenizer.fit_on_texts(train["title"] + train["text"])

+ with open(f"./output/version_{version}/tokenizer_{version}.pickle", "wb") as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

  # Tokenize the data
+ X_train_title = tokenizer.texts_to_sequences(train["title"])
+ X_train_text = tokenizer.texts_to_sequences(train["text"])
+ X_val_title = tokenizer.texts_to_sequences(val["title"])
+ X_val_text = tokenizer.texts_to_sequences(val["text"])
+ X_test_title = tokenizer.texts_to_sequences(test["title"])
+ X_test_text = tokenizer.texts_to_sequences(test["text"])

  # Padding sequences
  max_length = 500

  model = LSTMModel(len(tokenizer.word_index) + 1).to(device)

  # Convert data to PyTorch tensors
+ train_data = NewsDataset(
+ torch.tensor(X_train_title),
+ torch.tensor(X_train_text),
+ torch.tensor(train["label"].values),
+ )
+ val_data = NewsDataset(
+ torch.tensor(X_val_title),
+ torch.tensor(X_val_text),
+ torch.tensor(val["label"].values),
+ )
+ test_data = NewsDataset(
+ torch.tensor(X_test_title),
+ torch.tensor(X_test_text),
+ torch.tensor(test["label"].values),
+ )
+
+ train_loader = DataLoader(
+ train_data,
+ batch_size=32,
+ shuffle=True,
+ num_workers=6,
+ pin_memory=True,
+ persistent_workers=True,
+ )
+ val_loader = DataLoader(
+ val_data,
+ batch_size=32,
+ shuffle=False,
+ num_workers=6,
+ pin_memory=True,
+ persistent_workers=True,
+ )
+ test_loader = DataLoader(
+ test_data,
+ batch_size=32,
+ shuffle=False,
+ num_workers=6,
+ pin_memory=True,
+ persistent_workers=True,
+ )

  criterion = nn.BCELoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

  criterion=criterion,
  optimizer=optimizer,
  version=version,
+ epochs=10,
  device=device,
  max_grad_norm=1.0,
  early_stopping_patience=3,
+ early_stopping_delta=0.01,
  )

+ print(f"Best model was saved at epoch: {best_epoch}")

  # Load the best model before testing
+ best_model_path = f"./output/version_{version}/best_model_{version}.pth"
  model.load_state_dict(torch.load(best_model_path, map_location=device))

  # Testing

  correct = 0
  total = 0
  for titles, texts, labels in test_loader:
+ titles, texts, labels = (
+ titles.to(device),
+ texts.to(device),
+ labels.to(device).float(),
+ )
  outputs = model(titles, texts).squeeze()

  predicted = (outputs > 0.5).float()

  auc_roc = roc_auc_score(true_labels, predicted_probs)

  print(
+ f"Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}"
+ )

  # Create DataFrame and Save to CSV
+ confusion_data = pd.DataFrame({"True": true_labels, "Predicted": predicted_labels})
  confusion_data.to_csv(
+ f"./output/version_{version}/confusion_matrix_data_{version}.csv", index=False
+ )
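The split comment in train_main.py (0.25 * 0.8 = 0.2) is worth spelling out: holding out 20% for test and then 25% of the remainder for validation yields a 60/20/20 train/val/test split. A tiny sketch with dummy data (the frame here is invented, sizes only):

import pandas as pd
from sklearn.model_selection import train_test_split

# Dummy frame standing in for the cleaned WELFake data
df = pd.DataFrame({"title": ["t"] * 100, "text": ["x"] * 100, "label": [0, 1] * 50})

# 20% of all rows go to the test set
train_val, test = train_test_split(df, test_size=0.2, random_state=42)

# 25% of the remaining 80% (i.e. 20% of all rows) go to validation
train, val = train_test_split(train_val, test_size=0.25, random_state=42)

print(len(train), len(val), len(test))  # 60 20 20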