kimic committed on
Commit
1bb2bdd
1 Parent(s): 64c01a0

Added confusion matrix (cm) and updated graph titles for clarity

Browse files
analysis.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
inference.py CHANGED
@@ -29,11 +29,12 @@ def evaluate_model(model, data_loader, device, labels):
29
  outputs = predict(model, titles, texts, device)
30
  predictions.extend(outputs.cpu().numpy())
31
 
32
- labels = labels.cpu()
33
- # Calculate metrics
34
  predicted_labels = [1 if p > 0.5 else 0 for p in predictions]
 
 
35
  accuracy = accuracy_score(labels, predicted_labels)
36
  f1 = f1_score(labels, predicted_labels)
37
  auc_roc = roc_auc_score(labels, predictions)
38
 
39
- return accuracy, f1, auc_roc
 
29
  outputs = predict(model, titles, texts, device)
30
  predictions.extend(outputs.cpu().numpy())
31
 
32
+ labels = labels.cpu().numpy() # Convert labels to NumPy array for consistency
 
33
  predicted_labels = [1 if p > 0.5 else 0 for p in predictions]
34
+
35
+ # Calculate metrics
36
  accuracy = accuracy_score(labels, predicted_labels)
37
  f1 = f1_score(labels, predicted_labels)
38
  auc_roc = roc_auc_score(labels, predictions)
39
 
40
+ return accuracy, f1, auc_roc, labels, predicted_labels
inference_analysis.ipynb ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 5,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "C:\\Users\\kimi\\AppData\\Local\\Temp\\ipykernel_6488\\2691833235.py:5: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.\n",
13
+ " plt.style.use(\"seaborn-whitegrid\")\n"
14
+ ]
15
+ },
16
+ {
17
+ "data": {
18
+ "image/png": "",
19
+ "text/plain": [
20
+ "<Figure size 800x600 with 2 Axes>"
21
+ ]
22
+ },
23
+ "metadata": {},
24
+ "output_type": "display_data"
25
+ }
26
+ ],
27
+ "source": [
28
+ "import matplotlib.pyplot as plt\n",
29
+ "import seaborn as sns\n",
30
+ "import pandas as pd\n",
31
+ "\n",
32
+ "plt.style.use(\"seaborn-whitegrid\")\n",
33
+ "\n",
34
+ "version = 9\n",
35
+ "\n",
36
+ "# Read confusion matrix from CSV\n",
37
+ "cm_df = pd.read_csv(\n",
38
+ " f\"./output/version_{version}/confusion_matrix_inference_{version}.csv\"\n",
39
+ ")\n",
40
+ "cm = cm_df.values\n",
41
+ "\n",
42
+ "# Plotting\n",
43
+ "plt.figure(figsize=(8, 6))\n",
44
+ "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\")\n",
45
+ "plt.title(\"Confusion Matrix (LSTM, Holdout Set)\")\n",
46
+ "plt.ylabel(\"True label\")\n",
47
+ "plt.xlabel(\"Predicted label\")\n",
48
+ "plt.show()"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": []
57
+ }
58
+ ],
59
+ "metadata": {
60
+ "kernelspec": {
61
+ "display_name": "torch",
62
+ "language": "python",
63
+ "name": "python3"
64
+ },
65
+ "language_info": {
66
+ "codemirror_mode": {
67
+ "name": "ipython",
68
+ "version": 3
69
+ },
70
+ "file_extension": ".py",
71
+ "mimetype": "text/x-python",
72
+ "name": "python",
73
+ "nbconvert_exporter": "python",
74
+ "pygments_lexer": "ipython3",
75
+ "version": "3.10.11"
76
+ }
77
+ },
78
+ "nbformat": 4,
79
+ "nbformat_minor": 2
80
+ }
inference_main.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
3
  from preprocessing import preprocess_text, load_tokenizer, prepare_data
4
  from data_loader import create_data_loader
5
  from inference import load_model, evaluate_model
 
6
  import os
7
 
8
  version = 9
@@ -78,7 +79,16 @@ def run_evaluation(model_path, tokenizer_path, device):
78
  data_loader = create_data_loader(titles, texts, batch_size=32, shuffle=False)
79
 
80
  # Evaluate
81
- accuracy, f1, auc_roc = evaluate_model(model, data_loader, device, labels)
 
 
 
 
 
 
 
 
 
82
  return accuracy, f1, auc_roc
83
 
84
 
 
3
  from preprocessing import preprocess_text, load_tokenizer, prepare_data
4
  from data_loader import create_data_loader
5
  from inference import load_model, evaluate_model
6
+ from sklearn.metrics import confusion_matrix
7
  import os
8
 
9
  version = 9
 
79
  data_loader = create_data_loader(titles, texts, batch_size=32, shuffle=False)
80
 
81
  # Evaluate
82
+ accuracy, f1, auc_roc, y_true, y_pred = evaluate_model(
83
+ model, data_loader, device, labels
84
+ )
85
+
86
+ # Generate and save confusion matrix
87
+ cm = confusion_matrix(y_true, y_pred)
88
+ cm_df = pd.DataFrame(cm)
89
+ cm_filename = f"./output/version_{version}/confusion_matrix_inference_{version}.csv"
90
+ cm_df.to_csv(cm_filename, index=False)
91
+ print(f"Confusion Matrix saved to {cm_filename}")
92
  return accuracy, f1, auc_roc
93
 
94
 
inference_more.ipynb DELETED
@@ -1,303 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [
8
- {
9
- "name": "stdout",
10
- "output_type": "stream",
11
- "text": [
12
- "GPU is available: True\n"
13
- ]
14
- }
15
- ],
16
- "source": [
17
- "import torch\n",
18
- "import torch.nn as nn\n",
19
- "import pandas as pd\n",
20
- "from model import LSTMModel\n",
21
- "from data_loader import create_data_loader\n",
22
- "from sklearn.model_selection import train_test_split\n",
23
- "from sklearn.metrics import f1_score, roc_auc_score\n",
24
- "from keras_preprocessing.sequence import pad_sequences\n",
25
- "from torch.utils.data import DataLoader\n",
26
- "from data_loader import NewsDataset"
27
- ]
28
- },
29
- {
30
- "cell_type": "code",
31
- "execution_count": 1,
32
- "metadata": {},
33
- "outputs": [],
34
- "source": [
35
- "version = 7"
36
- ]
37
- },
38
- {
39
- "cell_type": "code",
40
- "execution_count": 2,
41
- "metadata": {},
42
- "outputs": [
43
- {
44
- "name": "stdout",
45
- "output_type": "stream",
46
- "text": [
47
- "Cleaned data found.\n"
48
- ]
49
- }
50
- ],
51
- "source": [
52
- "data_path = './data_2/WELFake_Dataset.csv'\n",
53
- "cleaned_path = f'./output/version_{version}/cleaned_news_data_{version}.csv'\n",
54
- "\n",
55
- "# Load data\n",
56
- "df = pd.read_csv(cleaned_path)\n",
57
- "df.dropna(inplace=True)\n",
58
- "print(\"Cleaned data found.\")"
59
- ]
60
- },
61
- {
62
- "cell_type": "code",
63
- "execution_count": 3,
64
- "metadata": {},
65
- "outputs": [],
66
- "source": [
67
- "from preprocessing import preprocess_text, load_tokenizer, prepare_data\n",
68
- "tokenizer = load_tokenizer(f'./output/version_{version}/tokenizer_{version}.pickle')"
69
- ]
70
- },
71
- {
72
- "cell_type": "code",
73
- "execution_count": 4,
74
- "metadata": {},
75
- "outputs": [],
76
- "source": [
77
- "train_val, test = train_test_split(df, test_size=0.2, random_state=42)\n",
78
- "train, val = train_test_split(\n",
79
- "\ttrain_val, test_size=0.25, random_state=42) # 0.25 * 0.8 = 0.2"
80
- ]
81
- },
82
- {
83
- "cell_type": "code",
84
- "execution_count": 5,
85
- "metadata": {},
86
- "outputs": [],
87
- "source": [
88
- "# Tokenize the data\n",
89
- "X_train_title = tokenizer.texts_to_sequences(train['title'])\n",
90
- "X_train_text = tokenizer.texts_to_sequences(train['text'])\n",
91
- "X_val_title = tokenizer.texts_to_sequences(val['title'])\n",
92
- "X_val_text = tokenizer.texts_to_sequences(val['text'])\n",
93
- "X_test_title = tokenizer.texts_to_sequences(test['title'])\n",
94
- "X_test_text = tokenizer.texts_to_sequences(test['text'])\n",
95
- "\n",
96
- "# Padding sequences\n",
97
- "max_length = 500\n",
98
- "X_train_title = pad_sequences(X_train_title, maxlen=max_length)\n",
99
- "X_train_text = pad_sequences(X_train_text, maxlen=max_length)\n",
100
- "X_val_title = pad_sequences(X_val_title, maxlen=max_length)\n",
101
- "X_val_text = pad_sequences(X_val_text, maxlen=max_length)\n",
102
- "X_test_title = pad_sequences(X_test_title, maxlen=max_length)\n",
103
- "X_test_text = pad_sequences(X_test_text, maxlen=max_length)"
104
- ]
105
- },
106
- {
107
- "cell_type": "code",
108
- "execution_count": 6,
109
- "metadata": {},
110
- "outputs": [],
111
- "source": [
112
- "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
113
- ]
114
- },
115
- {
116
- "cell_type": "code",
117
- "execution_count": 7,
118
- "metadata": {},
119
- "outputs": [],
120
- "source": [
121
- "model = LSTMModel(len(tokenizer.word_index) + 1).to(device)\n",
122
- "\n",
123
- "# Convert data to PyTorch tensors\n",
124
- "train_data = NewsDataset(torch.tensor(X_train_title), torch.tensor(\n",
125
- "\tX_train_text), torch.tensor(train['label'].values))\n",
126
- "val_data = NewsDataset(torch.tensor(X_val_title), torch.tensor(\n",
127
- "\tX_val_text), torch.tensor(val['label'].values))\n",
128
- "test_data = NewsDataset(torch.tensor(X_test_title), torch.tensor(\n",
129
- "\tX_test_text), torch.tensor(test['label'].values))\n",
130
- "\n",
131
- "train_loader = DataLoader(train_data, batch_size=32,\n",
132
- "\t\t\t\t\t\t\tshuffle=True, num_workers=6, pin_memory=True, persistent_workers=True)\n",
133
- "val_loader = DataLoader(val_data, batch_size=32,\n",
134
- "\t\t\t\t\t\tshuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)\n",
135
- "test_loader = DataLoader(test_data, batch_size=32,\n",
136
- "\t\t\t\t\t\t\tshuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)\n",
137
- "\n",
138
- "criterion = nn.BCELoss()\n",
139
- "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)"
140
- ]
141
- },
142
- {
143
- "cell_type": "code",
144
- "execution_count": 8,
145
- "metadata": {},
146
- "outputs": [
147
- {
148
- "name": "stdout",
149
- "output_type": "stream",
150
- "text": [
151
- "Test Accuracy: 98.70%, F1 Score: 0.9868, AUC-ROC: 0.9984\n"
152
- ]
153
- }
154
- ],
155
- "source": [
156
- "model.load_state_dict(torch.load(f\"./output/version_{version}/best_model_{version}.pth\", map_location=device))\n",
157
- "\n",
158
- "# Testing\n",
159
- "model.eval()\n",
160
- "true_labels = []\n",
161
- "predicted_labels = []\n",
162
- "predicted_probs = []\n",
163
- "\n",
164
- "with torch.no_grad():\n",
165
- "\tcorrect = 0\n",
166
- "\ttotal = 0\n",
167
- "\tfor titles, texts, labels in test_loader:\n",
168
- "\t\ttitles, texts, labels = titles.to(device), texts.to(\n",
169
- "\t\t\tdevice), labels.to(device).float()\n",
170
- "\t\toutputs = model(titles, texts).squeeze()\n",
171
- "\n",
172
- "\t\tpredicted = (outputs > 0.5).float()\n",
173
- "\t\ttotal += labels.size(0)\n",
174
- "\t\tcorrect += (predicted == labels).sum().item()\n",
175
- "\t\ttrue_labels.extend(labels.cpu().numpy())\n",
176
- "\t\tpredicted_labels.extend(predicted.cpu().numpy())\n",
177
- "\t\tpredicted_probs.extend(outputs.cpu().numpy())\n",
178
- "\n",
179
- "test_accuracy = 100 * correct / total\n",
180
- "f1 = f1_score(true_labels, predicted_labels)\n",
181
- "auc_roc = roc_auc_score(true_labels, predicted_probs)\n",
182
- "\n",
183
- "print(\n",
184
- "\tf'Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')\n",
185
- "\n",
186
- "# Create DataFrame and Save to CSV\n",
187
- "confusion_data = pd.DataFrame(\n",
188
- "\t{'True': true_labels, 'Predicted': predicted_labels})\n",
189
- "confusion_data.to_csv('confusion_matrix_data.csv', index=False)"
190
- ]
191
- },
192
- {
193
- "cell_type": "code",
194
- "execution_count": 36,
195
- "metadata": {},
196
- "outputs": [
197
- {
198
- "name": "stdout",
199
- "output_type": "stream",
200
- "text": [
201
- " title \\\n",
202
- "0 Trump’s creating just the kind of legal chaos ... \n",
203
- "\n",
204
- " text \n",
205
- "0 Donald Trump’s request to the Supreme Court o... \n",
206
- "outputs: 0.5209237933158875\n"
207
- ]
208
- },
209
- {
210
- "ename": "TypeError",
211
- "evalue": "iteration over a 0-d array",
212
- "output_type": "error",
213
- "traceback": [
214
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
215
- "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
216
- "Cell \u001b[1;32mIn[36], line 30\u001b[0m\n\u001b[0;32m 28\u001b[0m outputs \u001b[38;5;241m=\u001b[39m predict(model, titles, texts, device)\n\u001b[0;32m 29\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutputs: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00moutputs\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 30\u001b[0m \u001b[43mpredictions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mextend\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutputs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcpu\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnumpy\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 31\u001b[0m predicted_labels \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m1\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m p \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0.5\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;241m0\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m predictions]\n\u001b[0;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(predictions)\n",
217
- "\u001b[1;31mTypeError\u001b[0m: iteration over a 0-d array"
218
- ]
219
- }
220
- ],
221
- "source": [
222
- "import numpy as np\n",
223
- "from inference import predict, load_model\n",
224
- "predictions = []\n",
225
- "\n",
226
- "user_title = input(\"Enter title: \")\n",
227
- "user_text = input(\"Enter text: \")\n",
228
- "\n",
229
- "\n",
230
- "# Creating the DataFrame with the user's input\n",
231
- "df = pd.DataFrame({'title': [user_title], 'text': [user_text]})\n",
232
- "print(df.head())\n",
233
- "\n",
234
- "df['title'] = df['title'].apply(preprocess_text)\n",
235
- "df['text'] = df['text'].apply(preprocess_text)\n",
236
- "\n",
237
- "tokenizer = load_tokenizer(f\"./output/version_{version}/tokenizer_{version}.pickle\")\n",
238
- "model = load_model(f\"./output/version_{version}/best_model_{version}.pth\", len(tokenizer.word_index) + 1)\n",
239
- "title = prepare_data(df[\"title\"], tokenizer)\n",
240
- "text = prepare_data(df[\"text\"], tokenizer)\n",
241
- "\n",
242
- "# Create DataLoader\n",
243
- "data_loader = create_data_loader(\n",
244
- " title, text, batch_size=32, shuffle=False)\n",
245
- "model.eval()\n",
246
- "model.to(device)\n",
247
- "for titles, texts in data_loader:\n",
248
- " titles, texts = titles.to(device), texts.to(device)\n",
249
- " outputs = predict(model, titles, texts, device)\n",
250
- " print(f\"outputs: {outputs}\")\n",
251
- " # predictions.extend(outputs.cpu().numpy())\n",
252
- "predicted_labels = [1 if p > 0.5 else 0 for p in predictions]\n",
253
- "\n",
254
- "print(predictions)\n",
255
- "print(predicted_labels)"
256
- ]
257
- },
258
- {
259
- "cell_type": "code",
260
- "execution_count": 18,
261
- "metadata": {},
262
- "outputs": [
263
- {
264
- "name": "stdout",
265
- "output_type": "stream",
266
- "text": [
267
- " title text\n",
268
- "0 hello title hello this is text\n"
269
- ]
270
- }
271
- ],
272
- "source": [
273
- "user_title = input(\"Enter title: \")\n",
274
- "user_text = input(\"Enter text: \")\n",
275
- "\n",
276
- "# Creating the DataFrame with the user's input\n",
277
- "df = pd.DataFrame({'title': [user_title], 'text': [user_text]})\n",
278
- "print(df.head())"
279
- ]
280
- }
281
- ],
282
- "metadata": {
283
- "kernelspec": {
284
- "display_name": "torch",
285
- "language": "python",
286
- "name": "python3"
287
- },
288
- "language_info": {
289
- "codemirror_mode": {
290
- "name": "ipython",
291
- "version": 3
292
- },
293
- "file_extension": ".py",
294
- "mimetype": "text/x-python",
295
- "name": "python",
296
- "nbconvert_exporter": "python",
297
- "pygments_lexer": "ipython3",
298
- "version": "3.10.11"
299
- }
300
- },
301
- "nbformat": 4,
302
- "nbformat_minor": 2
303
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
output/version_9/confusion_matrix_inference_9.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09d71ceae9eb13f58f314dd4c57a5437eab58a7f9419cb5ffa85857cabe9e6b1
3
+ size 22
test.ipynb DELETED
@@ -1,93 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [
8
- {
9
- "name": "stdout",
10
- "output_type": "stream",
11
- "text": [
12
- "GPU is available: True\n"
13
- ]
14
- },
15
- {
16
- "name": "stderr",
17
- "output_type": "stream",
18
- "text": [
19
- "c:\\Users\\kimi\\anaconda3\\envs\\torch\\lib\\site-packages\\spacy\\util.py:910: UserWarning: [W095] Model 'en_core_web_sm' (3.5.0) was trained with spaCy v3.5.0 and may not be 100% compatible with the current version (3.7.2). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n",
20
- " warnings.warn(warn_msg)\n"
21
- ]
22
- }
23
- ],
24
- "source": [
25
- "import torch\n",
26
- "import torch.nn as nn\n",
27
- "import pandas as pd\n",
28
- "from model import LSTMModel\n",
29
- "from preprocessing import preprocess_text\n",
30
- "from data_loader import create_data_loader\n",
31
- "from sklearn.model_selection import train_test_split\n",
32
- "from sklearn.metrics import f1_score, roc_auc_score\n",
33
- "from keras.preprocessing.text import Tokenizer\n",
34
- "from keras_preprocessing.sequence import pad_sequences\n",
35
- "import pickle\n",
36
- "import train as tr\n",
37
- "from torch.utils.data import Dataset, DataLoader\n",
38
- "from data_loader import NewsDataset"
39
- ]
40
- },
41
- {
42
- "cell_type": "code",
43
- "execution_count": null,
44
- "metadata": {},
45
- "outputs": [],
46
- "source": [
47
- "fake_path = './data_1/Fake.csv'\n",
48
- "true_path = './data_1/True.csv'\n",
49
- "\n",
50
- "print(\"No cleaned data found. Cleaning data now...\")\n",
51
- "# Load the datasets\n",
52
- "true_news = pd.read_csv('data_1/True.csv')\n",
53
- "fake_news = pd.read_csv('data_1/Fake.csv')\n",
54
- "\n",
55
- "# Add labels\n",
56
- "true_news['label'] = 1\n",
57
- "fake_news['label'] = 0\n",
58
- "\n",
59
- "# Combine the datasets\n",
60
- "df = pd.concat([true_news, fake_news], ignore_index=True)\n",
61
- "\n",
62
- "# Drop unnecessary columns\n",
63
- "df.drop(columns=['subject', 'date'], inplace=True)\n",
64
- "\n",
65
- "df['title'] = df[0]['title'].apply(preprocess_text)\n",
66
- "df['text'] = df[0]['text'].apply(preprocess_text)\n",
67
- "\n",
68
- "df.to_csv('test.csv', index=False)"
69
- ]
70
- }
71
- ],
72
- "metadata": {
73
- "kernelspec": {
74
- "display_name": "torch",
75
- "language": "python",
76
- "name": "python3"
77
- },
78
- "language_info": {
79
- "codemirror_mode": {
80
- "name": "ipython",
81
- "version": 3
82
- },
83
- "file_extension": ".py",
84
- "mimetype": "text/x-python",
85
- "name": "python",
86
- "nbconvert_exporter": "python",
87
- "pygments_lexer": "ipython3",
88
- "version": "3.10.11"
89
- }
90
- },
91
- "nbformat": 4,
92
- "nbformat_minor": 2
93
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff