Initial commit
- .gitattributes +1 -0
- .gitignore +160 -0
- analysis.ipynb +0 -0
- data_1/Fake.csv +3 -0
- data_1/True.csv +3 -0
- data_2/WELFake_Dataset.csv +3 -0
- data_loader.py +22 -0
- inference.py +39 -0
- inference_main.py +67 -0
- inference_more.ipynb +303 -0
- model.py +20 -0
- output/version_7/best_model_7.pth +3 -0
- output/version_7/cleaned_inference_data_7.csv +3 -0
- output/version_7/cleaned_news_data_7.csv +3 -0
- output/version_7/confusion_matrix_data_7.csv +3 -0
- output/version_7/tokenizer_7.pickle +3 -0
- output/version_7/training_metrics_7.csv +3 -0
- preprocessing.py +46 -0
- test.ipynb +93 -0
- train.py +89 -0
- train_main.py +180 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
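The added rule routes every CSV through Git LFS, matching the zip/zst/tfevents patterns already in the file. Files tracked this way are committed as small pointer stubs rather than raw bytes; each pointer follows the LFS spec's three-line layout (the concrete pointers appear in the data files later in this commit):

version https://git-lfs.github.com/spec/v1
oid sha256:<SHA-256 of the actual file contents>
size <file size in bytes>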
.gitignore
ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
analysis.ipynb
ADDED
The diff for this file is too large to render.
data_1/Fake.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bebf8bcfe95678bf2c732bf413a2ce5f621af0102c82bf08083b2e5d3c693d0c
size 62789876
data_1/True.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba0844414a65dc6ae7402b8eee5306da24b6b56488d6767135af466c7dcb2775
size 53582940
data_2/WELFake_Dataset.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:665331424230fc452e9482c3547a6a199a2c29745ade8d236950d1d105223773
size 245086152
data_loader.py
ADDED
@@ -0,0 +1,22 @@
from torch.utils.data import Dataset, DataLoader
import torch


class NewsDataset(Dataset):
    def __init__(self, titles, texts, labels=None):
        self.titles = titles
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.titles[idx], self.texts[idx], self.labels[idx]
        return self.titles[idx], self.texts[idx]


def create_data_loader(titles, texts, labels=None, batch_size=32, shuffle=False, num_workers=6):
    dataset = NewsDataset(titles, texts, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=True, persistent_workers=True)
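A minimal usage sketch for create_data_loader (illustrative, not part of the commit); the random tensors stand in for padded token-id matrices, and num_workers is lowered to 1 because persistent_workers=True requires at least one worker process:

import torch
from data_loader import create_data_loader

# 8 fake articles, each padded to the project's max_length of 500 tokens
titles = torch.randint(0, 1000, (8, 500))
texts = torch.randint(0, 1000, (8, 500))
labels = torch.randint(0, 2, (8,))

loader = create_data_loader(titles, texts, labels,
                            batch_size=4, shuffle=True, num_workers=1)
for batch_titles, batch_texts, batch_labels in loader:
    print(batch_titles.shape, batch_labels.shape)
    # torch.Size([4, 500]) torch.Size([4])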
inference.py
ADDED
@@ -0,0 +1,39 @@
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from model import LSTMModel


def load_model(model_path, vocab_size):
    model = LSTMModel(vocab_size)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model


def predict(model, titles, texts, device):
    titles, texts = titles.to(device), texts.to(device)
    model.to(device)
    with torch.no_grad():
        outputs = model(titles, texts).squeeze()
    return outputs


def evaluate_model(model, data_loader, device, labels):
    model.to(device)
    model.eval()
    predictions = []
    labels = torch.tensor(labels).to(device)
    for titles, texts in data_loader:
        titles, texts = titles.to(device), texts.to(device)
        outputs = predict(model, titles, texts, device)
        predictions.extend(outputs.cpu().numpy())

    labels = labels.cpu()
    # Calculate metrics
    predicted_labels = [1 if p > 0.5 else 0 for p in predictions]
    accuracy = accuracy_score(labels, predicted_labels)
    f1 = f1_score(labels, predicted_labels)
    auc_roc = roc_auc_score(labels, predictions)

    return accuracy, f1, auc_roc
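One caveat in predict: .squeeze() with no arguments drops every size-1 dimension, so for a batch of exactly one sample the output becomes a 0-d tensor, and outputs.cpu().numpy() then yields a 0-d array that cannot be iterated. This is exactly the "TypeError: iteration over a 0-d array" recorded in inference_more.ipynb below. A hedged sketch of two possible guards, not part of the commit:

# Option 1: squeeze only the trailing feature dimension, preserving the batch dim
outputs = model(titles, texts).squeeze(-1)   # shape (batch,) even when batch == 1

# Option 2: coerce the result to at least 1-d before extending a list
import numpy as np
predictions.extend(np.atleast_1d(outputs.cpu().numpy()))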
inference_main.py
ADDED
@@ -0,0 +1,67 @@
import torch
import pandas as pd
from preprocessing import preprocess_text, load_tokenizer, prepare_data
from data_loader import create_data_loader
from inference import load_model, evaluate_model

version = 7


def run_evaluation(model_path, tokenizer_path, device):
    cleaned_path = f'./output/version_{version}/cleaned_inference_data_{version}.csv'
    # Load data
    try:
        df = pd.read_csv(cleaned_path)
        df.dropna(inplace=True)
        print("Cleaned data found.")
    except FileNotFoundError:  # was a bare `except:`; only a missing cache should trigger re-cleaning
        print("No cleaned data found. Cleaning data now...")
        # Load the datasets
        true_news = pd.read_csv('data_1/True.csv')
        fake_news = pd.read_csv('data_1/Fake.csv')

        # Add labels
        true_news['label'] = 1
        fake_news['label'] = 0

        # Combine the datasets
        df = pd.concat([true_news, fake_news], ignore_index=True)

        # Drop unnecessary columns
        df.drop(columns=['subject', 'date'], inplace=True)

        df['title'] = df['title'].apply(preprocess_text)
        df['text'] = df['text'].apply(preprocess_text)

        df.dropna(inplace=True)  # drop before saving, so the cached CSV holds only clean rows
        df.to_csv(cleaned_path, index=False)
        print("Cleaned data saved.")

    labels = df['label'].values

    # Load tokenizer and model
    tokenizer = load_tokenizer(tokenizer_path)
    model = load_model(model_path, len(tokenizer.word_index) + 1)

    # Prepare data
    titles = prepare_data(df['title'], tokenizer)
    texts = prepare_data(df['text'], tokenizer)

    # Create DataLoader
    data_loader = create_data_loader(
        titles, texts, batch_size=32, shuffle=False)

    # Evaluate
    accuracy, f1, auc_roc = evaluate_model(model, data_loader, device, labels)
    return accuracy, f1, auc_roc


if __name__ == "__main__":
    model_path = f'./output/version_{version}/best_model_{version}.pth'
    tokenizer_path = f'./output/version_{version}/tokenizer_{version}.pickle'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    accuracy, f1, auc_roc = run_evaluation(model_path, tokenizer_path, device)
    print(
        f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')
inference_more.ipynb
ADDED
@@ -0,0 +1,303 @@
(Jupyter notebook, stored as 303 lines of JSON; reconstructed below cell by cell. Kernel: "torch", Python 3.10.11.)

Cell 1 (code, execution count 1):
import torch
import torch.nn as nn
import pandas as pd
from model import LSTMModel
from data_loader import create_data_loader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader
from data_loader import NewsDataset
Output: GPU is available: True

Cell 2 (code, execution count 1):
version = 7

Cell 3 (code, execution count 2):
data_path = './data_2/WELFake_Dataset.csv'
cleaned_path = f'./output/version_{version}/cleaned_news_data_{version}.csv'

# Load data
df = pd.read_csv(cleaned_path)
df.dropna(inplace=True)
print("Cleaned data found.")
Output: Cleaned data found.

Cell 4 (code, execution count 3):
from preprocessing import preprocess_text, load_tokenizer, prepare_data
tokenizer = load_tokenizer(f'./output/version_{version}/tokenizer_{version}.pickle')

Cell 5 (code, execution count 4):
train_val, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(
    train_val, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

Cell 6 (code, execution count 5):
# Tokenize the data
X_train_title = tokenizer.texts_to_sequences(train['title'])
X_train_text = tokenizer.texts_to_sequences(train['text'])
X_val_title = tokenizer.texts_to_sequences(val['title'])
X_val_text = tokenizer.texts_to_sequences(val['text'])
X_test_title = tokenizer.texts_to_sequences(test['title'])
X_test_text = tokenizer.texts_to_sequences(test['text'])

# Padding sequences
max_length = 500
X_train_title = pad_sequences(X_train_title, maxlen=max_length)
X_train_text = pad_sequences(X_train_text, maxlen=max_length)
X_val_title = pad_sequences(X_val_title, maxlen=max_length)
X_val_text = pad_sequences(X_val_text, maxlen=max_length)
X_test_title = pad_sequences(X_test_title, maxlen=max_length)
X_test_text = pad_sequences(X_test_text, maxlen=max_length)

Cell 7 (code, execution count 6):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Cell 8 (code, execution count 7):
model = LSTMModel(len(tokenizer.word_index) + 1).to(device)

# Convert data to PyTorch tensors
train_data = NewsDataset(torch.tensor(X_train_title), torch.tensor(
    X_train_text), torch.tensor(train['label'].values))
val_data = NewsDataset(torch.tensor(X_val_title), torch.tensor(
    X_val_text), torch.tensor(val['label'].values))
test_data = NewsDataset(torch.tensor(X_test_title), torch.tensor(
    X_test_text), torch.tensor(test['label'].values))

train_loader = DataLoader(train_data, batch_size=32,
                          shuffle=True, num_workers=6, pin_memory=True, persistent_workers=True)
val_loader = DataLoader(val_data, batch_size=32,
                        shuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)
test_loader = DataLoader(test_data, batch_size=32,
                         shuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Cell 9 (code, execution count 8):
model.load_state_dict(torch.load(f"./output/version_{version}/best_model_{version}.pth", map_location=device))

# Testing
model.eval()
true_labels = []
predicted_labels = []
predicted_probs = []

with torch.no_grad():
    correct = 0
    total = 0
    for titles, texts, labels in test_loader:
        titles, texts, labels = titles.to(device), texts.to(
            device), labels.to(device).float()
        outputs = model(titles, texts).squeeze()

        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())
        predicted_probs.extend(outputs.cpu().numpy())

test_accuracy = 100 * correct / total
f1 = f1_score(true_labels, predicted_labels)
auc_roc = roc_auc_score(true_labels, predicted_probs)

print(
    f'Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')

# Create DataFrame and Save to CSV
confusion_data = pd.DataFrame(
    {'True': true_labels, 'Predicted': predicted_labels})
confusion_data.to_csv('confusion_matrix_data.csv', index=False)
Output: Test Accuracy: 98.70%, F1 Score: 0.9868, AUC-ROC: 0.9984

Cell 10 (code, execution count 36):
import numpy as np
from inference import predict, load_model
predictions = []

user_title = input("Enter title: ")
user_text = input("Enter text: ")


# Creating the DataFrame with the user's input
df = pd.DataFrame({'title': [user_title], 'text': [user_text]})
print(df.head())

df['title'] = df['title'].apply(preprocess_text)
df['text'] = df['text'].apply(preprocess_text)

tokenizer = load_tokenizer(f"./output/version_{version}/tokenizer_{version}.pickle")
model = load_model(f"./output/version_{version}/best_model_{version}.pth", len(tokenizer.word_index) + 1)
title = prepare_data(df["title"], tokenizer)
text = prepare_data(df["text"], tokenizer)

# Create DataLoader
data_loader = create_data_loader(
    title, text, batch_size=32, shuffle=False)
model.eval()
model.to(device)
for titles, texts in data_loader:
    titles, texts = titles.to(device), texts.to(device)
    outputs = predict(model, titles, texts, device)
    print(f"outputs: {outputs}")
    # predictions.extend(outputs.cpu().numpy())
predicted_labels = [1 if p > 0.5 else 0 for p in predictions]

print(predictions)
print(predicted_labels)
Output (stdout): the one-row DataFrame (a Trump / Supreme Court article), then:
outputs: 0.5209237933158875
Error (recorded traceback, from a run where the predictions.extend(...) line was not yet commented out):
TypeError: iteration over a 0-d array, raised at predictions.extend(outputs.cpu().numpy())

Cell 11 (code, execution count 18):
user_title = input("Enter title: ")
user_text = input("Enter text: ")

# Creating the DataFrame with the user's input
df = pd.DataFrame({'title': [user_title], 'text': [user_text]})
print(df.head())
Output:
         title                text
0  hello title  hello this is text
model.py
ADDED
@@ -0,0 +1,20 @@
import torch
import torch.nn as nn


class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_size=256, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, title, text):
        title_emb = self.embedding(title)
        text_emb = self.embedding(text)
        combined = torch.cat((title_emb, text_emb), dim=1)
        output, (hidden, _) = self.lstm(combined)
        out = self.fc(hidden[-1])
        return torch.sigmoid(out)
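A quick smoke test (illustrative, not in the commit) clarifies the shapes: the two embedded sequences are concatenated along the time axis, so the LSTM sees one sequence of 2 x 500 steps, and hidden[-1] gives one 256-dimensional vector per sample:

import torch
from model import LSTMModel

model = LSTMModel(vocab_size=1000)
titles = torch.randint(0, 1000, (4, 500))  # batch of 4 padded titles
texts = torch.randint(0, 1000, (4, 500))   # batch of 4 padded bodies
probs = model(titles, texts)
print(probs.shape)  # torch.Size([4, 1]) -- sigmoid probabilities of label 1 ("true")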
output/version_7/best_model_7.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d5b5750829a8f672dcbd297143eddbf0621a024930055dd7f7363db34ac6e374
size 101492472
output/version_7/cleaned_inference_data_7.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:13c4f9369494fb6a24ca2c1415027736e0814b576854b0187a4dd93c5f6f344b
size 74695505
output/version_7/cleaned_news_data_7.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f7b95458a091aeaede51ebb58a6d039e21dbc8d50a78439563d8d2b6149c1150
size 154624396
output/version_7/confusion_matrix_data_7.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ca5e75060519cb2c81af94130fd48d3db06689baf6f32eb434625ab22aa168f
size 127312
output/version_7/tokenizer_7.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e129ef007dd8405eefe6ed17a5737e368f2066bef28e933c469a180499994a56
size 8812251
output/version_7/training_metrics_7.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:273d33e9f9cdd11a57149d86f8ded1ee222cab35508628fa05dbb3f31fae20cb
size 1252
preprocessing.py
ADDED
@@ -0,0 +1,46 @@
import re
import spacy
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle

spacy.prefer_gpu()
print("GPU is available:", spacy.prefer_gpu())

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')


def preprocess_text(text):
    # Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
    text = re.sub(r'\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-', '', text)

    # Remove patterns like "Featured image via author name / image place"
    text = re.sub(r'Featured image via .+ / .+', '', text)

    # Process text with spaCy
    doc = nlp(text)

    # Improved lemmatization
    lemmatized_text = []
    for token in doc:
        # Preserve named entities in their original form
        if token.ent_type_:
            lemmatized_text.append(token.text)
        # Lemmatize other tokens and exclude non-alpha tokens if necessary
        elif token.is_alpha and not token.is_stop:
            lemmatized_text.append(token.lemma_.lower())

    return ' '.join(lemmatized_text)


def load_tokenizer(tokenizer_path):
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer


def prepare_data(texts, tokenizer, max_length=500):
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length)
    return padded
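An illustrative call (not in the commit; the exact tokens depend on the installed spaCy model, so the behavior described below is the expected one rather than a recorded output):

from preprocessing import preprocess_text

s = "WASHINGTON (Reuters) - President Trump signed the order on Tuesday."
print(preprocess_text(s))
# The "WASHINGTON (Reuters) -" dateline is stripped by the first regex;
# stop words are dropped, the remaining tokens are lemmatized and lowercased,
# and named entities (e.g. "Trump", "Tuesday") are kept in their original form.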
test.ipynb
ADDED
@@ -0,0 +1,93 @@
(Jupyter notebook, stored as 93 lines of JSON; reconstructed below cell by cell. Kernel: "torch", Python 3.10.11.)

Cell 1 (code, execution count 1):
import torch
import torch.nn as nn
import pandas as pd
from model import LSTMModel
from preprocessing import preprocess_text
from data_loader import create_data_loader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle
import train as tr
from torch.utils.data import Dataset, DataLoader
from data_loader import NewsDataset
Output: GPU is available: True
Warning (stderr): spaCy [W095]: Model 'en_core_web_sm' (3.5.0) was trained with spaCy v3.5.0 and may not be 100% compatible with the current version (3.7.2). For available updates, run: python -m spacy validate

Cell 2 (code, never executed):
fake_path = './data_1/Fake.csv'
true_path = './data_1/True.csv'

print("No cleaned data found. Cleaning data now...")
# Load the datasets
true_news = pd.read_csv('data_1/True.csv')
fake_news = pd.read_csv('data_1/Fake.csv')

# Add labels
true_news['label'] = 1
fake_news['label'] = 0

# Combine the datasets
df = pd.concat([true_news, fake_news], ignore_index=True)

# Drop unnecessary columns
df.drop(columns=['subject', 'date'], inplace=True)

df['title'] = df['title'].apply(preprocess_text)  # fixed: was df[0]['title'], which would raise KeyError
df['text'] = df['text'].apply(preprocess_text)    # fixed: was df[0]['text']

df.to_csv('test.csv', index=False)
train.py
ADDED
@@ -0,0 +1,89 @@
import torch
import pandas as pd
import time
from torch.nn.utils import clip_grad_norm_


def train(model, train_loader, val_loader, criterion, optimizer, epochs, device, version, max_grad_norm=1.0, early_stopping_patience=5, early_stopping_delta=0.001):
    best_accuracy = 0.0
    best_model_path = f'./output/version_{version}/best_model_{version}.pth'
    best_epoch = 0
    early_stopping_counter = 0
    total_batches = len(train_loader)
    metrics = {
        'epoch': [], 'train_loss': [], 'val_loss': [], 'train_accuracy': [], 'val_accuracy': []
    }

    for epoch in range(epochs):
        model.train()
        total_loss, train_correct, train_total = 0, 0, 0
        for batch_idx, (titles, texts, labels) in enumerate(train_loader):
            start_time = time.time()  # Start time for the batch

            titles, texts, labels = titles.to(device), texts.to(
                device), labels.to(device).float()

            # Forward pass
            outputs = model(titles, texts).squeeze()
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            if max_grad_norm:
                clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
            optimizer.step()

            total_loss += loss.item()
            train_pred = (outputs > 0.5).float()
            train_correct += (train_pred == labels).sum().item()
            train_total += labels.size(0)

            # Calculate and print batch processing time
            batch_time = time.time() - start_time
            print(
                f'Epoch: {epoch+1}, Batch: {batch_idx+1}/{total_batches}, Batch Processing Time: {batch_time:.4f} seconds')

        train_accuracy = 100 * train_correct / train_total
        metrics['train_loss'].append(total_loss / len(train_loader))
        metrics['train_accuracy'].append(train_accuracy)

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for titles, texts, labels in val_loader:
                titles, texts, labels = titles.to(device), texts.to(
                    device), labels.to(device).float()
                outputs = model(titles, texts).squeeze()
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                predicted = (outputs > 0.5).float()
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_accuracy = 100 * val_correct / val_total
        metrics['val_loss'].append(val_loss / len(val_loader))
        metrics['val_accuracy'].append(val_accuracy)
        metrics['epoch'].append(epoch + 1)

        # Early stopping logic
        if val_accuracy > best_accuracy + early_stopping_delta:
            best_accuracy = val_accuracy
            early_stopping_counter = 0
            best_epoch = epoch + 1
            torch.save(model.state_dict(), best_model_path)
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= early_stopping_patience:
            print(f"Early stopping triggered at epoch {epoch + 1}")
            break

        print(
            f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(train_loader):.4f}, Validation Accuracy: {val_accuracy:.2f}%')

    pd.DataFrame(metrics).to_csv(
        f'./output/version_{version}/training_metrics_{version}.csv', index=False)

    return model, best_accuracy, best_epoch
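The metrics dict written at the end of train() fixes the CSV schema, so reading the log back is straightforward; a small sketch (assuming a completed run for this version):

import pandas as pd

m = pd.read_csv('./output/version_7/training_metrics_7.csv')
print(m.columns.tolist())
# ['epoch', 'train_loss', 'val_loss', 'train_accuracy', 'val_accuracy']
print(m.tail(1))  # final epoch's losses and accuracies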
train_main.py
ADDED
@@ -0,0 +1,180 @@
import torch
import torch.nn as nn
import pandas as pd
from model import LSTMModel
from preprocessing import preprocess_text
from data_loader import create_data_loader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle
import train as tr
from torch.utils.data import Dataset, DataLoader
from data_loader import NewsDataset

version = 7

if __name__ == "__main__":

    # fake_path = './data_1/Fake.csv'
    # true_path = './data_1/True.csv'
    # cleaned_path = './cleaned_news_data.csv'
    # # Load data
    # try:
    #     df = pd.read_csv(cleaned_path)
    #     df.dropna(inplace=True)
    #     print("Cleaned data found.")
    # except:
    #     print("No cleaned data found. Cleaning data now...")
    #     # Load the datasets
    #     true_news = pd.read_csv('data_1/True.csv')
    #     fake_news = pd.read_csv('data_1/Fake.csv')

    #     # Add labels
    #     true_news['label'] = 1
    #     fake_news['label'] = 0

    #     # Combine the datasets
    #     df = pd.concat([true_news, fake_news], ignore_index=True)

    #     # Drop unnecessary columns
    #     df.drop(columns=['subject', 'date'], inplace=True)

    #     df['title'] = df['title'].apply(preprocess_text)
    #     df['text'] = df['text'].apply(preprocess_text)

    #     df.to_csv('cleaned_news_data.csv', index=False)
    #     df.dropna(inplace=True)

    data_path = './data_2/WELFake_Dataset.csv'
    cleaned_path = f'./output/version_{version}/cleaned_news_data_{version}.csv'
    # Load data
    try:
        df = pd.read_csv(cleaned_path)
        df.dropna(inplace=True)
        print("Cleaned data found.")
    except FileNotFoundError:  # was a bare `except:`; only a missing cache should trigger re-cleaning
        print("No cleaned data found. Cleaning data now...")
        df = pd.read_csv(data_path)

        # Drop index
        df.drop(df.columns[0], axis=1, inplace=True)
        df.dropna(inplace=True)

        # Swapping labels around since it originally is the opposite
        df['label'] = df['label'].map({0: 1, 1: 0})

        df['title'] = df['title'].apply(preprocess_text)
        df['text'] = df['text'].apply(preprocess_text)

        df.to_csv(cleaned_path, index=False)
        print("Cleaned data saved.")

    # Splitting the data
    train_val, test = train_test_split(df, test_size=0.2, random_state=42)
    train, val = train_test_split(
        train_val, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

    # Initialize the tokenizer
    tokenizer = Tokenizer()

    # Fit the tokenizer on the training data
    # (note: adding the two Series concatenates title and text row-wise,
    #  so the tokenizer is fitted on one combined string per article)
    tokenizer.fit_on_texts(train['title'] + train['text'])

    with open(f'./output/version_{version}/tokenizer_{version}.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Tokenize the data
    X_train_title = tokenizer.texts_to_sequences(train['title'])
    X_train_text = tokenizer.texts_to_sequences(train['text'])
    X_val_title = tokenizer.texts_to_sequences(val['title'])
    X_val_text = tokenizer.texts_to_sequences(val['text'])
    X_test_title = tokenizer.texts_to_sequences(test['title'])
    X_test_text = tokenizer.texts_to_sequences(test['text'])

    # Padding sequences
    max_length = 500
    X_train_title = pad_sequences(X_train_title, maxlen=max_length)
    X_train_text = pad_sequences(X_train_text, maxlen=max_length)
    X_val_title = pad_sequences(X_val_title, maxlen=max_length)
    X_val_text = pad_sequences(X_val_text, maxlen=max_length)
    X_test_title = pad_sequences(X_test_title, maxlen=max_length)
    X_test_text = pad_sequences(X_test_text, maxlen=max_length)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    model = LSTMModel(len(tokenizer.word_index) + 1).to(device)

    # Convert data to PyTorch tensors
    train_data = NewsDataset(torch.tensor(X_train_title), torch.tensor(
        X_train_text), torch.tensor(train['label'].values))
    val_data = NewsDataset(torch.tensor(X_val_title), torch.tensor(
        X_val_text), torch.tensor(val['label'].values))
    test_data = NewsDataset(torch.tensor(X_test_title), torch.tensor(
        X_test_text), torch.tensor(test['label'].values))

    train_loader = DataLoader(train_data, batch_size=32,
                              shuffle=True, num_workers=6, pin_memory=True, persistent_workers=True)
    val_loader = DataLoader(val_data, batch_size=32,
                            shuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)
    test_loader = DataLoader(test_data, batch_size=32,
                             shuffle=False, num_workers=6, pin_memory=True, persistent_workers=True)

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    trained_model, best_accuracy, best_epoch = tr.train(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        version=version,
        epochs=50,
        device=device,
        max_grad_norm=1.0,
        early_stopping_patience=3,
        early_stopping_delta=0.001
    )

    print(f'Best model was saved at epoch: {best_epoch}')

    # Load the best model before testing
    best_model_path = f'./output/version_{version}/best_model_{version}.pth'
    model.load_state_dict(torch.load(best_model_path, map_location=device))

    # Testing
    model.eval()
    true_labels = []
    predicted_labels = []
    predicted_probs = []

    with torch.no_grad():
        correct = 0
        total = 0
        for titles, texts, labels in test_loader:
            titles, texts, labels = titles.to(device), texts.to(
                device), labels.to(device).float()
            outputs = model(titles, texts).squeeze()

            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())
            predicted_probs.extend(outputs.cpu().numpy())

    test_accuracy = 100 * correct / total
    f1 = f1_score(true_labels, predicted_labels)
    auc_roc = roc_auc_score(true_labels, predicted_probs)

    print(
        f'Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}')

    # Create DataFrame and Save to CSV
    confusion_data = pd.DataFrame(
        {'True': true_labels, 'Predicted': predicted_labels})
    confusion_data.to_csv(
        f'./output/version_{version}/confusion_matrix_data_{version}.csv', index=False)
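A note on the loss pairing in train_main.py: LSTMModel.forward already applies torch.sigmoid, so nn.BCELoss() is the matching criterion. A numerically steadier variant (an assumption about an alternative design, not what this commit does) would return raw logits from the model and swap in the fused loss:

import torch.nn as nn

# only valid if LSTMModel.forward is changed to `return out` (no sigmoid)
criterion = nn.BCEWithLogitsLoss()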