{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m383s\u001b[0m 48ms/step - accuracy: 0.7637 - loss: 0.4815 - val_accuracy: 0.8195 - val_loss: 0.3929 - learning_rate: 0.0010\n", "Epoch 2/10\n", "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m360s\u001b[0m 45ms/step - accuracy: 0.8561 - loss: 0.3267 - val_accuracy: 0.8256 - val_loss: 0.3854 - learning_rate: 0.0010\n", "Epoch 3/10\n", "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m373s\u001b[0m 47ms/step - accuracy: 0.8937 - loss: 0.2503 - val_accuracy: 0.8250 - val_loss: 0.4444 - learning_rate: 0.0010\n", "Epoch 4/10\n", "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m377s\u001b[0m 47ms/step - accuracy: 0.9269 - loss: 0.1794 - val_accuracy: 0.8173 - val_loss: 0.4580 - learning_rate: 0.0010\n", "Epoch 5/10\n", "\u001b[1m7964/7964\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m385s\u001b[0m 48ms/step - accuracy: 0.9496 - loss: 0.1284 - val_accuracy: 0.8147 - val_loss: 0.5704 - learning_rate: 0.0010\n", "\u001b[1m2213/2213\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m20s\u001b[0m 9ms/step - accuracy: 0.8228 - loss: 0.3848\n", "Test Accuracy: 0.8214734792709351\n", "\u001b[1m2213/2213\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m24s\u001b[0m 11ms/step\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.84 0.90 0.87 46733\n", " 1 0.77 0.68 0.72 24052\n", "\n", " accuracy 0.82 70785\n", " macro avg 0.81 0.79 0.79 70785\n", "weighted avg 0.82 0.82 0.82 70785\n", "\n", "\n", "Confusion Matrix:\n", "[[41892 4841]\n", " [ 7796 16256]]\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n", "from tensorflow.keras.utils import to_categorical\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "from tensorflow.keras.callbacks import ReduceLROnPlateau, TensorBoard, EarlyStopping\n", "\n", "# load data\n", "df = pd.read_csv('combined_data.csv')\n", "\n", "# Tokenize the text\n", "tokenizer = Tokenizer()\n", "tokenizer.fit_on_texts(df['title'])\n", "X = tokenizer.texts_to_sequences(df['title'])\n", "X = pad_sequences(X)\n", "\n", "# Encode the target variable\n", "encoder = LabelEncoder()\n", "y = encoder.fit_transform(df['source'])\n", "y = to_categorical(y)\n", "\n", "# Split the data\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", "\n", "# Build the LSTM model\n", "model = Sequential()\n", "model.add(Embedding(len(tokenizer.word_index) + 1, 128))\n", "model.add(LSTM(128, return_sequences=True))\n", "model.add(Dropout(0.5))\n", "model.add(LSTM(64))\n", "model.add(Dropout(0.5))\n", "model.add(Dense(len(encoder.classes_), activation='softmax'))\n", "model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n", "\n", "# Learning rate scheduler\n", "lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)\n", "\n", "# TensorBoard callback for logging\n", "tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)\n", "\n", "# Early stopping to prevent overfitting\n", "early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)\n", "\n", "# Train the model with callbacks\n", "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, \n", " callbacks=[lr_scheduler, tensorboard_callback, early_stopping])\n", "\n", "# Evaluate the model\n", "loss, accuracy = model.evaluate(X_test, y_test)\n", "print(f\"Test Accuracy: {accuracy}\")\n", "\n", "# Predictions and evaluation\n", "y_pred = model.predict(X_test)\n", "y_pred_classes = y_pred.argmax(axis=1)\n", "y_test_classes = y_test.argmax(axis=1)\n", "\n", "print(\"\\nClassification Report:\")\n", "print(classification_report(y_test_classes, y_pred_classes))\n", "\n", "print(\"\\nConfusion Matrix:\")\n", "print(confusion_matrix(y_test_classes, y_pred_classes))\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n" ] } ], "source": [ "# save model\n", "model.save('news_classifier.h5')\n", "\n", "# save tokenizer\n", "import pickle\n", "with open('tokenizer.pickle', 'wb') as handle:\n", " pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", " \n", "# save encoder\n", "with open('encoder.pickle', 'wb') as handle:\n", " pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# deploy the model\n", "# user give the title and the model will predict the source\n", "# Load the model and tokenizer\n", "from tensorflow.keras.models import load_model\n", "import pickle\n", "\n", "# Load the tokenizer\n", "with open('tokenizer.pickle', 'rb') as handle:\n", " tokenizer = pickle.load(handle)\n", "\n", "# Load the encoder\n", "with open('encoder.pickle', 'rb') as handle:\n", " encoder = pickle.load(handle)\n", "\n", "\n", "def predict_source(title):\n", " # Load the model\n", " model = load_model('news_classifier.h5')\n", " # Tokenize the input\n", " X = tokenizer.texts_to_sequences([title])\n", " X = pad_sequences(X)\n", " # Predict the source\n", " y_pred = model.predict(X)\n", " source = encoder.inverse_transform(y_pred.argmax(axis=1))\n", " return source[0]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 109ms/step\n", "Predicted Source: foxnews\n" ] } ], "source": [ "# Test the function\n", "# user input\n", "title = input(\"Enter the title: \")\n", "source = predict_source(title)\n", "print(f\"Predicted Source: {source}\")" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }