In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.callbacks import ReduceLROnPlateau, TensorBoard, EarlyStopping

# load data
df = pd.read_csv('combined_data.csv')

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['title'])
X = tokenizer.texts_to_sequences(df['title'])
X = pad_sequences(X)

# Encode the target variable
encoder = LabelEncoder()
y = encoder.fit_transform(df['source'])
y = to_categorical(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 128))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(len(encoder.classes_), activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Learning rate scheduler
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)

# TensorBoard callback for logging
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with callbacks
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, 
 callbacks=[lr_scheduler, tensorboard_callback, early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

# Predictions and evaluation
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

print("\nClassification Report:")
print(classification_report(y_test_classes, y_pred_classes))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_classes, y_pred_classes))


Epoch 1/10
[1m7964/7964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 48ms/step - accuracy: 0.7637 - loss: 0.4815 - val_accuracy: 0.8195 - val_loss: 0.3929 - learning_rate: 0.0010
Epoch 2/10
[1m7964/7964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 45ms/step - accuracy: 0.8561 - loss: 0.3267 - val_accuracy: 0.8256 - val_loss: 0.3854 - learning_rate: 0.0010
Epoch 3/10
[1m7964/7964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 47ms/step - accuracy: 0.8937 - loss: 0.2503 - val_accuracy: 0.8250 - val_loss: 0.4444 - learning_rate: 0.0010
Epoch 4/10
[1m7964/7964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m377s[0m 47ms/step - accuracy: 0.9269 - loss: 0.1794 - val_accuracy: 0.8173 - val_loss: 0.4580 - learning_rate: 0.0010
Epoch 5/10
[1m7964/7964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 48ms/step - accuracy: 0.9496 - loss: 0.1284 - val_accuracy: 0.8147 - val_loss: 0.5704 - learning_rate: 0.0010
[1m2213/2213[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [6]:
# save model
model.save('news_classifier.h5')

# save tokenizer
import pickle
with open('tokenizer.pickle', 'wb') as handle:
 pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
# save encoder
with open('encoder.pickle', 'wb') as handle:
 pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)




In [14]:
# deploy the model
# user give the title and the model will predict the source
# Load the model and tokenizer
from tensorflow.keras.models import load_model
import pickle

# Load the tokenizer
with open('tokenizer.pickle', 'rb') as handle:
 tokenizer = pickle.load(handle)

# Load the encoder
with open('encoder.pickle', 'rb') as handle:
 encoder = pickle.load(handle)


def predict_source(title):
 # Load the model
 model = load_model('news_classifier.h5')
 # Tokenize the input
 X = tokenizer.texts_to_sequences([title])
 X = pad_sequences(X)
 # Predict the source
 y_pred = model.predict(X)
 source = encoder.inverse_transform(y_pred.argmax(axis=1))
 return source[0]

In [26]:
# Test the function
# user input
title = input("Enter the title: ")
source = predict_source(title)
print(f"Predicted Source: {source}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
Predicted Source: foxnews
