import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertModel

# Load the labelled Bangla news dataset (expects "text" and "label" columns).
df = pd.read_csv("final_bn_data.csv")

sentences = df["text"]
labels = df["label"]

# Load the pretrained DistilBERT tokenizer and encoder.
distil_bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distil_bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
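# Note: 'distilbert-base-uncased' is an English-only checkpoint; for Bangla
# text a multilingual checkpoint such as 'distilbert-base-multilingual-cased'
# may be a better fit, with the rest of the pipeline left unchanged.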

# Tokenise every sentence into fixed-length input IDs and attention masks.
max_len = 40
input_ids = []
attention_masks = []

for sent in sentences:
    encoded = distil_bert_tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

input_ids = np.array(input_ids)
attention_masks = np.array(attention_masks)

# 70/30 train/test split; IDs, labels, and masks are split together so they stay aligned.
X_train_input, X_test_input, Y_train_label, Y_test_label, train_mask, test_mask = train_test_split(
    input_ids, labels, attention_masks, test_size=0.3, random_state=42, shuffle=True
)


def create_model():
    # DistilBERT encoder followed by a GRU and a dense classification head.
    input_ids_layer = tf.keras.Input(shape=(max_len,), dtype='int32')
    masks = tf.keras.Input(shape=(max_len,), dtype='int32')
    distil_bert_layer = distil_bert_model(input_ids=input_ids_layer, attention_mask=masks)[0]
    X = tf.keras.layers.GRU(128, return_sequences=True)(distil_bert_layer)
    X = tf.keras.layers.GlobalMaxPool1D()(X)
    X = tf.keras.layers.Dense(64, activation="tanh")(X)
    X = tf.keras.layers.Dense(32)(X)
    X = tf.keras.layers.Dense(16)(X)
    X = tf.keras.layers.Dense(8)(X)
    X = tf.keras.layers.Dense(4)(X)
    X = tf.keras.layers.Dropout(0.5)(X)
    X = tf.keras.layers.Dense(2, activation='softmax')(X)
    model = tf.keras.Model(inputs=[input_ids_layer, masks], outputs=X)
    return model

model = create_model()
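# Optional: print a layer-by-layer summary to sanity-check the architecture.
model.summary()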

# The final layer already applies softmax, so the loss must treat the model
# outputs as probabilities rather than logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

history = model.fit(
    [X_train_input, train_mask],
    Y_train_label,
    batch_size=32,
    epochs=20,
    validation_data=([X_test_input, test_mask], Y_test_label),
)
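
# Optional: re-evaluate the trained model on the held-out split; these are the
# same quantities reported as the final epoch's validation metrics above.
test_loss, test_acc = model.evaluate([X_test_input, test_mask], Y_test_label, batch_size=32)
print(f"Test loss: {test_loss:.4f}  Test accuracy: {test_acc:.4f}")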


def classify_bangla_fake_news(description):
    # Tokenise the input exactly as the training data was tokenised, letting
    # the tokenizer build the attention mask so padding tokens are masked out.
    encoded = distil_bert_tokenizer.encode_plus(
        description,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    input_ids = np.asarray(encoded['input_ids']).reshape(1, -1)
    input_mask = np.asarray(encoded['attention_mask']).reshape(1, -1)
    prediction = model.predict([input_ids, input_mask])[0]
    # Class 0 is treated as fake news and class 1 as real news.
    predicted_class = np.argmax(prediction)
    return "Fake" if predicted_class == 0 else "Real"

iface = gr.Interface(
    fn=classify_bangla_fake_news,
    inputs="text",
    outputs="label",
    title="Bangla Fake News Detection",
    description="Enter a Bangla news article and get a prediction of whether it is real or fake.",
)

iface.launch(inline=False)