import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import spacy
from tqdm import tqdm
import gc
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from fastai.vision.all import *
from fastai.text.all import *
from torch.utils.data import Dataset
from DeBERTaV3 import ModelLoader

model_lst = ["DeBERTaV3", "BiLSTM"]
# BiLSTM Model
## Download the SpaCy model
os.system("python -m spacy download en_core_web_lg")

## Load the four ensemble members
model_1 = tf.keras.models.load_model("BiLSTM/model_1.h5")
model_2 = tf.keras.models.load_model("BiLSTM/model_2.h5")
model_3 = tf.keras.models.load_model("BiLSTM/model_3.h5")
model_4 = tf.keras.models.load_model("BiLSTM/model_4.h5")

## Load the word-to-index dictionary
with open('BiLSTM/word_dict.pkl', 'rb') as f:
    word_dict = pickle.load(f)

## Load the SpaCy model (only tokenization is needed, so disable the other pipes)
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
# Override the IS_STOP flag so token.is_stop reflects SpaCy's English stop-word list
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
## Tokenizer
def preprocess_text(text):
    """Preprocess the input text using SpaCy and return word indices."""
    docs = nlp.pipe([text], n_process=1)
    word_seq = []
    for doc in docs:
        for token in doc:
            if token.pos_ != "PUNCT":
                if token.text not in word_dict:
                    word_dict[token.text] = 0  # OOV index
                word_seq.append(word_dict[token.text])
    return word_seq
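
# Illustrative check of the tokenizer above (a hypothetical helper, not called
# by the app; assumes `nlp` and `word_dict` loaded successfully).
def _demo_preprocess():
    """Show that preprocess_text maps tokens to ints, with unknown words as 0."""
    seq = preprocess_text("How do you train a pigeon?")
    print(seq)  # e.g. a list of ints, one per non-punctuation token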
# DeBERTaV3 Model
## Load the tokenizer (the fine-tuned model itself comes from ModelLoader below)
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
class QuestionDataset(Dataset):
    """Tokenized question dataset, as expected by the fastai learner."""
    def __init__(self, X, y, tokenizer):
        self.text = X
        self.targets = y
        self.tok = tokenizer

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        targ = self.targets[idx]
        return self.tok(text, padding='max_length',
                        truncation=True,
                        max_length=30,
                        return_tensors="pt")["input_ids"][0], tensor(targ)

    def new_empty(self):
        return QuestionDataset([], [], self.tok)
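
# Illustrative usage of QuestionDataset (a hypothetical helper, not called by
# the app): each item is an (input_ids, target) pair of tensors.
def _demo_dataset():
    ds = QuestionDataset(["Is this question sincere?"], [0], tokenizer)
    x, y = ds[0]
    print(x.shape, y)  # torch.Size([30]) tensor(0)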
model_loader = ModelLoader()
learner = model_loader.get_learner()
print("Learner loaded successfully.")
## Inference-only dataset: wraps pre-tokenized tensors with a dummy label
class TestDS:
    def __init__(self, tensors):
        self.tensors = tensors

    def __len__(self):
        return len(self.tensors)

    def __getitem__(self, idx):
        t = self.tensors[idx]
        return t, tensor(0)  # the label is unused at inference time
class DeBERTaV3Model:
    def __init__(self):
        pass

    def predict(self, text):
        # Tokenize the text into a padded tensor of input ids
        test_tensor = tokenizer(text, padding="max_length", truncation=True, max_length=55, return_tensors="pt")["input_ids"]
        test_dl = DataLoader(TestDS(test_tensor), bs=128)
        # Get predictions and convert the logits to an insincerity probability
        preds = learner.get_preds(dl=test_dl)
        prob = F.softmax(preds[0], dim=1)[:, 1].item()
        label = "Insincere" if prob > 0.4878 else "Sincere"
        probs = {
            "Probability": prob,
            # .tolist() keeps the dict JSON-serializable for the gr.JSON output
            "Sequence": test_tensor[test_tensor != 0].tolist(),
            "Decoded Sequence": tokenizer.decode(test_tensor[test_tensor != 0], skip_special_tokens=True)
        }
        return label, probs
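
# Illustrative call (not executed at import; assumes `learner` loaded above):
#   label, details = DeBERTaV3Model().predict("How do pigeons navigate home?")
#   label                  -> "Sincere" or "Insincere"
#   details["Probability"] -> insincerity probability as a float in [0, 1]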
class BiLSTMModel:
    def __init__(self):
        pass

    def predict(self, text):
        # Preprocess the text and pad it to the fixed input length
        seq = preprocess_text(text)
        padded_seq = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=55)
        BATCH_SIZE = 512
        # Get a weighted prediction from each ensemble member
        pred1 = 0.15 * np.squeeze(model_1.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        pred2 = 0.35 * np.squeeze(model_2.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        pred3 = 0.15 * np.squeeze(model_3.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        pred4 = 0.35 * np.squeeze(model_4.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        # Combine the predictions (the weights sum to 1.0)
        avg_pred = pred1 + pred2 + pred3 + pred4
        label = "Insincere" if avg_pred > 0.35 else "Sincere"
        probs = {
            "Probability": float(avg_pred),
            # Note: these are the weighted contributions, not raw model outputs
            "Model Probabilities": {"Model 1": float(pred1), "Model 2": float(pred2), "Model 3": float(pred3), "Model 4": float(pred4)},
            "Sequence": seq,
            # Reverse lookup: recover each word from its index in word_dict
            "Decoded Sequence": " ".join([list(word_dict.keys())[list(word_dict.values()).index(i)] for i in seq])
        }
        return label, probs
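
# A possible optimization (a hedged sketch, not part of the original app): the
# reverse lookup above rebuilds two lists per token; building the inverse
# mapping once is equivalent up to which word wins a shared index (e.g. 0).
#   inv_word_dict = {v: k for k, v in word_dict.items()}
#   decoded = " ".join(inv_word_dict[i] for i in seq)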
class QuestionClassifier:
    """Main class to manage the models."""
    def __init__(self):
        self.models = {
            "DeBERTaV3": DeBERTaV3Model(),
            "BiLSTM": BiLSTMModel()
        }

    def classify(self, model_name, text):
        return self.models[model_name].predict(text)
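
# Illustrative usage (a hypothetical helper, not called by the app; assumes
# all models above loaded successfully):
def _demo_classifier():
    classifier = QuestionClassifier()
    label, probs = classifier.classify("BiLSTM", "How do I learn Python?")
    print(label, probs["Probability"])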
# Example questions
examples = [
    ["DeBERTaV3", "How do you train a pigeon to send messages?"],
    ["DeBERTaV3", "Is USA a shithole country owing to a shithole president?"],
    ["DeBERTaV3", "Why is Indian education a total bullshit?"],
    ["DeBERTaV3", "Which person has given the least f**ks and still turned out successful?"],
    ["BiLSTM", "How do you train a pigeon to send messages?"],
    ["BiLSTM", "Is USA a shithole country owing to a shithole president?"],
    ["BiLSTM", "Why is Indian education a total bullshit?"],
    ["BiLSTM", "Which person has given the least f**ks and still turned out successful?"]
]
def create_gradio_interface():
    classifier = QuestionClassifier()

    def classify_question(model_name, text):
        return classifier.classify(model_name, text)

    interface = gr.Interface(
        fn=classify_question,
        inputs=[
            gr.Dropdown(choices=["DeBERTaV3", "BiLSTM"], label="Select Model", value="BiLSTM"),
            gr.Textbox(lines=2, placeholder="Enter your question here...", label="Input Question")
        ],
        outputs=[
            gr.Textbox(label="Prediction"),
            gr.JSON(label="Prediction Details")
        ],
        title="Quora Insincere Questions Classifier",
        examples=examples,
        description="Enter a question to classify it as sincere or insincere, or select an example question below."
    )
    interface.launch()

if __name__ == "__main__":
    create_gradio_interface()