import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import spacy
from tqdm import tqdm
import gc
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from fastai.vision.all import *
from fastai.text.all import *
from torch.utils.data import Dataset
from DeBERTaV3 import ModelLoader
model_lst = ["DeBERTaV3", "BiLSTM"]
# BiLSTM Model
## Download the SpaCy model
os.system("python -m spacy download en_core_web_lg")
## Load models
model_1 = tf.keras.models.load_model("BiLSTM/model_1.h5")
model_2 = tf.keras.models.load_model("BiLSTM/model_2.h5")
model_3 = tf.keras.models.load_model("BiLSTM/model_3.h5")
model_4 = tf.keras.models.load_model("BiLSTM/model_4.h5")
## Load dictionaries
with open('BiLSTM/word_dict.pkl', 'rb') as f:
    word_dict = pickle.load(f)
## Load SpaCy NLP model
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
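## Re-flag IS_STOP case-insensitively, so capitalized stop words are also flagged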
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
## Text preprocessing
def preprocess_text(text):
    """Preprocess the input text with SpaCy and return word indices."""
    docs = nlp.pipe([text], n_process=1)
    word_seq = []
    for doc in docs:
        for token in doc:
            # With the tagger disabled above, token.pos_ may be unset,
            # in which case punctuation is not actually filtered out.
            if token.pos_ != "PUNCT":
                if token.text not in word_dict:
                    word_dict[token.text] = 0  # OOV index
                word_seq.append(word_dict[token.text])
    return word_seq
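## Illustrative call (a sketch only; actual indices depend on the pickled
## word_dict, and unseen tokens fall back to index 0):
##   preprocess_text("How do you train a pigeon?")
##   -> e.g. [23, 45, 17, 892, 6, 0]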
# DeBERTaV3 Model
## Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
class QuestionDataset(Dataset):
    def __init__(self, X, y, tokenizer):
        self.text = X
        self.targets = y
        self.tok = tokenizer

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        targ = self.targets[idx]
        return self.tok(text, padding='max_length',
                        truncation=True,
                        max_length=30,
                        return_tensors="pt")["input_ids"][0], tensor(targ)

    def new_empty(self):
        return QuestionDataset([], [], self.tok)
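## Note: QuestionDataset is never instantiated in this app; presumably the
## exported learner references it, so the class must be importable for
## ModelLoader.get_learner() to deserialize the model.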
model_loader = ModelLoader()
learner = model_loader.get_learner()
print("Learner loaded successfully.")
## Minimal dataset wrapper used to build an inference DataLoader
class TestDS:
    def __init__(self, tensors):
        self.tensors = tensors

    def __len__(self):
        return len(self.tensors)

    def __getitem__(self, idx):
        t = self.tensors[idx]
        return t, tensor(0)  # dummy label; only the inputs matter at inference
class DeBERTaV3Model:
    def __init__(self):
        pass

    def predict(self, text):
        # Tokenize the input text
        test_tensor = tokenizer(text, padding="max_length", truncation=True,
                                max_length=55, return_tensors="pt")["input_ids"]
        test_dl = DataLoader(TestDS(test_tensor), bs=128)
        # Get predictions from the fastai learner
        preds = learner.get_preds(dl=test_dl)
        prob = F.softmax(preds[0], dim=1)[:, 1].item()
        label = "Insincere" if prob > 0.4878 else "Sincere"  # decision threshold
        probs = {
            "Probability": prob,
            "Sequence": test_tensor.tolist()  # plain lists keep gr.JSON serializable
        }
        return label, probs
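## Illustrative usage (output values are hypothetical, not real model output):
##   label, probs = DeBERTaV3Model().predict("How do magnets work?")
##   # label -> "Sincere", probs["Probability"] -> e.g. 0.03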
class BiLSTMModel:
    def __init__(self):
        pass

    def predict(self, text):
        # Preprocess the text and pad the index sequence to a fixed length
        seq = preprocess_text(text)
        padded_seq = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=55)
        BATCH_SIZE = 512
        # Get a weighted prediction from each model
        pred1 = 0.15 * np.squeeze(model_1.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        pred2 = 0.35 * np.squeeze(model_2.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        pred3 = 0.15 * np.squeeze(model_3.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        pred4 = 0.35 * np.squeeze(model_4.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        # Combine the weighted predictions
        avg_pred = pred1 + pred2 + pred3 + pred4
        label = "Insincere" if avg_pred > 0.35 else "Sincere"
        probs = {
            "Probability": float(avg_pred),
            "Model Probabilities": {"Model 1": float(pred1), "Model 2": float(pred2),
                                    "Model 3": float(pred3), "Model 4": float(pred4)},
            "Sequence": seq
        }
        return label, probs
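## The 0.15/0.35/0.15/0.35 weights sum to 1, so avg_pred is a weighted
## average of the four models' probabilities rather than a plain mean.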
class QuestionClassifier:
    """Main class that manages the available models."""
    def __init__(self):
        self.models = {
            "DeBERTaV3": DeBERTaV3Model(),
            "BiLSTM": BiLSTMModel()
        }

    def classify(self, model_name, text):
        return self.models[model_name].predict(text)
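## Minimal sketch of using the classifier outside Gradio (illustrative only):
##   clf = QuestionClassifier()
##   label, probs = clf.classify("BiLSTM", "How do magnets work?")
##   print(label, probs["Probability"])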
# Example questions (each example supplies a value for every input component)
examples = [
    ["BiLSTM", "How do you train a pigeon to send messages?"],
    ["BiLSTM", "Is USA a shithole country owing to a shithole president?"],
    ["BiLSTM", "Why is Indian education total bullshit?"],
    ["BiLSTM", "Which person has given the least f**ks and still turned out successful?"]
]
def create_gradio_interface():
    classifier = QuestionClassifier()

    def classify_question(model_name, text):
        return classifier.classify(model_name, text)

    interface = gr.Interface(
        fn=classify_question,
        inputs=[
            gr.Dropdown(choices=model_lst, label="Select Model", value="BiLSTM"),
            gr.Textbox(lines=2, placeholder="Enter your question here...", label="Input Question")
        ],
        outputs=[
            gr.Textbox(label="Prediction"),
            gr.JSON(label="Model Probabilities")
        ],
        title="Quora Insincere Questions Classifier",
        examples=examples,
        description="Enter a question to classify it as sincere or insincere, or pick one of the examples below."
    )
    interface.launch()
if __name__ == "__main__":
    create_gradio_interface()