File size: 5,753 Bytes
e07e824
6687c9a
 
 
 
 
 
 
870d31e
964d107
 
 
 
 
a54f158
870d31e
964d107
 
 
 
870d31e
e07e824
964d107
 
 
 
 
e07e824
964d107
 
6687c9a
 
964d107
6687c9a
 
 
964d107
6687c9a
 
417c147
6687c9a
417c147
 
 
f5b14b4
ca3c933
f5b14b4
6687c9a
 
964d107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a54f158
 
964d107
 
a54f158
964d107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6687c9a
964d107
a54f158
964d107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6687c9a
f227fd1
 
 
 
 
6687c9a
 
964d107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import spacy
from tqdm import tqdm
import gc
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from fastai.vision.all import *
from fastai.text.all import *
from torch.utils.data import Dataset
from DeBERTaV3 import ModelLoader

# Names of the models this app knows about.
model_lst = ["DeBERTaV3", "BiLSTM"]

# BiLSTM Model
## Make sure the SpaCy model is installed
# Only fetch the package when it is missing instead of re-running the download
# on every start. spacy.cli.download installs through the interpreter that is
# running this script and stops the program if the installation fails, whereas
# shelling out to a bare "python" could hit a different environment and its
# exit status was ignored.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")

## Load models
# The four Keras models are combined as a weighted ensemble in BiLSTMModel.
model_1 = tf.keras.models.load_model("BiLSTM/model_1.h5")
model_2 = tf.keras.models.load_model("BiLSTM/model_2.h5")
model_3 = tf.keras.models.load_model("BiLSTM/model_3.h5")
model_4 = tf.keras.models.load_model("BiLSTM/model_4.h5")

## Load dictionaries
# word_dict maps a token's text to its integer index (see preprocess_text).
# NOTE: pickle.load can execute arbitrary code; this is only acceptable as
# long as the pickle file is a trusted artifact shipped with the app.
with open('BiLSTM/word_dict.pkl', 'rb') as f:
    word_dict = pickle.load(f)

## Load SpaCy NLP model
# The parser, NER and tagger components are disabled when loading the pipeline.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
# Register the IS_STOP flag so it is decided by the lower-cased token text
# being in SpaCy's English stop-word list.
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

## tokenizer 
def preprocess_text(text):
    """Convert a question into the word indices used by the BiLSTM models.

    The text is tokenized with the module-level SpaCy pipeline ``nlp`` and
    every kept token is looked up in the module-level ``word_dict``.

    Args:
        text: The raw question string.

    Returns:
        A list with one index per kept token, in token order. Tokens that are
        not in ``word_dict`` are represented by ``OOV_INDEX`` (0).
    """
    OOV_INDEX = 0
    word_seq = []
    for doc in nlp.pipe([text], n_process=1):
        for token in doc:
            # NOTE(review): the pipeline is loaded with the tagger disabled, so
            # pos_ may never be "PUNCT" here and punctuation may be kept --
            # confirm this matches how the training data was preprocessed.
            if token.pos_ == "PUNCT":
                continue
            # Read-only lookup: unknown tokens must not be inserted into the
            # shared word_dict, otherwise it grows with every new word a user
            # types for the whole lifetime of the process.
            word_seq.append(word_dict.get(token.text, OOV_INDEX))
    return word_seq

# DeBERTaV3 Model
## Load tokenizer and model
# Tokenizer of the "microsoft/deberta-v3-base" checkpoint. It is loaded once at
# import time and used by DeBERTaV3Model.predict to turn a question into
# input ids.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

class QuestionDataset(Dataset):
    """Dataset pairing question texts with their targets.

    Indexing yields the tokenized question (input ids padded or truncated to
    30 tokens) together with the matching target wrapped in a tensor.
    """

    def __init__(self, X, y, tokenizer):
        """Store the texts ``X``, the targets ``y`` and the ``tokenizer``."""
        self.tok = tokenizer
        self.targets = y
        self.text = X

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return ``(input_ids, target)`` for the sample at position ``idx``."""
        question, label = self.text[idx], self.targets[idx]
        encoded = self.tok(
            question,
            padding='max_length',
            truncation=True,
            max_length=30,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; keep only its single row.
        return encoded["input_ids"][0], tensor(label)

    def new_empty(self):
        """Return a dataset without samples that reuses this tokenizer."""
        return QuestionDataset([], [], self.tok)
    
# Build the learner that DeBERTaV3Model.predict uses for inference.
# ModelLoader comes from the project-local DeBERTaV3 module; presumably it
# restores the trained model -- confirm against that module.
model_loader = ModelLoader()
learner = model_loader.get_learner()
print("Learner loaded successfully.")

## DataLoader
class TestDS:
    """Inference-time dataset that pairs every input with a dummy target."""

    def __init__(self, tensors):
        """Keep a reference to the indexable collection ``tensors``."""
        self.tensors = tensors

    def __len__(self):
        """Return how many inputs are stored."""
        return len(self.tensors)

    def __getitem__(self, idx):
        """Return the input at ``idx`` together with a constant ``tensor(0)``."""
        sample = self.tensors[idx]
        # No real target exists at inference time, so a placeholder is paired
        # with the input.
        placeholder_target = tensor(0)
        return sample, placeholder_target

class DeBERTaV3Model:
    """Question classifier backed by the module-level ``learner``.

    Args:
        threshold: Probability of class index 1 above which a question is
            labelled "Insincere".
        max_length: Number of input ids every question is padded or truncated
            to before it is passed to the learner.
    """

    def __init__(self, threshold=0.4878, max_length=55):
        self.threshold = threshold
        self.max_length = max_length

    def predict(self, text):
        """Classify a single question.

        Args:
            text: The question string.

        Returns:
            A ``(label, probs)`` tuple. ``label`` is "Insincere" or "Sincere".
            ``probs`` is a dict with the class-1 probability as a float under
            "Probability" and the input ids fed to the model, as a plain list
            of ints, under "Sequence".
        """
        # Preprocess the text
        test_tensor = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )["input_ids"]
        test_dl = DataLoader(TestDS(test_tensor), bs=128)

        # Get predictions
        preds = learner.get_preds(dl=test_dl)
        # Softmax is computed once; row 0 is the only question in the batch and
        # column 1 is the class reported as "Insincere".
        insincere = F.softmax(preds[0], dim=1)[0, 1]
        label = "Insincere" if insincere > self.threshold else "Sincere"
        probs = {
            "Probability": float(insincere),
            # A plain list instead of a torch tensor: it serializes cleanly for
            # the JSON output and matches the list BiLSTMModel reports.
            "Sequence": test_tensor[0].tolist(),
        }
        return label, probs

class BiLSTMModel:
    """Weighted ensemble of the four module-level BiLSTM Keras models."""

    def __init__(self):
        pass

    def predict(self, text):
        """Classify a single question with the ensemble.

        Returns:
            A ``(label, probs)`` tuple. ``label`` is "Insincere" when the summed
            weighted prediction exceeds 0.35, otherwise "Sincere". ``probs``
            holds that sum under "Probability", each model's weighted output
            (weight times prediction) under "Model Probabilities", and the
            unpadded word indices under "Sequence".
        """
        # Turn the text into word indices and pad them to the fixed length of 55
        token_ids = preprocess_text(text)
        model_input = tf.keras.preprocessing.sequence.pad_sequences([token_ids], maxlen=55)

        batch_size = 512
        # (weight, model) pairs, evaluated in this order
        ensemble = (
            (0.15, model_1),
            (0.35, model_2),
            (0.15, model_3),
            (0.35, model_4),
        )
        weighted = [
            weight * np.squeeze(member.predict(model_input, batch_size=batch_size, verbose=2))
            for weight, member in ensemble
        ]

        # Sum the weighted outputs
        first, second, third, fourth = weighted
        combined = first + second + third + fourth
        if combined > 0.35:
            label = "Insincere"
        else:
            label = "Sincere"

        per_model = {
            f"Model {position}": float(value)
            for position, value in enumerate(weighted, start=1)
        }
        probs = {
            "Probability": float(combined),
            "Model Probabilities": per_model,
            "Sequence": token_ids,
        }
        return label, probs
    
class QuestionClassifier:
    """Registry that routes a question to one of the available models."""

    def __init__(self):
        """Instantiate every supported model and index it by its display name."""
        self.models = {}
        self.models["DeBERTaV3"] = DeBERTaV3Model()
        self.models["BiLSTM"] = BiLSTMModel()

    def classify(self, model_name, text):
        """Return the prediction of the model registered as ``model_name``.

        Raises:
            KeyError: If no model is registered under ``model_name``.
        """
        selected = self.models[model_name]
        return selected.predict(text)
    
# Example questions
# Sample inputs handed to the Gradio interface through its `examples` argument
# in create_gradio_interface.
examples = [
    "How do you train a pigeon to send messages?",
    "Is USA a shithole country owing to a shithole president?",
    "Why is Indian educationa total bullshit?",
    "Which person has given the least f**ks and still turned out successful?"
]

def create_gradio_interface():
    """Build the Gradio UI for the question classifier and launch it.

    Instantiates a ``QuestionClassifier``, wires a model selector and a question
    box to it, and starts the Gradio app with ``launch()``.
    """
    classifier = QuestionClassifier()
    # Derive the selectable names from the classifier so the dropdown cannot
    # drift from the models that actually exist.
    model_names = list(classifier.models)
    default_model = "BiLSTM"

    def classify_question(model_name, text):
        return classifier.classify(model_name, text)

    # The interface has two inputs, so every example needs one value per input
    # component (model, question); a flat list of strings is only valid for a
    # single-input interface.
    example_rows = [[default_model, question] for question in examples]

    interface = gr.Interface(
        fn=classify_question,
        inputs=[
            gr.Dropdown(choices=model_names, label="Select Model", value=default_model),
            gr.Textbox(lines=2, placeholder="Enter your question here...", label="Input Question")
        ],
        outputs=[
            gr.Textbox(label="Prediction"),
            gr.JSON(label="Model Probabilities")
        ],
        title="Quora Insincere Questions Classifier",
        examples=example_rows,
        description="Enter your question to classify it as sincere or insincere. Select an example question below."
    )
    interface.launch()


# Start the UI only when this file is executed as a script. The module-level
# model loading above still runs whenever the file is imported.
if __name__ == "__main__":
    create_gradio_interface()