import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import spacy
from tqdm import tqdm
import gc
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from fastai.vision.all import *
from fastai.text.all import *
from torch.utils.data import Dataset
from DeBERTaV3 import ModelLoader

model_lst = ["DeBERTaV3", "BiLSTM"]
# BiLSTM Model
## Download the SpaCy model
os.system("python -m spacy download en_core_web_lg")

## Load the four ensemble members
model_1 = tf.keras.models.load_model("BiLSTM/model_1.h5")
model_2 = tf.keras.models.load_model("BiLSTM/model_2.h5")
model_3 = tf.keras.models.load_model("BiLSTM/model_3.h5")
model_4 = tf.keras.models.load_model("BiLSTM/model_4.h5")

## Load the word-to-index dictionary
with open('BiLSTM/word_dict.pkl', 'rb') as f:
    word_dict = pickle.load(f)

## Load the SpaCy model (only tokenization is needed, so disable the other pipes)
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
# Override the IS_STOP flag so token.is_stop reflects SpaCy's English stop-word list
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
## Tokenizer
def preprocess_text(text):
    """Preprocess the input text using SpaCy and return word indices."""
    docs = nlp.pipe([text], n_process=1)
    word_seq = []
    for doc in docs:
        for token in doc:
            if token.pos_ != "PUNCT":
                if token.text not in word_dict:
                    word_dict[token.text] = 0  # OOV index
                word_seq.append(word_dict[token.text])
    return word_seq
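
# Illustrative check of the tokenizer above (a hypothetical helper, not called
# by the app; assumes `nlp` and `word_dict` loaded successfully).
def _demo_preprocess():
    """Show that preprocess_text maps tokens to ints, with unknown words as 0."""
    seq = preprocess_text("How do you train a pigeon?")
    print(seq)  # e.g. a list of ints, one per non-punctuation token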
# DeBERTaV3 Model
## Load the tokenizer (the fine-tuned model itself comes from ModelLoader below)
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
class QuestionDataset(Dataset):
    """Tokenized question dataset, as expected by the fastai learner."""
    def __init__(self, X, y, tokenizer):
        self.text = X
        self.targets = y
        self.tok = tokenizer

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        targ = self.targets[idx]
        return self.tok(text, padding='max_length',
                        truncation=True,
                        max_length=30,
                        return_tensors="pt")["input_ids"][0], tensor(targ)

    def new_empty(self):
        return QuestionDataset([], [], self.tok)
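
# Illustrative usage of QuestionDataset (a hypothetical helper, not called by
# the app): each item is an (input_ids, target) pair of tensors.
def _demo_dataset():
    ds = QuestionDataset(["Is this question sincere?"], [0], tokenizer)
    x, y = ds[0]
    print(x.shape, y)  # torch.Size([30]) tensor(0)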
model_loader = ModelLoader()
learner = model_loader.get_learner()
print("Learner loaded successfully.")
## Inference-only dataset: wraps pre-tokenized tensors with a dummy label
class TestDS:
    def __init__(self, tensors):
        self.tensors = tensors

    def __len__(self):
        return len(self.tensors)

    def __getitem__(self, idx):
        t = self.tensors[idx]
        return t, tensor(0)  # the label is unused at inference time
class DeBERTaV3Model:
    def __init__(self):
        pass

    def predict(self, text):
        # Tokenize the text into a padded tensor of input ids
        test_tensor = tokenizer(text, padding="max_length", truncation=True, max_length=55, return_tensors="pt")["input_ids"]
        test_dl = DataLoader(TestDS(test_tensor), bs=128)
        # Get predictions and convert the logits to an insincerity probability
        preds = learner.get_preds(dl=test_dl)
        prob = F.softmax(preds[0], dim=1)[:, 1].item()
        label = "Insincere" if prob > 0.4878 else "Sincere"
        probs = {
            "Probability": prob,
            # .tolist() keeps the dict JSON-serializable for the gr.JSON output
            "Sequence": test_tensor[test_tensor != 0].tolist(),
            "Decoded Sequence": tokenizer.decode(test_tensor[test_tensor != 0], skip_special_tokens=True)
        }
        return label, probs
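
# Illustrative call (not executed at import; assumes `learner` loaded above):
#   label, details = DeBERTaV3Model().predict("How do pigeons navigate home?")
#   label                  -> "Sincere" or "Insincere"
#   details["Probability"] -> insincerity probability as a float in [0, 1]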
class BiLSTMModel:
    def __init__(self):
        pass

    def predict(self, text):
        # Preprocess the text and pad it to the fixed input length
        seq = preprocess_text(text)
        padded_seq = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=55)
        BATCH_SIZE = 512
        # Get a weighted prediction from each ensemble member
        pred1 = 0.15 * np.squeeze(model_1.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        pred2 = 0.35 * np.squeeze(model_2.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        pred3 = 0.15 * np.squeeze(model_3.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        pred4 = 0.35 * np.squeeze(model_4.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
        # Combine the predictions (the weights sum to 1.0)
        avg_pred = pred1 + pred2 + pred3 + pred4
        label = "Insincere" if avg_pred > 0.35 else "Sincere"
        probs = {
            "Probability": float(avg_pred),
            # Note: these are the weighted contributions, not raw model outputs
            "Model Probabilities": {"Model 1": float(pred1), "Model 2": float(pred2), "Model 3": float(pred3), "Model 4": float(pred4)},
            "Sequence": seq,
            # Reverse lookup: recover each word from its index in word_dict
            "Decoded Sequence": " ".join([list(word_dict.keys())[list(word_dict.values()).index(i)] for i in seq])
        }
        return label, probs
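
# A possible optimization (a hedged sketch, not part of the original app): the
# reverse lookup above rebuilds two lists per token; building the inverse
# mapping once is equivalent up to which word wins a shared index (e.g. 0).
#   inv_word_dict = {v: k for k, v in word_dict.items()}
#   decoded = " ".join(inv_word_dict[i] for i in seq)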
class QuestionClassifier:
    """Main class to manage the models."""
    def __init__(self):
        self.models = {
            "DeBERTaV3": DeBERTaV3Model(),
            "BiLSTM": BiLSTMModel()
        }

    def classify(self, model_name, text):
        return self.models[model_name].predict(text)
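
# Illustrative usage (a hypothetical helper, not called by the app; assumes
# all models above loaded successfully):
def _demo_classifier():
    classifier = QuestionClassifier()
    label, probs = classifier.classify("BiLSTM", "How do I learn Python?")
    print(label, probs["Probability"])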
# Example questions
examples = [
    ["DeBERTaV3", "How do you train a pigeon to send messages?"],
    ["DeBERTaV3", "Is USA a shithole country owing to a shithole president?"],
    ["DeBERTaV3", "Why is Indian education a total bullshit?"],
    ["DeBERTaV3", "Which person has given the least f**ks and still turned out successful?"],
    ["BiLSTM", "How do you train a pigeon to send messages?"],
    ["BiLSTM", "Is USA a shithole country owing to a shithole president?"],
    ["BiLSTM", "Why is Indian education a total bullshit?"],
    ["BiLSTM", "Which person has given the least f**ks and still turned out successful?"]
]
def create_gradio_interface():
    classifier = QuestionClassifier()

    def classify_question(model_name, text):
        return classifier.classify(model_name, text)

    interface = gr.Interface(
        fn=classify_question,
        inputs=[
            gr.Dropdown(choices=["DeBERTaV3", "BiLSTM"], label="Select Model", value="BiLSTM"),
            gr.Textbox(lines=2, placeholder="Enter your question here...", label="Input Question")
        ],
        outputs=[
            gr.Textbox(label="Prediction"),
            gr.JSON(label="Prediction Details")
        ],
        title="Quora Insincere Questions Classifier",
        examples=examples,
        description="Enter a question to classify it as sincere or insincere, or select an example question below."
    )
    interface.launch()

if __name__ == "__main__":
    create_gradio_interface()