|
import tensorflow as tf |
|
import gradio as gr |
|
import pandas as pd |
|
from transformers import AutoTokenizer |
|
|
|
# Directory containing the fine-tuned Keras classifier (SavedModel format).
model_save_path = "Multilingual_toxic_comment_classifier/"



# Load the trained toxicity model once at startup so every request reuses it.
loaded_model = tf.keras.models.load_model(model_save_path)



# Tokenizer must match the backbone the model was fine-tuned from
# (presumably xlm-roberta-large — verify against the training script).
tokenizer_ = AutoTokenizer.from_pretrained("xlm-roberta-large")



# Gradio examples: one single-element row per sample comment,
# taken from the "comment_text" column of the bundled CSV.
examples_list = [

    [example]

    for example in pd.read_csv("examples/sample_comments.csv")["comment_text"].tolist()

]
|
|
|
|
|
def prep_data(text, tokenizer, max_len=192):
    """Tokenize *text* into the input dict the Keras classifier expects.

    Args:
        text: Raw comment string (or batch of strings) to encode.
        tokenizer: Hugging Face tokenizer callable (e.g. from
            ``AutoTokenizer.from_pretrained``).
        max_len: Fixed sequence length; shorter inputs are padded to it,
            longer inputs are truncated.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"`` TensorFlow
        tensors of shape (batch, max_len).
    """
    encoded = tokenizer(
        text,
        max_length=max_len,
        truncation=True,
        padding="max_length",
        add_special_tokens=True,
        return_tensors="tf",
    )
    # Keep only the two fields the model's input signature consumes.
    return {name: encoded[name] for name in ("input_ids", "attention_mask")}
|
|
|
|
|
def predict(text):
    """Classify a comment and return toxic / non-toxic probabilities.

    Args:
        text: Raw comment string entered in the Gradio textbox.

    Returns:
        Dict mapping each class label to its probability as a Python
        float — the format ``gr.Label`` renders.
    """
    # The model emits a single sigmoid toxicity probability; predict()
    # returns shape (1, 1), so [0][0] extracts the scalar.
    prob_toxic = loaded_model.predict(
        prep_data(text=text, tokenizer=tokenizer_, max_len=192)
    )[0][0]
    # Fix: the original had a dead no-op expression statement here
    # ("prob_of_toxic_comment, prob_of_non_toxic_comment") that built
    # and discarded a tuple; it has been removed.
    return {
        "prob_of_toxic_comment": float(prob_toxic),
        "prob_of_non_toxic_comment": float(1 - prob_toxic),
    }
|
|
|
|
|
# Wire the prediction function into a simple Gradio UI:
# a multi-line textbox in, a label widget showing class probabilities out.
interface = gr.Interface(

    fn=predict,

    inputs=gr.components.Textbox(lines=4, label="Comment"),

    outputs=[gr.Label(label="Probabilities")],

    examples=examples_list,

    title="Multi-Lingual Toxic Comment Classification.",

    description="XLM-Roberta Large model",

)

# Start the local web server; debug=False suppresses verbose error traces.
interface.launch(debug=False)
|
|