Spaces:

RobCaamano
/

Finetuning_Language_Models-Toxic_Tweets

Sleeping

File size: 2,634 Bytes

fd77815
0dd6279
ad72de2
fd77815
 
 
29406f8
d0d0af6
fd77815
d0d0af6
e0c3551
 
f83600d
4c23b4e
 
d0d0af6
 
 
848f6ee
fd77815
682174e
 
fd77815
cf53edf
f1865c0
 
51a09b7
cf53edf
 
 
682174e
cf53edf
be5dc38
cf53edf
682174e
cf53edf
08728c1
528da04
682174e
 
39e1615
 
 
4a0592e
fd77815
 
682174e
39e1615
682174e
51a09b7
3847b95
 
 
 
 
 
 
d0d0af6
3847b95
 
490091e
3847b95
 
d0d0af6
3847b95
490091e
3847b95
 
 
 
682174e
be5dc38
3847b95
 
490091e
2d942ee
0dd6279
cb4608c
be5dc38
51a09b7

import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, pipeline
from transformers import (
    TFAutoModelForSequenceClassification as AutoModelForSequenceClassification,
)

# Page heading.
st.title("Classifier")

# Canned example inputs, keyed by the label shown in the "Demos" dropdown.
demo_options = {
    "Non-toxic": "Had a wonderful weekend at the park. Enjoyed the beautiful weather!",
    "Obscene": "I don't give a fuck about your opinion",
    "Threat": "I will find and kill you",
    "Insult": "You are so stupid",
    "Identity Hate": "I hate gay people. Its just my opinion.",
}

# The selected demo supplies the initial contents of the text area.
demo_labels = list(demo_options)
selected_demo = st.selectbox("Demos", options=demo_labels)
default_text = demo_options[selected_demo]
text = st.text_area("Input text", default_text, height=250)

# Dropdown label -> model identifier later passed to `from_pretrained`.
model_mapping = {
    "Toxicity - 1 Epoch": "RobCaamano/toxicity",
    "Toxicity - 8 Epochs": "RobCaamano/toxicity_update",
    "Toxicity - Weighted": "RobCaamano/toxicity_weighted",
    "DistilBERT Base Uncased (SST-2)": "distilbert-base-uncased-finetuned-sst-2-english",
}

# The model picker and the submit button share one container. `model_name` and
# `submit` are always assigned here, so no placeholder defaults are needed.
with st.container():
    selected_model_display = st.selectbox(
        "Select Model",
        options=list(model_mapping),
    )
    model_name = model_mapping[selected_model_display]
    # The button's return value gates the classification step further down.
    submit = st.button("Submit", type="primary")

# Load the tokenizer and the TensorFlow sequence-classification model for the
# selected checkpoint, then wrap both in a text-classification pipeline.
# NOTE(review): Streamlit re-executes the script on every widget interaction,
# so this loading runs on each rerun -- consider caching if it becomes slow.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# return_all_scores=True makes the pipeline report a score for every label
# instead of only the best one; the result handling below relies on that shape.
clf = pipeline(
    "sentiment-analysis", model=model, tokenizer=tokenizer, return_all_scores=True
)

if submit:
    # With return_all_scores=True the pipeline yields, for the single input
    # text, one {"label": ..., "score": ...} entry per label. Read the fields
    # by key rather than relying on the order of each entry's values.
    results = {entry["label"]: entry["score"] for entry in clf(text)[0]}

    toxicity_models = (
        "RobCaamano/toxicity",
        "RobCaamano/toxicity_update",
        "RobCaamano/toxicity_weighted",
    )

    if model_name in toxicity_models:
        # The "toxic" score decides the yes/no verdict; the remaining labels
        # compete for the most likely toxicity class.
        classes = {
            label: score for label, score in results.items() if label != "toxic"
        }

        max_class = max(classes, key=classes.get)
        probability = classes[max_class]

        if results["toxic"] >= 0.5:
            result_df = pd.DataFrame(
                {
                    "Toxic": ["Yes"],
                    "Toxicity Class": [max_class],
                    "Probability": [probability],
                },
                index=[0],
            )
        else:
            result_df = pd.DataFrame(
                {
                    "Toxic": ["No"],
                    "Toxicity Class": ["This text is not toxic"],
                },
                index=[0],
            )

    else:
        # Any other checkpoint (currently only the SST-2 DistilBERT model):
        # report the highest-scoring label. Using `else` guarantees that
        # `result_df` is always bound before it is rendered.
        result = max(results, key=results.get)
        probability = results[result]

        result_df = pd.DataFrame(
            {
                "Result": [result],
                "Probability": [probability],
            },
            index=[0],
        )

    st.table(result_df)

    # Full label -> score mapping, for inspection.
    expander = st.expander("View Raw output")
    expander.write(results)