Spaces:
Runtime error
Runtime error
File size: 6,348 Bytes
b8be36c 9f1060c b8be36c d276596 b8be36c db77a12 b8be36c ab87d79 87f42d6 ab87d79 b8be36c 02758b3 4d8b08c 775bc76 02758b3 b8be36c 058bb31 b8be36c d276596 cb4cca6 b8be36c d276596 3743ac7 d276596 0b7786c d276596 0e8e07c d276596 0622968 0b7786c d276596 0622968 8e0fe7a 0622968 ebb696e 0622968 8e0fe7a 0622968 64b545b 8e0fe7a 64b545b d276596 de62b1e ebb696e b8be36c d276596 b8be36c d276596 b8be36c ebb696e 167485f 91f89cf ebb696e b8be36c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import streamlit as st
from evaluate import evaluator
import evaluate
import datasets
from huggingface_hub import HfApi, ModelFilter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline, AutoConfig
import matplotlib.pyplot as plt
# --- Page header and dataset / config / split pickers ---
st.title("Metric Compare")
st.markdown("### Choose the dataset you want to use for the comparison:")

api = HfApi()

# Up to 20 text-classification datasets from the Hub, sorted by downloads,
# with 'glue' filtered out of the result.
hub_datasets = api.list_datasets(
    filter="task_categories:text-classification",
    sort="downloads",
    direction=-1,
    limit=20,
)
dsets = [entry.id for entry in hub_datasets if entry.id != 'glue']
dset = st.selectbox('Choose a dataset from the Hub', options=dsets)

# The keys of `info` are the dataset's configuration names.
info = datasets.get_dataset_infos(dset)
dset_config = st.selectbox('What config do you want to use?', options=list(info))

# Offer every split of the chosen config except 'train'.
splitlist = [name for name in info[dset_config].splits if name != 'train']
dset_split = st.selectbox('Choose a dataset split for evaluation', options=splitlist)
# --- Model picker: models on the Hub that were trained on the chosen dataset ---
st.markdown("### Select up to 5 models to compare their performance:")

filt = ModelFilter(trained_dataset=dset)
all_models = [
    m.modelId
    for m in api.list_models(filter=filt, sort="downloads", direction=-1, limit=20)
    # `tags` may be missing on some Hub entries; treat that as "no tags".
    if 't5' not in (m.tags or ())
]
models = st.multiselect(
    'Choose the models that have been trained/finetuned on this dataset',
    options=all_models)

if len(models) > 5:
    # st.exception() renders an exception object, not a message string, so the
    # limit is reported with st.error() instead. The wording now matches the
    # check above (exactly 5 models is allowed). st.stop() ends this rerun so
    # the script does not go on to load more models than the limit.
    st.error("Please choose at most 5 models!")
    st.stop()
# --- Metric picker ---
st.markdown("### What two metrics do you want to compare?")

_metric_choices = ['f1', 'accuracy', 'precision', 'recall']
_metric_defaults = ["f1", "accuracy"]
metrics = st.multiselect(
    'Choose the metrics for the comparison',
    options=_metric_choices,
    default=_metric_defaults,
)

st.markdown("### Please wait for the dataset and models to load (this can take some time if they are big!")
### Loading data
@st.cache
def loaddset(d, d_split):
    """Load a single split of a dataset through ``datasets.load_dataset``.

    Args:
        d: Dataset identifier forwarded to ``load_dataset``.
        d_split: Name of the split to load (passed as ``split=``).

    Returns:
        Whatever ``datasets.load_dataset`` returns for that split.
    """
    return datasets.load_dataset(d, split=d_split)
# Fetch the evaluation split that was picked in the UI.
data = loaddset(d=dset, d_split=dset_split)

### Defining Evaluator
# NOTE(review): this module-level name shadows the builtin `eval`; it is kept
# because the name is part of the script's existing namespace.
eval = evaluator(task="text-classification")
### Loading models
@st.cache
def load_models(mod_names):
    """Return the names in *mod_names* that load as sequence classifiers.

    For each name, a tokenizer and an ``AutoModelForSequenceClassification``
    are loaded. Names for which either load fails are skipped. For each name
    that loads, the objects are also published as the module globals
    ``tokenizer_<i>`` and ``model_<i>``, where ``<i>`` is the position of the
    name in *mod_names*.

    Args:
        mod_names: Sequence of model identifiers to try.

    Returns:
        list: The identifiers that loaded successfully, in input order.
    """
    model_list = []
    for i, name in enumerate(mod_names):
        try:
            tokenizer = AutoTokenizer.from_pretrained(name)
            model = AutoModelForSequenceClassification.from_pretrained(name)
        except Exception:
            # Best-effort: an unloadable model is simply left out. This was a
            # bare `except:`, which also swallowed KeyboardInterrupt and
            # SystemExit.
            continue
        # Publish only after both loads succeeded, so a failed model load does
        # not leave a stale tokenizer global behind.
        globals()[f"tokenizer_{i}"] = tokenizer
        globals()[f"model_{i}"] = model
        model_list.append(name)
    return model_list
### Defining pipelines
# allow_output_mutation=True: the returned pipelines wrap model objects, which
# the legacy st.cache should not try to hash to detect mutation.
@st.cache(allow_output_mutation=True)
def load_pipes(mod_list):
    """Build one text-classification pipeline per model name in *mod_list*.

    Each pipeline is also published as the module global ``pipe_<i>``, where
    ``<i>`` is the position of the name in *mod_list*.

    Args:
        mod_list: Sequence of model identifiers. Each one is used as both the
            ``model`` and the ``tokenizer`` argument of the pipeline.

    Returns:
        list: The pipelines, in the same order as *mod_list*.
    """
    pipe_list = []
    for i, name in enumerate(mod_list):
        # Use the function's own argument. The previous code indexed the
        # global `models`, which goes out of step with `mod_list` whenever a
        # model was skipped during loading.
        pipe = pipeline("text-classification", model=name, tokenizer=name, device=-1)
        globals()[f"pipe_{i}"] = pipe
        # Previously nothing was appended, so the function always returned [].
        pipe_list.append(pipe)
    return pipe_list
# Keep only the selected models that actually load, then build their pipelines.
model_list = load_models(models)
pipes = load_pipes(model_list)

### Defining metrics
# Map each selected metric name to its loaded metric module. The previous code
# wrote every metric to the single global key "metrics[i]" (an f-string with
# no placeholders), so each iteration overwrote the last and none of the
# metrics was reachable by name.
loaded_metrics = {metric_name: evaluate.load(metric_name) for metric_name in metrics}
## Label mapping
st.markdown("### Help us pick the right labels for your models")
dataset_labels = data.features['label'].names
st.text("The labels for your dataset are: " + str(dataset_labels))

# label_mappings[model name] -> {model label chosen by the user: dataset label index}
label_mappings = {}
for model_name in model_list:
    # Fetch the config once per model instead of once per dataset label.
    id2label = AutoConfig.from_pretrained(model_name).id2label
    st.text("The labels for " + str(model_name) + " are: " + str(id2label))
    model_labels = list(id2label.values())

    mapping = {}
    for j, dataset_label in enumerate(dataset_labels):
        # The dataset label is indexed by j. The previous code used the model
        # index, which showed the wrong label and could raise IndexError.
        chosen = st.selectbox(
            "The label corresponding to " + str(dataset_label) + " is:",
            model_labels,
            # The same question is asked for every model, so each widget
            # needs its own key to stay distinct.
            key=f"{model_name}_label_{j}",
        )
        mapping[chosen] = j
    label_mappings[model_name] = mapping
# Disabled legacy code, kept as a string literal bound to `_` so that it is
# never executed. It evaluates five hard-coded pipelines (pipe1..pipe5) with
# fixed label mappings and plots accuracy against F1 with matplotlib. It
# refers to names such as pipe1, accuracy, f1, model2 and tokenizer2 that are
# not defined anywhere in the visible part of this file.
# NOTE(review): the `|` after the closing triple quote looks like a copy/paste
# artifact rather than intended code — confirm and remove.
_ = """
res_accuracy1 = eval.compute(model_or_pipeline=pipe1, data=data, metric=accuracy,
label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
res_f11 = eval.compute(model_or_pipeline=pipe1, data=data, metric=f1,
label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
print({**res_accuracy1, **res_f11})
pipe2 = pipeline("text-classification", model=model2, tokenizer= tokenizer2, device=0)
res_accuracy2 = eval.compute(model_or_pipeline=pipe2, data=data, metric=accuracy,
label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
res_f12 = eval.compute(model_or_pipeline=pipe2, data=data, metric=f1,
label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
print({**res_accuracy2, **res_f12})
pipe3 = pipeline("text-classification", model=model3, tokenizer= tokenizer3, device=0)
res_accuracy3 = eval.compute(model_or_pipeline=pipe3, data=data, metric=accuracy,
label_mapping={"neg": 0, "pos": 1},)
res_f13 = eval.compute(model_or_pipeline=pipe3, data=data, metric=f1,
label_mapping={"neg": 0, "pos": 1},)
print({**res_accuracy3, **res_f13})
pipe4 = pipeline("text-classification", model=model4, tokenizer= tokenizer4, device=0)
res_accuracy4 = eval.compute(model_or_pipeline=pipe4, data=data, metric=accuracy,
label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
res_f14 = eval.compute(model_or_pipeline=pipe4, data=data, metric=f1,
label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
print({**res_accuracy4, **res_f14})
pipe5 = pipeline("text-classification", model=model5, tokenizer= tokenizer5, device=0)
res_accuracy5 = eval.compute(model_or_pipeline=pipe5, data=data, metric=accuracy,
label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
res_f15 = eval.compute(model_or_pipeline=pipe5, data=data, metric=f1,
label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
print({**res_accuracy5, **res_f15})
plt.plot(res_accuracy1['accuracy'], res_f11['f1'], marker='o', markersize=6, color="red")
plt.annotate('distilbert', xy=(res_accuracy1['accuracy']+0.001, res_f11['f1']))
plt.plot(res_accuracy2['accuracy'], res_f12['f1'], marker='o', markersize=6, color="blue")
plt.annotate('distilbert-base-uncased-finetuned', xy=(res_accuracy2['accuracy']+0.001, res_f12['f1']))
plt.plot(res_accuracy3['accuracy'], res_f13['f1'], marker='o', markersize=6, color="green")
plt.annotate('roberta-base', xy=(res_accuracy3['accuracy']-0.009, res_f13['f1']))
plt.plot(res_accuracy4['accuracy'], res_f14['f1'], marker='o', markersize=6, color="purple")
plt.annotate('funnel-transformer-small', xy=(res_accuracy4['accuracy']-0.015, res_f14['f1']))
plt.plot(res_accuracy5['accuracy'], res_f15['f1'], marker='o', markersize=6, color="black")
plt.annotate('SENATOR', xy=(res_accuracy5['accuracy']+0.001, res_f15['f1']))
plt.xlabel('Accuracy')
plt.ylabel('F1 Score')
#plt.xlim([0.9, 1.0])
#plt.ylim([0.9, 1.0])
plt.title('Comparing the Models')
""" |