File size: 6,352 Bytes
b8be36c
 
 
 
 
 
 
9f1060c
b8be36c
 
 
 
d276596
b8be36c
 
db77a12
b8be36c
ab87d79
87f42d6
ab87d79
b8be36c
02758b3
 
 
 
4d8b08c
775bc76
02758b3
 
b8be36c
 
058bb31
b8be36c
d276596
cb4cca6
b8be36c
d276596
 
 
3743ac7
 
d276596
 
 
 
 
0b7786c
 
d276596
 
0e8e07c
d276596
 
0622968
 
 
 
 
 
0b7786c
 
 
 
d276596
0622968
 
 
 
 
 
 
 
 
8e0fe7a
 
0622968
ebb696e
0622968
 
8e0fe7a
0622968
64b545b
8e0fe7a
64b545b
 
 
d276596
 
 
de62b1e
ebb696e
 
b8be36c
d276596
b8be36c
d276596
b8be36c
ebb696e
 
167485f
6195bae
91f89cf
ebb696e
b8be36c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import streamlit as st
from evaluate import evaluator
import evaluate
import datasets
from huggingface_hub import HfApi, ModelFilter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline, AutoConfig
import matplotlib.pyplot as plt

st.title("Metric Compare")

st.markdown("### Choose the dataset you want to use for the comparison:")

# One Hub client is reused for both the dataset and the model listings.
api = HfApi()
# Ask the Hub for up to 20 text-classification datasets sorted by downloads,
# then drop 'glue' from the choices.
dsets = [
  d.id
  for d in api.list_datasets(
    filter="task_categories:text-classification",
    sort="downloads",
    direction=-1,
    limit=20,
  )
  if d.id != 'glue'
]

dset = st.selectbox('Choose a dataset from the Hub', options=dsets)

# Iterating `info` yields the config names; each value exposes `.splits`.
info = datasets.get_dataset_infos(dset)

dset_config = st.selectbox('What config do you want to use?', options=list(info))

# Every split except 'train' can be picked for evaluation.
splitlist = [s for s in info[dset_config].splits if s != 'train']

dset_split = st.selectbox('Choose a dataset split for evaluation', options=splitlist)


st.markdown("### Select up to 5 models to compare their performance:")

filt = ModelFilter(trained_dataset=dset)
# Up to 20 models trained on the chosen dataset, sorted by downloads; models
# tagged 't5' are excluded.
all_models = [
  m.modelId
  for m in api.list_models(filter=filt, sort="downloads", direction=-1, limit=20)
  if 't5' not in m.tags
]

models = st.multiselect(
  'Choose the models that have been trained/finetuned on this dataset',
  options=all_models,
)
if len(models) > 5:
  # st.exception() expects an exception object; a plain validation message
  # belongs in st.error(). Stop this run so that an oversized selection is not
  # downloaded and loaded below.
  st.error("Please choose at most 5 models!")
  st.stop()

st.markdown("### What two metrics do you want to compare?")

metrics = st.multiselect(
  'Choose the metrics for the comparison',
  options=['f1', 'accuracy', 'precision', 'recall'],
  default=["f1", "accuracy"],
)


st.markdown("### Please wait for the dataset and models to load (this can take some time if they are big!")

### Loading data
@st.cache
def loaddset(d, d_split, d_config=None):
  """Load one split of a dataset through ``datasets.load_dataset``.

  Args:
    d: Dataset identifier, passed as the first argument of ``load_dataset``.
    d_split: Name of the split to load.
    d_config: Optional configuration name, forwarded as ``name``. ``None``
      (the default) keeps the previous behaviour of not selecting a config.

  Returns:
    The object returned by ``datasets.load_dataset`` for the requested split.
  """
  return datasets.load_dataset(d, name=d_config, split=d_split)

# Forward the configuration chosen in the UI: previously it was ignored, so the
# selection had no effect on which data was loaded.
data = loaddset(dset, dset_split, dset_config)

### Defining Evaluator
# NOTE: this module-level name shadows the builtin eval().
eval = evaluator("text-classification")

### Loading models
@st.cache
def load_models(mod_names):
  """Try to load each model in ``mod_names`` and return the names that loaded.

  For every name, the tokenizer and the sequence-classification model are
  fetched with ``from_pretrained`` and stored in module globals as
  ``tokenizer_<i>`` / ``model_<i>``, where ``<i>`` is the position of the name
  in ``mod_names`` (not its position in the returned list).

  Args:
    mod_names: Sequence of model identifiers.

  Returns:
    The names whose tokenizer and model both loaded, in input order. Names
    that fail to load are skipped.
  """
  import logging

  logger = logging.getLogger(__name__)
  model_list = []
  for i, name in enumerate(mod_names):
    try:
      globals()[f"tokenizer_{i}"] = AutoTokenizer.from_pretrained(name)
      globals()[f"model_{i}"] = AutoModelForSequenceClassification.from_pretrained(name)
    except Exception:
      # Best effort: a model that cannot be loaded is left out of the
      # comparison. Catch Exception (not a bare except, which would also trap
      # KeyboardInterrupt/SystemExit) and record why the model was skipped.
      logger.warning("Skipping model %r: it could not be loaded", name, exc_info=True)
      continue
    model_list.append(name)
  return model_list

### Defining pipelines
# allow_output_mutation=True: the returned pipelines are large model objects;
# st.cache should hand them back as-is rather than hash them on every run.
@st.cache(allow_output_mutation=True)
def load_pipes(mod_list):
  """Build one text-classification pipeline per model name in ``mod_list``.

  Each pipeline uses the same identifier for model and tokenizer and is
  created with ``device=-1``. It is also stored in module globals as
  ``pipe_<i>``, where ``<i>`` is the position of the name in ``mod_list``.

  Args:
    mod_list: Sequence of model identifiers.

  Returns:
    List of the created pipelines, in the same order as ``mod_list``.
  """
  pipe_list = []
  for i, name in enumerate(mod_list):
    # Read from the argument, not the global `models` selection: the two get
    # out of step as soon as a selected model was skipped during loading.
    pipe = pipeline("text-classification", model=name, tokenizer=name, device=-1)
    globals()[f"pipe_{i}"] = pipe
    pipe_list.append(pipe)
  return pipe_list
  
# Names of the selected models that could actually be loaded.
model_list = load_models(models)
pipes = load_pipes(model_list)

### Defining metrics
# One loaded metric per selected metric name. The previous loop wrote every
# metric to the same literal globals key "metrics[i]" (the f-string had no
# placeholder), so only the last one survived.
loaded_metrics = {metric_name: evaluate.load(metric_name) for metric_name in metrics}
  
## Label mapping

st.markdown("### Help us pick the right labels for your models")

dataset_labels = data.features['label'].names
st.text("The labels for your dataset are: "+ str(dataset_labels))

# label_mappings[model_name] maps a model label name to the index of the
# dataset label the user matched it with.
label_mappings = {}
for i, model_name in enumerate(model_list):
  # Fetch the config once per model instead of once per widget.
  id2label = AutoConfig.from_pretrained(model_name).id2label
  st.text("The labels for " + str(model_name) + " are: " + str(id2label))
  # Offer the label names; passing the dict itself would offer its keys (ids).
  model_labels = list(id2label.values())
  mapping = {}
  for j, dataset_label in enumerate(dataset_labels):
    chosen = st.selectbox(
      "The label corresponding to **" + str(dataset_label) + "** is:",
      model_labels,
      # Explicit key: the same question is asked once per model, and widgets
      # with identical label and options would otherwise collide.
      key=f"label_{i}_{j}",
    )
    # If the same model label is picked for two dataset labels, the later
    # choice wins.
    mapping[chosen] = j
  label_mappings[model_name] = mapping
  
# Disabled legacy code, kept inside a string assigned to `_` so that it is
# never executed. It hard-codes the evaluation of five pipelines (pipe1..pipe5)
# with fixed label mappings and plots accuracy against F1 with matplotlib.
# It refers to names (pipe1, model2, tokenizer2, accuracy, f1, ...) that are
# not defined anywhere in this file.
_ = """
res_accuracy1 = eval.compute(model_or_pipeline=pipe1, data=data, metric=accuracy,
                       label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
res_f11 = eval.compute(model_or_pipeline=pipe1, data=data, metric=f1,
                       label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
print({**res_accuracy1, **res_f11})

pipe2 = pipeline("text-classification", model=model2, tokenizer= tokenizer2, device=0)
res_accuracy2 = eval.compute(model_or_pipeline=pipe2, data=data, metric=accuracy,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
res_f12 = eval.compute(model_or_pipeline=pipe2, data=data, metric=f1,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
print({**res_accuracy2, **res_f12})

pipe3 = pipeline("text-classification", model=model3, tokenizer= tokenizer3, device=0)
res_accuracy3 = eval.compute(model_or_pipeline=pipe3, data=data, metric=accuracy,
                       label_mapping={"neg": 0, "pos": 1},)
res_f13 = eval.compute(model_or_pipeline=pipe3, data=data, metric=f1,
                       label_mapping={"neg": 0, "pos": 1},)
print({**res_accuracy3, **res_f13})

pipe4 = pipeline("text-classification", model=model4, tokenizer= tokenizer4, device=0)
res_accuracy4 = eval.compute(model_or_pipeline=pipe4, data=data, metric=accuracy,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
res_f14 = eval.compute(model_or_pipeline=pipe4, data=data, metric=f1,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
print({**res_accuracy4, **res_f14})

pipe5 = pipeline("text-classification", model=model5, tokenizer= tokenizer5, device=0)
res_accuracy5 = eval.compute(model_or_pipeline=pipe5, data=data, metric=accuracy,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
res_f15 = eval.compute(model_or_pipeline=pipe5, data=data, metric=f1,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
print({**res_accuracy5, **res_f15})

plt.plot(res_accuracy1['accuracy'], res_f11['f1'], marker='o', markersize=6, color="red")
plt.annotate('distilbert', xy=(res_accuracy1['accuracy']+0.001, res_f11['f1']))
plt.plot(res_accuracy2['accuracy'], res_f12['f1'], marker='o', markersize=6, color="blue")
plt.annotate('distilbert-base-uncased-finetuned', xy=(res_accuracy2['accuracy']+0.001, res_f12['f1']))
plt.plot(res_accuracy3['accuracy'], res_f13['f1'], marker='o', markersize=6, color="green")
plt.annotate('roberta-base', xy=(res_accuracy3['accuracy']-0.009, res_f13['f1']))
plt.plot(res_accuracy4['accuracy'], res_f14['f1'], marker='o', markersize=6, color="purple")
plt.annotate('funnel-transformer-small', xy=(res_accuracy4['accuracy']-0.015, res_f14['f1']))
plt.plot(res_accuracy5['accuracy'], res_f15['f1'], marker='o', markersize=6, color="black")
plt.annotate('SENATOR', xy=(res_accuracy5['accuracy']+0.001, res_f15['f1']))

plt.xlabel('Accuracy')
plt.ylabel('F1 Score') 
#plt.xlim([0.9, 1.0])
#plt.ylim([0.9, 1.0])
plt.title('Comparing the Models')
"""