danieldux committed on
Commit
a5b84dd
·
1 Parent(s): e847a58

Add tests notebook

Browse files
isco_rel_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"accuracy": 0.8695796975954173, "hierarchical_precision": 0.9876106194690265, "hierarchical_recall": 0.9911190053285968, "hierarchical_fmeasure": 0.9893617021276595}
isco_test_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"accuracy": 0.8611914401388086, "hierarchical_precision": 0.989010989010989, "hierarchical_recall": 0.9836065573770492, "hierarchical_fmeasure": 0.9863013698630136}
isco_validation_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"accuracy": 0.8576800694243564, "hierarchical_precision": 0.9757462686567164, "hierarchical_recall": 0.9812382739212008, "hierarchical_fmeasure": 0.9784845650140319}
language_results.csv ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Language,Accuracy,Hierarchical Precision,Hierarchical Recall,Hierarchical F1
2
+ da,0.7149425287356321,0.9314641744548287,0.8898809523809523,0.9101978691019786
3
+ en,0.9075297225891678,0.9578651685393258,0.9742857142857143,0.9660056657223796
4
+ es,0.8794080604534005,0.9774590163934426,0.9655870445344129,0.9714867617107942
5
+ fi,0.9286376274328082,0.9591836734693877,0.9733727810650887,0.9662261380323054
6
+ fr,0.5772994129158513,0.8571428571428571,0.8808864265927978,0.8688524590163934
7
+ it,0.9332579185520362,0.9616613418530351,0.9525316455696202,0.9570747217806042
8
+ kk,0.9313346228239845,0.9816849816849816,0.9710144927536232,0.97632058287796
9
+ ko,0.9369047619047619,0.9726962457337884,0.9827586206896551,0.9777015437392795
10
+ pt,0.8936170212765957,0.9591836734693877,0.9563953488372093,0.957787481804949
11
+ ru,0.9259259259259259,0.971875,0.9658385093167702,0.9688473520249222
12
+ sv,0.9726027397260274,0.9927007299270073,1.0,0.9963369963369962
13
+ Average,0.872860031121472,0.9566288056970947,0.9556865032750766,0.9560761429225966
tests.ipynb CHANGED
@@ -9,7 +9,7 @@
9
  },
10
  {
11
  "cell_type": "code",
12
- "execution_count": 18,
13
  "metadata": {},
14
  "outputs": [
15
  {
@@ -36,7 +36,7 @@
36
  },
37
  {
38
  "cell_type": "code",
39
- "execution_count": 3,
40
  "metadata": {},
41
  "outputs": [
42
  {
@@ -163,22 +163,9 @@
163
  },
164
  {
165
  "cell_type": "code",
166
- "execution_count": 17,
167
  "metadata": {},
168
- "outputs": [
169
- {
170
- "name": "stdout",
171
- "output_type": "stream",
172
- "text": [
173
- "ISCO CSV file downloaded\n",
174
- "Weighted ISCO hierarchy dictionary created\n",
175
- "{'1111': {'111': 0.75, '11': 0.5, '1': 0.25}, '1112': {'111': 0.75, '11': 0.5, '1': 0.25}, '1113': {'111': 0.75, '11': 0.5, '1': 0.25}, '1114': {'111': 0.75, '11': 0.5, '1': 0.25}, '1120': {'112': 0.75, '11': 0.5, '1': 0.25}, '1211': {'121': 0.75, '12': 0.5, '1': 0.25}, '1212': {'121': 0.75, '12': 0.5, '1': 0.25}, '1213': {'121': 0.75, '12': 0.5, '1': 0.25}, '1219': {'121': 0.75, '12': 0.5, '1': 0.25}, '1221': {'122': 0.75, '12': 0.5, '1': 0.25}, '1222': {'122': 0.75, '12': 0.5, '1': 0.25}, '1223': {'122': 0.75, '12': 0.5, '1': 0.25}, '1311': {'131': 0.75, '13': 0.5, '1': 0.25}, '1312': {'131': 0.75, '13': 0.5, '1': 0.25}, '1321': {'132': 0.75, '13': 0.5, '1': 0.25}, '1322': {'132': 0.75, '13': 0.5, '1': 0.25}, '1323': {'132': 0.75, '13': 0.5, '1': 0.25}, '1324': {'132': 0.75, '13': 0.5, '1': 0.25}, '1330': {'133': 0.75, '13': 0.5, '1': 0.25}, '1341': {'134': 0.75, '13': 0.5, '1': 0.25}, '1342': {'134': 0.75, '13': 0.5, '1': 0.25}, '1343': {'134': 0.75, '13': 0.5, '1': 0.25}, '1344': {'134': 0.75, '13': 0.5, '1': 0.25}, '1345': {'134': 0.75, '13': 0.5, '1': 0.25}, '1346': {'134': 0.75, '13': 0.5, '1': 0.25}, '1349': {'134': 0.75, '13': 0.5, '1': 0.25}, '1411': {'141': 0.75, '14': 0.5, '1': 0.25}, '1412': {'141': 0.75, '14': 0.5, '1': 0.25}, '1420': {'142': 0.75, '14': 0.5, '1': 0.25}, '1431': {'143': 0.75, '14': 0.5, '1': 0.25}, '1439': {'143': 0.75, '14': 0.5, '1': 0.25}, '2111': {'211': 0.75, '21': 0.5, '2': 0.25}, '2112': {'211': 0.75, '21': 0.5, '2': 0.25}, '2113': {'211': 0.75, '21': 0.5, '2': 0.25}, '2114': {'211': 0.75, '21': 0.5, '2': 0.25}, '2120': {'212': 0.75, '21': 0.5, '2': 0.25}, '2131': {'213': 0.75, '21': 0.5, '2': 0.25}, '2132': {'213': 0.75, '21': 0.5, '2': 0.25}, '2133': {'213': 0.75, '21': 0.5, '2': 0.25}, '2141': {'214': 0.75, '21': 0.5, '2': 0.25}, '2142': {'214': 0.75, '21': 0.5, '2': 0.25}, '2143': {'214': 0.75, '21': 0.5, '2': 0.25}, '2144': {'214': 0.75, '21': 0.5, '2': 0.25}, '2145': {'214': 0.75, '21': 0.5, '2': 0.25}, '2146': {'214': 
0.75, '21': 0.5, '2': 0.25}, '2149': {'214': 0.75, '21': 0.5, '2': 0.25}, '2151': {'215': 0.75, '21': 0.5, '2': 0.25}, '2152': {'215': 0.75, '21': 0.5, '2': 0.25}, '2153': {'215': 0.75, '21': 0.5, '2': 0.25}, '2161': {'216': 0.75, '21': 0.5, '2': 0.25}, '2162': {'216': 0.75, '21': 0.5, '2': 0.25}, '2163': {'216': 0.75, '21': 0.5, '2': 0.25}, '2164': {'216': 0.75, '21': 0.5, '2': 0.25}, '2165': {'216': 0.75, '21': 0.5, '2': 0.25}, '2166': {'216': 0.75, '21': 0.5, '2': 0.25}, '2211': {'221': 0.75, '22': 0.5, '2': 0.25}, '2212': {'221': 0.75, '22': 0.5, '2': 0.25}, '2221': {'222': 0.75, '22': 0.5, '2': 0.25}, '2222': {'222': 0.75, '22': 0.5, '2': 0.25}, '2230': {'223': 0.75, '22': 0.5, '2': 0.25}, '2240': {'224': 0.75, '22': 0.5, '2': 0.25}, '2250': {'225': 0.75, '22': 0.5, '2': 0.25}, '2261': {'226': 0.75, '22': 0.5, '2': 0.25}, '2262': {'226': 0.75, '22': 0.5, '2': 0.25}, '2263': {'226': 0.75, '22': 0.5, '2': 0.25}, '2264': {'226': 0.75, '22': 0.5, '2': 0.25}, '2265': {'226': 0.75, '22': 0.5, '2': 0.25}, '2266': {'226': 0.75, '22': 0.5, '2': 0.25}, '2267': {'226': 0.75, '22': 0.5, '2': 0.25}, '2269': {'226': 0.75, '22': 0.5, '2': 0.25}, '2310': {'231': 0.75, '23': 0.5, '2': 0.25}, '2320': {'232': 0.75, '23': 0.5, '2': 0.25}, '2330': {'233': 0.75, '23': 0.5, '2': 0.25}, '2341': {'234': 0.75, '23': 0.5, '2': 0.25}, '2342': {'234': 0.75, '23': 0.5, '2': 0.25}, '2351': {'235': 0.75, '23': 0.5, '2': 0.25}, '2352': {'235': 0.75, '23': 0.5, '2': 0.25}, '2353': {'235': 0.75, '23': 0.5, '2': 0.25}, '2354': {'235': 0.75, '23': 0.5, '2': 0.25}, '2355': {'235': 0.75, '23': 0.5, '2': 0.25}, '2356': {'235': 0.75, '23': 0.5, '2': 0.25}, '2359': {'235': 0.75, '23': 0.5, '2': 0.25}, '2411': {'241': 0.75, '24': 0.5, '2': 0.25}, '2412': {'241': 0.75, '24': 0.5, '2': 0.25}, '2413': {'241': 0.75, '24': 0.5, '2': 0.25}, '2421': {'242': 0.75, '24': 0.5, '2': 0.25}, '2422': {'242': 0.75, '24': 0.5, '2': 0.25}, '2423': {'242': 0.75, '24': 0.5, '2': 0.25}, '2424': {'242': 0.75, '24': 0.5, 
'2': 0.25}, '2431': {'243': 0.75, '24': 0.5, '2': 0.25}, '2432': {'243': 0.75, '24': 0.5, '2': 0.25}, '2433': {'243': 0.75, '24': 0.5, '2': 0.25}, '2434': {'243': 0.75, '24': 0.5, '2': 0.25}, '2511': {'251': 0.75, '25': 0.5, '2': 0.25}, '2512': {'251': 0.75, '25': 0.5, '2': 0.25}, '2513': {'251': 0.75, '25': 0.5, '2': 0.25}, '2514': {'251': 0.75, '25': 0.5, '2': 0.25}, '2519': {'251': 0.75, '25': 0.5, '2': 0.25}, '2521': {'252': 0.75, '25': 0.5, '2': 0.25}, '2522': {'252': 0.75, '25': 0.5, '2': 0.25}, '2523': {'252': 0.75, '25': 0.5, '2': 0.25}, '2529': {'252': 0.75, '25': 0.5, '2': 0.25}, '2611': {'261': 0.75, '26': 0.5, '2': 0.25}, '2612': {'261': 0.75, '26': 0.5, '2': 0.25}, '2619': {'261': 0.75, '26': 0.5, '2': 0.25}, '2621': {'262': 0.75, '26': 0.5, '2': 0.25}, '2622': {'262': 0.75, '26': 0.5, '2': 0.25}, '2631': {'263': 0.75, '26': 0.5, '2': 0.25}, '2632': {'263': 0.75, '26': 0.5, '2': 0.25}, '2633': {'263': 0.75, '26': 0.5, '2': 0.25}, '2634': {'263': 0.75, '26': 0.5, '2': 0.25}, '2635': {'263': 0.75, '26': 0.5, '2': 0.25}, '2636': {'263': 0.75, '26': 0.5, '2': 0.25}, '2641': {'264': 0.75, '26': 0.5, '2': 0.25}, '2642': {'264': 0.75, '26': 0.5, '2': 0.25}, '2643': {'264': 0.75, '26': 0.5, '2': 0.25}, '2651': {'265': 0.75, '26': 0.5, '2': 0.25}, '2652': {'265': 0.75, '26': 0.5, '2': 0.25}, '2653': {'265': 0.75, '26': 0.5, '2': 0.25}, '2654': {'265': 0.75, '26': 0.5, '2': 0.25}, '2655': {'265': 0.75, '26': 0.5, '2': 0.25}, '2656': {'265': 0.75, '26': 0.5, '2': 0.25}, '2659': {'265': 0.75, '26': 0.5, '2': 0.25}, '3111': {'311': 0.75, '31': 0.5, '3': 0.25}, '3112': {'311': 0.75, '31': 0.5, '3': 0.25}, '3113': {'311': 0.75, '31': 0.5, '3': 0.25}, '3114': {'311': 0.75, '31': 0.5, '3': 0.25}, '3115': {'311': 0.75, '31': 0.5, '3': 0.25}, '3116': {'311': 0.75, '31': 0.5, '3': 0.25}, '3117': {'311': 0.75, '31': 0.5, '3': 0.25}, '3118': {'311': 0.75, '31': 0.5, '3': 0.25}, '3119': {'311': 0.75, '31': 0.5, '3': 0.25}, '3121': {'312': 0.75, '31': 0.5, '3': 0.25}, '3122': 
{'312': 0.75, '31': 0.5, '3': 0.25}, '3123': {'312': 0.75, '31': 0.5, '3': 0.25}, '3131': {'313': 0.75, '31': 0.5, '3': 0.25}, '3132': {'313': 0.75, '31': 0.5, '3': 0.25}, '3133': {'313': 0.75, '31': 0.5, '3': 0.25}, '3134': {'313': 0.75, '31': 0.5, '3': 0.25}, '3135': {'313': 0.75, '31': 0.5, '3': 0.25}, '3139': {'313': 0.75, '31': 0.5, '3': 0.25}, '3141': {'314': 0.75, '31': 0.5, '3': 0.25}, '3142': {'314': 0.75, '31': 0.5, '3': 0.25}, '3143': {'314': 0.75, '31': 0.5, '3': 0.25}, '3151': {'315': 0.75, '31': 0.5, '3': 0.25}, '3152': {'315': 0.75, '31': 0.5, '3': 0.25}, '3153': {'315': 0.75, '31': 0.5, '3': 0.25}, '3154': {'315': 0.75, '31': 0.5, '3': 0.25}, '3155': {'315': 0.75, '31': 0.5, '3': 0.25}, '3211': {'321': 0.75, '32': 0.5, '3': 0.25}, '3212': {'321': 0.75, '32': 0.5, '3': 0.25}, '3213': {'321': 0.75, '32': 0.5, '3': 0.25}, '3214': {'321': 0.75, '32': 0.5, '3': 0.25}, '3221': {'322': 0.75, '32': 0.5, '3': 0.25}, '3222': {'322': 0.75, '32': 0.5, '3': 0.25}, '3230': {'323': 0.75, '32': 0.5, '3': 0.25}, '3240': {'324': 0.75, '32': 0.5, '3': 0.25}, '3251': {'325': 0.75, '32': 0.5, '3': 0.25}, '3252': {'325': 0.75, '32': 0.5, '3': 0.25}, '3253': {'325': 0.75, '32': 0.5, '3': 0.25}, '3254': {'325': 0.75, '32': 0.5, '3': 0.25}, '3255': {'325': 0.75, '32': 0.5, '3': 0.25}, '3256': {'325': 0.75, '32': 0.5, '3': 0.25}, '3257': {'325': 0.75, '32': 0.5, '3': 0.25}, '3258': {'325': 0.75, '32': 0.5, '3': 0.25}, '3259': {'325': 0.75, '32': 0.5, '3': 0.25}, '3311': {'331': 0.75, '33': 0.5, '3': 0.25}, '3312': {'331': 0.75, '33': 0.5, '3': 0.25}, '3313': {'331': 0.75, '33': 0.5, '3': 0.25}, '3314': {'331': 0.75, '33': 0.5, '3': 0.25}, '3315': {'331': 0.75, '33': 0.5, '3': 0.25}, '3321': {'332': 0.75, '33': 0.5, '3': 0.25}, '3322': {'332': 0.75, '33': 0.5, '3': 0.25}, '3323': {'332': 0.75, '33': 0.5, '3': 0.25}, '3324': {'332': 0.75, '33': 0.5, '3': 0.25}, '3331': {'333': 0.75, '33': 0.5, '3': 0.25}, '3332': {'333': 0.75, '33': 0.5, '3': 0.25}, '3333': {'333': 0.75, '33': 
0.5, '3': 0.25}, '3334': {'333': 0.75, '33': 0.5, '3': 0.25}, '3339': {'333': 0.75, '33': 0.5, '3': 0.25}, '3341': {'334': 0.75, '33': 0.5, '3': 0.25}, '3342': {'334': 0.75, '33': 0.5, '3': 0.25}, '3343': {'334': 0.75, '33': 0.5, '3': 0.25}, '3344': {'334': 0.75, '33': 0.5, '3': 0.25}, '3351': {'335': 0.75, '33': 0.5, '3': 0.25}, '3352': {'335': 0.75, '33': 0.5, '3': 0.25}, '3353': {'335': 0.75, '33': 0.5, '3': 0.25}, '3354': {'335': 0.75, '33': 0.5, '3': 0.25}, '3355': {'335': 0.75, '33': 0.5, '3': 0.25}, '3359': {'335': 0.75, '33': 0.5, '3': 0.25}, '3411': {'341': 0.75, '34': 0.5, '3': 0.25}, '3412': {'341': 0.75, '34': 0.5, '3': 0.25}, '3413': {'341': 0.75, '34': 0.5, '3': 0.25}, '3421': {'342': 0.75, '34': 0.5, '3': 0.25}, '3422': {'342': 0.75, '34': 0.5, '3': 0.25}, '3423': {'342': 0.75, '34': 0.5, '3': 0.25}, '3431': {'343': 0.75, '34': 0.5, '3': 0.25}, '3432': {'343': 0.75, '34': 0.5, '3': 0.25}, '3433': {'343': 0.75, '34': 0.5, '3': 0.25}, '3434': {'343': 0.75, '34': 0.5, '3': 0.25}, '3435': {'343': 0.75, '34': 0.5, '3': 0.25}, '3511': {'351': 0.75, '35': 0.5, '3': 0.25}, '3512': {'351': 0.75, '35': 0.5, '3': 0.25}, '3513': {'351': 0.75, '35': 0.5, '3': 0.25}, '3514': {'351': 0.75, '35': 0.5, '3': 0.25}, '3521': {'352': 0.75, '35': 0.5, '3': 0.25}, '3522': {'352': 0.75, '35': 0.5, '3': 0.25}, '4110': {'411': 0.75, '41': 0.5, '4': 0.25}, '4120': {'412': 0.75, '41': 0.5, '4': 0.25}, '4131': {'413': 0.75, '41': 0.5, '4': 0.25}, '4132': {'413': 0.75, '41': 0.5, '4': 0.25}, '4211': {'421': 0.75, '42': 0.5, '4': 0.25}, '4212': {'421': 0.75, '42': 0.5, '4': 0.25}, '4213': {'421': 0.75, '42': 0.5, '4': 0.25}, '4214': {'421': 0.75, '42': 0.5, '4': 0.25}, '4221': {'422': 0.75, '42': 0.5, '4': 0.25}, '4222': {'422': 0.75, '42': 0.5, '4': 0.25}, '4223': {'422': 0.75, '42': 0.5, '4': 0.25}, '4224': {'422': 0.75, '42': 0.5, '4': 0.25}, '4225': {'422': 0.75, '42': 0.5, '4': 0.25}, '4226': {'422': 0.75, '42': 0.5, '4': 0.25}, '4227': {'422': 0.75, '42': 0.5, '4': 0.25}, 
'4229': {'422': 0.75, '42': 0.5, '4': 0.25}, '4311': {'431': 0.75, '43': 0.5, '4': 0.25}, '4312': {'431': 0.75, '43': 0.5, '4': 0.25}, '4313': {'431': 0.75, '43': 0.5, '4': 0.25}, '4321': {'432': 0.75, '43': 0.5, '4': 0.25}, '4322': {'432': 0.75, '43': 0.5, '4': 0.25}, '4323': {'432': 0.75, '43': 0.5, '4': 0.25}, '4411': {'441': 0.75, '44': 0.5, '4': 0.25}, '4412': {'441': 0.75, '44': 0.5, '4': 0.25}, '4413': {'441': 0.75, '44': 0.5, '4': 0.25}, '4414': {'441': 0.75, '44': 0.5, '4': 0.25}, '4415': {'441': 0.75, '44': 0.5, '4': 0.25}, '4416': {'441': 0.75, '44': 0.5, '4': 0.25}, '4419': {'441': 0.75, '44': 0.5, '4': 0.25}, '5111': {'511': 0.75, '51': 0.5, '5': 0.25}, '5112': {'511': 0.75, '51': 0.5, '5': 0.25}, '5113': {'511': 0.75, '51': 0.5, '5': 0.25}, '5120': {'512': 0.75, '51': 0.5, '5': 0.25}, '5131': {'513': 0.75, '51': 0.5, '5': 0.25}, '5132': {'513': 0.75, '51': 0.5, '5': 0.25}, '5141': {'514': 0.75, '51': 0.5, '5': 0.25}, '5142': {'514': 0.75, '51': 0.5, '5': 0.25}, '5151': {'515': 0.75, '51': 0.5, '5': 0.25}, '5152': {'515': 0.75, '51': 0.5, '5': 0.25}, '5153': {'515': 0.75, '51': 0.5, '5': 0.25}, '5161': {'516': 0.75, '51': 0.5, '5': 0.25}, '5162': {'516': 0.75, '51': 0.5, '5': 0.25}, '5163': {'516': 0.75, '51': 0.5, '5': 0.25}, '5164': {'516': 0.75, '51': 0.5, '5': 0.25}, '5165': {'516': 0.75, '51': 0.5, '5': 0.25}, '5169': {'516': 0.75, '51': 0.5, '5': 0.25}, '5211': {'521': 0.75, '52': 0.5, '5': 0.25}, '5212': {'521': 0.75, '52': 0.5, '5': 0.25}, '5221': {'522': 0.75, '52': 0.5, '5': 0.25}, '5222': {'522': 0.75, '52': 0.5, '5': 0.25}, '5223': {'522': 0.75, '52': 0.5, '5': 0.25}, '5230': {'523': 0.75, '52': 0.5, '5': 0.25}, '5241': {'524': 0.75, '52': 0.5, '5': 0.25}, '5242': {'524': 0.75, '52': 0.5, '5': 0.25}, '5243': {'524': 0.75, '52': 0.5, '5': 0.25}, '5244': {'524': 0.75, '52': 0.5, '5': 0.25}, '5245': {'524': 0.75, '52': 0.5, '5': 0.25}, '5246': {'524': 0.75, '52': 0.5, '5': 0.25}, '5249': {'524': 0.75, '52': 0.5, '5': 0.25}, '5311': {'531': 
0.75, '53': 0.5, '5': 0.25}, '5312': {'531': 0.75, '53': 0.5, '5': 0.25}, '5321': {'532': 0.75, '53': 0.5, '5': 0.25}, '5322': {'532': 0.75, '53': 0.5, '5': 0.25}, '5329': {'532': 0.75, '53': 0.5, '5': 0.25}, '5411': {'541': 0.75, '54': 0.5, '5': 0.25}, '5412': {'541': 0.75, '54': 0.5, '5': 0.25}, '5413': {'541': 0.75, '54': 0.5, '5': 0.25}, '5414': {'541': 0.75, '54': 0.5, '5': 0.25}, '5419': {'541': 0.75, '54': 0.5, '5': 0.25}, '6111': {'611': 0.75, '61': 0.5, '6': 0.25}, '6112': {'611': 0.75, '61': 0.5, '6': 0.25}, '6113': {'611': 0.75, '61': 0.5, '6': 0.25}, '6114': {'611': 0.75, '61': 0.5, '6': 0.25}, '6121': {'612': 0.75, '61': 0.5, '6': 0.25}, '6122': {'612': 0.75, '61': 0.5, '6': 0.25}, '6123': {'612': 0.75, '61': 0.5, '6': 0.25}, '6129': {'612': 0.75, '61': 0.5, '6': 0.25}, '6130': {'613': 0.75, '61': 0.5, '6': 0.25}, '6210': {'621': 0.75, '62': 0.5, '6': 0.25}, '6221': {'622': 0.75, '62': 0.5, '6': 0.25}, '6222': {'622': 0.75, '62': 0.5, '6': 0.25}, '6223': {'622': 0.75, '62': 0.5, '6': 0.25}, '6224': {'622': 0.75, '62': 0.5, '6': 0.25}, '6310': {'631': 0.75, '63': 0.5, '6': 0.25}, '6320': {'632': 0.75, '63': 0.5, '6': 0.25}, '6330': {'633': 0.75, '63': 0.5, '6': 0.25}, '6340': {'634': 0.75, '63': 0.5, '6': 0.25}, '7111': {'711': 0.75, '71': 0.5, '7': 0.25}, '7112': {'711': 0.75, '71': 0.5, '7': 0.25}, '7113': {'711': 0.75, '71': 0.5, '7': 0.25}, '7114': {'711': 0.75, '71': 0.5, '7': 0.25}, '7115': {'711': 0.75, '71': 0.5, '7': 0.25}, '7119': {'711': 0.75, '71': 0.5, '7': 0.25}, '7121': {'712': 0.75, '71': 0.5, '7': 0.25}, '7122': {'712': 0.75, '71': 0.5, '7': 0.25}, '7123': {'712': 0.75, '71': 0.5, '7': 0.25}, '7124': {'712': 0.75, '71': 0.5, '7': 0.25}, '7125': {'712': 0.75, '71': 0.5, '7': 0.25}, '7126': {'712': 0.75, '71': 0.5, '7': 0.25}, '7127': {'712': 0.75, '71': 0.5, '7': 0.25}, '7131': {'713': 0.75, '71': 0.5, '7': 0.25}, '7132': {'713': 0.75, '71': 0.5, '7': 0.25}, '7133': {'713': 0.75, '71': 0.5, '7': 0.25}, '7211': {'721': 0.75, '72': 0.5, 
'7': 0.25}, '7212': {'721': 0.75, '72': 0.5, '7': 0.25}, '7213': {'721': 0.75, '72': 0.5, '7': 0.25}, '7214': {'721': 0.75, '72': 0.5, '7': 0.25}, '7215': {'721': 0.75, '72': 0.5, '7': 0.25}, '7221': {'722': 0.75, '72': 0.5, '7': 0.25}, '7222': {'722': 0.75, '72': 0.5, '7': 0.25}, '7223': {'722': 0.75, '72': 0.5, '7': 0.25}, '7224': {'722': 0.75, '72': 0.5, '7': 0.25}, '7231': {'723': 0.75, '72': 0.5, '7': 0.25}, '7232': {'723': 0.75, '72': 0.5, '7': 0.25}, '7233': {'723': 0.75, '72': 0.5, '7': 0.25}, '7234': {'723': 0.75, '72': 0.5, '7': 0.25}, '7311': {'731': 0.75, '73': 0.5, '7': 0.25}, '7312': {'731': 0.75, '73': 0.5, '7': 0.25}, '7313': {'731': 0.75, '73': 0.5, '7': 0.25}, '7314': {'731': 0.75, '73': 0.5, '7': 0.25}, '7315': {'731': 0.75, '73': 0.5, '7': 0.25}, '7316': {'731': 0.75, '73': 0.5, '7': 0.25}, '7317': {'731': 0.75, '73': 0.5, '7': 0.25}, '7318': {'731': 0.75, '73': 0.5, '7': 0.25}, '7319': {'731': 0.75, '73': 0.5, '7': 0.25}, '7321': {'732': 0.75, '73': 0.5, '7': 0.25}, '7322': {'732': 0.75, '73': 0.5, '7': 0.25}, '7323': {'732': 0.75, '73': 0.5, '7': 0.25}, '7411': {'741': 0.75, '74': 0.5, '7': 0.25}, '7412': {'741': 0.75, '74': 0.5, '7': 0.25}, '7413': {'741': 0.75, '74': 0.5, '7': 0.25}, '7421': {'742': 0.75, '74': 0.5, '7': 0.25}, '7422': {'742': 0.75, '74': 0.5, '7': 0.25}, '7511': {'751': 0.75, '75': 0.5, '7': 0.25}, '7512': {'751': 0.75, '75': 0.5, '7': 0.25}, '7513': {'751': 0.75, '75': 0.5, '7': 0.25}, '7514': {'751': 0.75, '75': 0.5, '7': 0.25}, '7515': {'751': 0.75, '75': 0.5, '7': 0.25}, '7516': {'751': 0.75, '75': 0.5, '7': 0.25}, '7521': {'752': 0.75, '75': 0.5, '7': 0.25}, '7522': {'752': 0.75, '75': 0.5, '7': 0.25}, '7523': {'752': 0.75, '75': 0.5, '7': 0.25}, '7531': {'753': 0.75, '75': 0.5, '7': 0.25}, '7532': {'753': 0.75, '75': 0.5, '7': 0.25}, '7533': {'753': 0.75, '75': 0.5, '7': 0.25}, '7534': {'753': 0.75, '75': 0.5, '7': 0.25}, '7535': {'753': 0.75, '75': 0.5, '7': 0.25}, '7536': {'753': 0.75, '75': 0.5, '7': 0.25}, '7541': 
{'754': 0.75, '75': 0.5, '7': 0.25}, '7542': {'754': 0.75, '75': 0.5, '7': 0.25}, '7543': {'754': 0.75, '75': 0.5, '7': 0.25}, '7544': {'754': 0.75, '75': 0.5, '7': 0.25}, '7549': {'754': 0.75, '75': 0.5, '7': 0.25}, '8111': {'811': 0.75, '81': 0.5, '8': 0.25}, '8112': {'811': 0.75, '81': 0.5, '8': 0.25}, '8113': {'811': 0.75, '81': 0.5, '8': 0.25}, '8114': {'811': 0.75, '81': 0.5, '8': 0.25}, '8121': {'812': 0.75, '81': 0.5, '8': 0.25}, '8122': {'812': 0.75, '81': 0.5, '8': 0.25}, '8131': {'813': 0.75, '81': 0.5, '8': 0.25}, '8132': {'813': 0.75, '81': 0.5, '8': 0.25}, '8141': {'814': 0.75, '81': 0.5, '8': 0.25}, '8142': {'814': 0.75, '81': 0.5, '8': 0.25}, '8143': {'814': 0.75, '81': 0.5, '8': 0.25}, '8151': {'815': 0.75, '81': 0.5, '8': 0.25}, '8152': {'815': 0.75, '81': 0.5, '8': 0.25}, '8153': {'815': 0.75, '81': 0.5, '8': 0.25}, '8154': {'815': 0.75, '81': 0.5, '8': 0.25}, '8155': {'815': 0.75, '81': 0.5, '8': 0.25}, '8156': {'815': 0.75, '81': 0.5, '8': 0.25}, '8157': {'815': 0.75, '81': 0.5, '8': 0.25}, '8159': {'815': 0.75, '81': 0.5, '8': 0.25}, '8160': {'816': 0.75, '81': 0.5, '8': 0.25}, '8171': {'817': 0.75, '81': 0.5, '8': 0.25}, '8172': {'817': 0.75, '81': 0.5, '8': 0.25}, '8181': {'818': 0.75, '81': 0.5, '8': 0.25}, '8182': {'818': 0.75, '81': 0.5, '8': 0.25}, '8183': {'818': 0.75, '81': 0.5, '8': 0.25}, '8189': {'818': 0.75, '81': 0.5, '8': 0.25}, '8211': {'821': 0.75, '82': 0.5, '8': 0.25}, '8212': {'821': 0.75, '82': 0.5, '8': 0.25}, '8219': {'821': 0.75, '82': 0.5, '8': 0.25}, '8311': {'831': 0.75, '83': 0.5, '8': 0.25}, '8312': {'831': 0.75, '83': 0.5, '8': 0.25}, '8321': {'832': 0.75, '83': 0.5, '8': 0.25}, '8322': {'832': 0.75, '83': 0.5, '8': 0.25}, '8331': {'833': 0.75, '83': 0.5, '8': 0.25}, '8332': {'833': 0.75, '83': 0.5, '8': 0.25}, '8341': {'834': 0.75, '83': 0.5, '8': 0.25}, '8342': {'834': 0.75, '83': 0.5, '8': 0.25}, '8343': {'834': 0.75, '83': 0.5, '8': 0.25}, '8344': {'834': 0.75, '83': 0.5, '8': 0.25}, '8350': {'835': 0.75, '83': 
0.5, '8': 0.25}, '9111': {'911': 0.75, '91': 0.5, '9': 0.25}, '9112': {'911': 0.75, '91': 0.5, '9': 0.25}, '9121': {'912': 0.75, '91': 0.5, '9': 0.25}, '9122': {'912': 0.75, '91': 0.5, '9': 0.25}, '9123': {'912': 0.75, '91': 0.5, '9': 0.25}, '9129': {'912': 0.75, '91': 0.5, '9': 0.25}, '9211': {'921': 0.75, '92': 0.5, '9': 0.25}, '9212': {'921': 0.75, '92': 0.5, '9': 0.25}, '9213': {'921': 0.75, '92': 0.5, '9': 0.25}, '9214': {'921': 0.75, '92': 0.5, '9': 0.25}, '9215': {'921': 0.75, '92': 0.5, '9': 0.25}, '9216': {'921': 0.75, '92': 0.5, '9': 0.25}, '9311': {'931': 0.75, '93': 0.5, '9': 0.25}, '9312': {'931': 0.75, '93': 0.5, '9': 0.25}, '9313': {'931': 0.75, '93': 0.5, '9': 0.25}, '9321': {'932': 0.75, '93': 0.5, '9': 0.25}, '9329': {'932': 0.75, '93': 0.5, '9': 0.25}, '9331': {'933': 0.75, '93': 0.5, '9': 0.25}, '9332': {'933': 0.75, '93': 0.5, '9': 0.25}, '9333': {'933': 0.75, '93': 0.5, '9': 0.25}, '9334': {'933': 0.75, '93': 0.5, '9': 0.25}, '9411': {'941': 0.75, '94': 0.5, '9': 0.25}, '9412': {'941': 0.75, '94': 0.5, '9': 0.25}, '9510': {'951': 0.75, '95': 0.5, '9': 0.25}, '9520': {'952': 0.75, '95': 0.5, '9': 0.25}, '9611': {'961': 0.75, '96': 0.5, '9': 0.25}, '9612': {'961': 0.75, '96': 0.5, '9': 0.25}, '9613': {'961': 0.75, '96': 0.5, '9': 0.25}, '9621': {'962': 0.75, '96': 0.5, '9': 0.25}, '9622': {'962': 0.75, '96': 0.5, '9': 0.25}, '9623': {'962': 0.75, '96': 0.5, '9': 0.25}, '9624': {'962': 0.75, '96': 0.5, '9': 0.25}, '9629': {'962': 0.75, '96': 0.5, '9': 0.25}, '0110': {'011': 0.75, '01': 0.5, '0': 0.25}, '0210': {'021': 0.75, '02': 0.5, '0': 0.25}, '0310': {'031': 0.75, '03': 0.5, '0': 0.25}}\n",
176
- "Accuracy: 0.8611914401388086\n",
177
- "Hierarchical Precision: 0.989010989010989, Hierarchical Recall: 0.9836065573770492, Hierarchical F-measure: 0.9863013698630136\n",
178
- "Evaluation results saved to isco_results.txt\n"
179
- ]
180
- }
181
- ],
182
  "source": [
183
  "import os\n",
184
  "from datasets import load_dataset\n",
@@ -203,6 +190,10 @@
203
  " \"ICILS/multilingual_parental_occupations\", split=\"test\", token=hf_token\n",
204
  ")\n",
205
  "\n",
 
 
 
 
206
  "# Initialize the pipeline\n",
207
  "pipe = pipeline(\"text-classification\", model=\"ICILS/XLM-R-ISCO\", token=hf_token)\n",
208
  "\n",
@@ -211,6 +202,32 @@
211
  " # ISCO_CODE_TITLE is a string like \"7412 Electrical Mechanics and Fitters\" so we need to extract the first part for the evaluation.\n",
212
  " return isco_code_title.split()[0]\n",
213
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  "# Evaluate the model\n",
215
  "predictions = []\n",
216
  "references = []\n",
@@ -219,25 +236,1051 @@
219
  " # Predict\n",
220
  " prediction = pipe(\n",
221
  " example[\"JOB_DUTIES\"]\n",
222
- " ) # Use the correct key \"JOB_DUTIES\" for the text data\n",
223
  " predicted_label = extract_isco_code(prediction[0][\"label\"])\n",
224
  " predictions.append(predicted_label)\n",
225
  "\n",
226
  " # Reference\n",
227
- " reference_label = example[\"ISCO\"] # Use the correct key \"ISCO\" for the ISCO code\n",
228
  " references.append(reference_label)\n",
229
  "\n",
230
- "# Initialize the hierarchical accuracy measure\n",
231
- "hierarchical_accuracy = evaluate.load(\"danieldux/isco_hierarchical_accuracy\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  "\n",
233
  "# Compute the hierarchical accuracy\n",
234
- "results = hierarchical_accuracy.compute(predictions=predictions, references=references)\n",
235
  "\n",
236
  "# Save the results to a JSON file\n",
237
- "with open(\"isco_results.json\", \"w\") as f:\n",
238
- " json.dump(results, f)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  "\n",
240
- "print(\"Evaluation results saved to isco_results.json\")"
 
241
  ]
242
  }
243
  ],
 
9
  },
10
  {
11
  "cell_type": "code",
12
+ "execution_count": 1,
13
  "metadata": {},
14
  "outputs": [
15
  {
 
36
  },
37
  {
38
  "cell_type": "code",
39
+ "execution_count": 2,
40
  "metadata": {},
41
  "outputs": [
42
  {
 
163
  },
164
  {
165
  "cell_type": "code",
166
+ "execution_count": null,
167
  "metadata": {},
168
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  "source": [
170
  "import os\n",
171
  "from datasets import load_dataset\n",
 
190
  " \"ICILS/multilingual_parental_occupations\", split=\"test\", token=hf_token\n",
191
  ")\n",
192
  "\n",
193
+ "validation_data = load_dataset(\n",
194
+ " \"ICILS/multilingual_parental_occupations\", split=\"validation\", token=hf_token\n",
195
+ ")\n",
196
+ "\n",
197
  "# Initialize the pipeline\n",
198
  "pipe = pipeline(\"text-classification\", model=\"ICILS/XLM-R-ISCO\", token=hf_token)\n",
199
  "\n",
 
202
  " # ISCO_CODE_TITLE is a string like \"7412 Electrical Mechanics and Fitters\" so we need to extract the first part for the evaluation.\n",
203
  " return isco_code_title.split()[0]\n",
204
  "\n",
205
+ "# Initialize the hierarchical accuracy measure\n",
206
+ "hierarchical_accuracy = evaluate.load(\"danieldux/isco_hierarchical_accuracy\")"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "markdown",
211
+ "metadata": {},
212
+ "source": [
213
+ "## Test set"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": 2,
219
+ "metadata": {},
220
+ "outputs": [
221
+ {
222
+ "name": "stdout",
223
+ "output_type": "stream",
224
+ "text": [
225
+ "Accuracy: 0.8611914401388086, Hierarchical Precision: 0.989010989010989, Hierarchical Recall: 0.9836065573770492, Hierarchical F-measure: 0.9863013698630136\n",
226
+ "Evaluation results saved to isco_test_results.json\n"
227
+ ]
228
+ }
229
+ ],
230
+ "source": [
231
  "# Evaluate the model\n",
232
  "predictions = []\n",
233
  "references = []\n",
 
236
  " # Predict\n",
237
  " prediction = pipe(\n",
238
  " example[\"JOB_DUTIES\"]\n",
239
+ " ) # Use the key \"JOB_DUTIES\" for the text data\n",
240
  " predicted_label = extract_isco_code(prediction[0][\"label\"])\n",
241
  " predictions.append(predicted_label)\n",
242
  "\n",
243
  " # Reference\n",
244
+ " reference_label = example[\"ISCO\"] # Use the key \"ISCO\" for the ISCO code\n",
245
  " references.append(reference_label)\n",
246
  "\n",
247
+ "# Compute the hierarchical accuracy\n",
248
+ "test_results = hierarchical_accuracy.compute(predictions=predictions, references=references)\n",
249
+ "\n",
250
+ "# Save the results to a JSON file\n",
251
+ "with open(\"isco_test_results.json\", \"w\") as f:\n",
252
+ " json.dump(test_results, f)\n",
253
+ "\n",
254
+ "print(\"Evaluation results saved to isco_test_results.json\")"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "markdown",
259
+ "metadata": {},
260
+ "source": [
261
+ "## Validation set"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": 78,
267
+ "metadata": {},
268
+ "outputs": [
269
+ {
270
+ "name": "stdout",
271
+ "output_type": "stream",
272
+ "text": [
273
+ "Accuracy: 0.8576800694243564, Hierarchical Precision: 0.9757462686567164, Hierarchical Recall: 0.9812382739212008, Hierarchical F-measure: 0.9784845650140319\n",
274
+ "Evaluation results saved to isco_validation_results.json\n"
275
+ ]
276
+ }
277
+ ],
278
+ "source": [
279
+ "# Evaluate the model\n",
280
+ "predictions = []\n",
281
+ "references = []\n",
282
+ "for example in validation_data:\n",
283
+ "\n",
284
+ " # Predict\n",
285
+ " prediction = pipe(\n",
286
+ " example[\"JOB_DUTIES\"]\n",
287
+ " ) # Use the key \"JOB_DUTIES\" for the text data\n",
288
+ " predicted_label = extract_isco_code(prediction[0][\"label\"])\n",
289
+ " predictions.append(predicted_label)\n",
290
+ "\n",
291
+ " # Reference\n",
292
+ " reference_label = example[\"ISCO\"] # Use the key \"ISCO\" for the ISCO code\n",
293
+ " references.append(reference_label)\n",
294
  "\n",
295
  "# Compute the hierarchical accuracy\n",
296
+ "validation_results = hierarchical_accuracy.compute(predictions=predictions, references=references)\n",
297
  "\n",
298
  "# Save the results to a JSON file\n",
299
+ "with open(\"isco_validation_results.json\", \"w\") as f:\n",
300
+ " json.dump(validation_results, f)\n",
301
+ "\n",
302
+ "print(\"Evaluation results saved to isco_validation_results.json\")"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "markdown",
307
+ "metadata": {},
308
+ "source": [
309
+ "# Inter-rater agreement"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": 70,
315
+ "metadata": {},
316
+ "outputs": [],
317
+ "source": [
318
+ "import pandas as pd\n",
319
+ "\n",
320
+ "# icils_isco_int_ml = \"/datasets/isco-data/processed/2018/icils_2018_isco_ml.parquet\"\n",
321
+ "icils_isco_int_ml = \"gs://isco-data-asia-southeast1/processed/2018/icils_2018_isco_ml.parquet\"\n",
322
+ "\n",
323
+ "icils_df = pd.read_parquet(icils_isco_int_ml)[['JOB', 'DUTIES', 'ISCO', 'ISCO_REL', 'LANGUAGE']]\n",
324
+ "\n",
325
+ "# Create a new pandas dataframe with samples that have ISCO values\n",
326
+ "isco_rel_df = icils_df[icils_df['ISCO'].notna()].copy()\n",
327
+ "\n",
328
+ "# remove rows with None values in ISCO_REL\n",
329
+ "isco_rel_df = isco_rel_df[isco_rel_df['ISCO_REL'].notna()]\n",
330
+ "\n",
331
+ "# Group the DataFrame by LANGUAGE column\n",
332
+ "grouped_df = isco_rel_df.groupby('LANGUAGE')"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": 79,
338
+ "metadata": {},
339
+ "outputs": [
340
+ {
341
+ "data": {
342
+ "text/plain": [
343
+ "<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f2c318dd350>"
344
+ ]
345
+ },
346
+ "execution_count": 79,
347
+ "metadata": {},
348
+ "output_type": "execute_result"
349
+ }
350
+ ],
351
+ "source": [
352
+ "grouped_df"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": 77,
358
+ "metadata": {},
359
+ "outputs": [
360
+ {
361
+ "name": "stdout",
362
+ "output_type": "stream",
363
+ "text": [
364
+ "Accuracy: 0.7149425287356321, Hierarchical Precision: 0.9314641744548287, Hierarchical Recall: 0.8898809523809523, Hierarchical F-measure: 0.9101978691019786\n",
365
+ "Language: da\n",
366
+ "Result: {'accuracy': 0.7149425287356321, 'hierarchical_precision': 0.9314641744548287, 'hierarchical_recall': 0.8898809523809523, 'hierarchical_fmeasure': 0.9101978691019786}\n",
367
+ "\n",
368
+ "Accuracy: 0.9075297225891678, Hierarchical Precision: 0.9578651685393258, Hierarchical Recall: 0.9742857142857143, Hierarchical F-measure: 0.9660056657223796\n",
369
+ "Language: en\n",
370
+ "Result: {'accuracy': 0.9075297225891678, 'hierarchical_precision': 0.9578651685393258, 'hierarchical_recall': 0.9742857142857143, 'hierarchical_fmeasure': 0.9660056657223796}\n",
371
+ "\n",
372
+ "Accuracy: 0.8794080604534005, Hierarchical Precision: 0.9774590163934426, Hierarchical Recall: 0.9655870445344129, Hierarchical F-measure: 0.9714867617107942\n",
373
+ "Language: es\n",
374
+ "Result: {'accuracy': 0.8794080604534005, 'hierarchical_precision': 0.9774590163934426, 'hierarchical_recall': 0.9655870445344129, 'hierarchical_fmeasure': 0.9714867617107942}\n",
375
+ "\n",
376
+ "Accuracy: 0.9286376274328082, Hierarchical Precision: 0.9591836734693877, Hierarchical Recall: 0.9733727810650887, Hierarchical F-measure: 0.9662261380323054\n",
377
+ "Language: fi\n",
378
+ "Result: {'accuracy': 0.9286376274328082, 'hierarchical_precision': 0.9591836734693877, 'hierarchical_recall': 0.9733727810650887, 'hierarchical_fmeasure': 0.9662261380323054}\n",
379
+ "\n",
380
+ "Accuracy: 0.5772994129158513, Hierarchical Precision: 0.8571428571428571, Hierarchical Recall: 0.8808864265927978, Hierarchical F-measure: 0.8688524590163934\n",
381
+ "Language: fr\n",
382
+ "Result: {'accuracy': 0.5772994129158513, 'hierarchical_precision': 0.8571428571428571, 'hierarchical_recall': 0.8808864265927978, 'hierarchical_fmeasure': 0.8688524590163934}\n",
383
+ "\n",
384
+ "Accuracy: 0.9332579185520362, Hierarchical Precision: 0.9616613418530351, Hierarchical Recall: 0.9525316455696202, Hierarchical F-measure: 0.9570747217806042\n",
385
+ "Language: it\n",
386
+ "Result: {'accuracy': 0.9332579185520362, 'hierarchical_precision': 0.9616613418530351, 'hierarchical_recall': 0.9525316455696202, 'hierarchical_fmeasure': 0.9570747217806042}\n",
387
+ "\n",
388
+ "Accuracy: 0.9313346228239845, Hierarchical Precision: 0.9816849816849816, Hierarchical Recall: 0.9710144927536232, Hierarchical F-measure: 0.97632058287796\n",
389
+ "Language: kk\n",
390
+ "Result: {'accuracy': 0.9313346228239845, 'hierarchical_precision': 0.9816849816849816, 'hierarchical_recall': 0.9710144927536232, 'hierarchical_fmeasure': 0.97632058287796}\n",
391
+ "\n",
392
+ "Accuracy: 0.9369047619047619, Hierarchical Precision: 0.9726962457337884, Hierarchical Recall: 0.9827586206896551, Hierarchical F-measure: 0.9777015437392795\n",
393
+ "Language: ko\n",
394
+ "Result: {'accuracy': 0.9369047619047619, 'hierarchical_precision': 0.9726962457337884, 'hierarchical_recall': 0.9827586206896551, 'hierarchical_fmeasure': 0.9777015437392795}\n",
395
+ "\n",
396
+ "Accuracy: 0.8936170212765957, Hierarchical Precision: 0.9591836734693877, Hierarchical Recall: 0.9563953488372093, Hierarchical F-measure: 0.957787481804949\n",
397
+ "Language: pt\n",
398
+ "Result: {'accuracy': 0.8936170212765957, 'hierarchical_precision': 0.9591836734693877, 'hierarchical_recall': 0.9563953488372093, 'hierarchical_fmeasure': 0.957787481804949}\n",
399
+ "\n",
400
+ "Accuracy: 0.9259259259259259, Hierarchical Precision: 0.971875, Hierarchical Recall: 0.9658385093167702, Hierarchical F-measure: 0.9688473520249222\n",
401
+ "Language: ru\n",
402
+ "Result: {'accuracy': 0.9259259259259259, 'hierarchical_precision': 0.971875, 'hierarchical_recall': 0.9658385093167702, 'hierarchical_fmeasure': 0.9688473520249222}\n",
403
+ "\n",
404
+ "Accuracy: 0.9726027397260274, Hierarchical Precision: 0.9927007299270073, Hierarchical Recall: 1.0, Hierarchical F-measure: 0.9963369963369962\n",
405
+ "Language: sv\n",
406
+ "Result: {'accuracy': 0.9726027397260274, 'hierarchical_precision': 0.9927007299270073, 'hierarchical_recall': 1.0, 'hierarchical_fmeasure': 0.9963369963369962}\n",
407
+ "\n"
408
+ ]
409
+ },
410
+ {
411
+ "name": "stderr",
412
+ "output_type": "stream",
413
+ "text": [
414
+ "/tmp/ipykernel_29614/1496722815.py:17: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
415
+ " results_df = pd.concat([results_df, group_result_df], ignore_index=True)\n"
416
+ ]
417
+ }
418
+ ],
419
+ "source": [
420
+ "\n",
421
+ "results_df = pd.DataFrame(columns=['Language', 'Accuracy', 'Hierarchical Precision', 'Hierarchical Recall', 'Hierarchical F1'])\n",
422
+ "\n",
423
+ "# Iterate over each group\n",
424
+ "for language, group in grouped_df:\n",
425
+ " references = group['ISCO'].tolist()\n",
426
+ " predictions = group['ISCO_REL'].tolist()\n",
427
+ " \n",
428
+ " # Apply the compute function\n",
429
+ " rel_result = hierarchical_accuracy.compute(references=references, predictions=predictions)\n",
430
+ " \n",
431
+ " # Create a new DataFrame with the result for the current group\n",
432
+ " group_result_df = pd.DataFrame({'Language': [language], 'Accuracy': [rel_result['accuracy']], 'Hierarchical Precision': [rel_result['hierarchical_precision']], 'Hierarchical Recall': [rel_result['hierarchical_recall']], 'Hierarchical F1': [rel_result['hierarchical_fmeasure']]})\n",
433
+ " \n",
434
+ " # Concatenate the group_result_df with the results_df\n",
435
+ " results_df = pd.concat([results_df, group_result_df], ignore_index=True)\n",
436
+ " \n",
437
+ " # Print the result\n",
438
+ " print(f\"Language: {language}\")\n",
439
+ " # print(f\"References: {references}\")\n",
440
+ " # print(f\"Predictions: {predictions}\")\n",
441
+ " print(f\"Result: {rel_result}\")\n",
442
+ " print()\n",
443
+ "\n",
444
+ "average_accuracy = results_df['Accuracy'].mean()\n",
445
+ "average_hierarchical_precision = results_df['Hierarchical Precision'].mean()\n",
446
+ "average_hierarchical_recall = results_df['Hierarchical Recall'].mean()\n",
447
+ "average_hierarchical_f1 = results_df['Hierarchical F1'].mean()\n",
448
+ "\n",
449
+ "average_row = ['Average', average_accuracy, average_hierarchical_precision, average_hierarchical_recall, average_hierarchical_f1]\n",
450
+ "results_df.loc[len(results_df)] = average_row\n",
451
+ "\n",
452
+ "\n",
453
+ "results_df.to_csv('language_results.csv', index=False)"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": 62,
459
+ "metadata": {},
460
+ "outputs": [
461
+ {
462
+ "data": {
463
+ "text/html": [
464
+ "<div>\n",
465
+ "<style scoped>\n",
466
+ " .dataframe tbody tr th:only-of-type {\n",
467
+ " vertical-align: middle;\n",
468
+ " }\n",
469
+ "\n",
470
+ " .dataframe tbody tr th {\n",
471
+ " vertical-align: top;\n",
472
+ " }\n",
473
+ "\n",
474
+ " .dataframe thead th {\n",
475
+ " text-align: right;\n",
476
+ " }\n",
477
+ "</style>\n",
478
+ "<table border=\"1\" class=\"dataframe\">\n",
479
+ " <thead>\n",
480
+ " <tr style=\"text-align: right;\">\n",
481
+ " <th></th>\n",
482
+ " <th>JOB</th>\n",
483
+ " <th>DUTIES</th>\n",
484
+ " <th>ISCO</th>\n",
485
+ " <th>ISCO_REL</th>\n",
486
+ " <th>LANGUAGE</th>\n",
487
+ " </tr>\n",
488
+ " </thead>\n",
489
+ " <tbody>\n",
490
+ " <tr>\n",
491
+ " <th>0</th>\n",
492
+ " <td>acopio</td>\n",
493
+ " <td>recibe tarros con leche y despues hecha la lec...</td>\n",
494
+ " <td>9333</td>\n",
495
+ " <td>9333</td>\n",
496
+ " <td>es</td>\n",
497
+ " </tr>\n",
498
+ " <tr>\n",
499
+ " <th>5</th>\n",
500
+ " <td>yo vivo con mi abuela y abuelo mi abuela o tr...</td>\n",
501
+ " <td>mi mama trabaja en limpiar las casas</td>\n",
502
+ " <td>9111</td>\n",
503
+ " <td>9111</td>\n",
504
+ " <td>es</td>\n",
505
+ " </tr>\n",
506
+ " <tr>\n",
507
+ " <th>9</th>\n",
508
+ " <td>dueña de casa</td>\n",
509
+ " <td>mantiene el orden de la casa</td>\n",
510
+ " <td>9701</td>\n",
511
+ " <td>9701</td>\n",
512
+ " <td>es</td>\n",
513
+ " </tr>\n",
514
+ " <tr>\n",
515
+ " <th>10</th>\n",
516
+ " <td>señora de casa</td>\n",
517
+ " <td>trabaja en la lecheria con las bacas y terneros</td>\n",
518
+ " <td>9701</td>\n",
519
+ " <td>9701</td>\n",
520
+ " <td>es</td>\n",
521
+ " </tr>\n",
522
+ " <tr>\n",
523
+ " <th>11</th>\n",
524
+ " <td>trabajadora agricolar</td>\n",
525
+ " <td>aplicar liquidos ala plantas</td>\n",
526
+ " <td>9211</td>\n",
527
+ " <td>9211</td>\n",
528
+ " <td>es</td>\n",
529
+ " </tr>\n",
530
+ " <tr>\n",
531
+ " <th>...</th>\n",
532
+ " <td>...</td>\n",
533
+ " <td>...</td>\n",
534
+ " <td>...</td>\n",
535
+ " <td>...</td>\n",
536
+ " <td>...</td>\n",
537
+ " </tr>\n",
538
+ " <tr>\n",
539
+ " <th>113962</th>\n",
540
+ " <td>Фотограф</td>\n",
541
+ " <td>Рассылал снимки в журналы, получал за это гоно...</td>\n",
542
+ " <td>3431</td>\n",
543
+ " <td>3431</td>\n",
544
+ " <td>ru</td>\n",
545
+ " </tr>\n",
546
+ " <tr>\n",
547
+ " <th>114114</th>\n",
548
+ " <td>Магазин</td>\n",
549
+ " <td>У него есть всой магазин где он работает.</td>\n",
550
+ " <td>5221</td>\n",
551
+ " <td>5221</td>\n",
552
+ " <td>ru</td>\n",
553
+ " </tr>\n",
554
+ " <tr>\n",
555
+ " <th>114295</th>\n",
556
+ " <td>цирк</td>\n",
557
+ " <td>держал перши</td>\n",
558
+ " <td>2659</td>\n",
559
+ " <td>2659</td>\n",
560
+ " <td>ru</td>\n",
561
+ " </tr>\n",
562
+ " <tr>\n",
563
+ " <th>114317</th>\n",
564
+ " <td>Человек-молкула</td>\n",
565
+ " <td>Супер-герой</td>\n",
566
+ " <td>9705</td>\n",
567
+ " <td>9705</td>\n",
568
+ " <td>ru</td>\n",
569
+ " </tr>\n",
570
+ " <tr>\n",
571
+ " <th>114371</th>\n",
572
+ " <td>Строительство заборов</td>\n",
573
+ " <td>Ставит заборы дачникам и не только</td>\n",
574
+ " <td>7111</td>\n",
575
+ " <td>7111</td>\n",
576
+ " <td>ru</td>\n",
577
+ " </tr>\n",
578
+ " </tbody>\n",
579
+ "</table>\n",
580
+ "<p>13055 rows × 5 columns</p>\n",
581
+ "</div>"
582
+ ],
583
+ "text/plain": [
584
+ " JOB \\\n",
585
+ "0 acopio \n",
586
+ "5 yo vivo con mi abuela y abuelo mi abuela o tr... \n",
587
+ "9 dueña de casa \n",
588
+ "10 señora de casa \n",
589
+ "11 trabajadora agricolar \n",
590
+ "... ... \n",
591
+ "113962 Фотограф \n",
592
+ "114114 Магазин \n",
593
+ "114295 цирк \n",
594
+ "114317 Человек-молкула \n",
595
+ "114371 Строительство заборов \n",
596
+ "\n",
597
+ " DUTIES ISCO ISCO_REL \\\n",
598
+ "0 recibe tarros con leche y despues hecha la lec... 9333 9333 \n",
599
+ "5 mi mama trabaja en limpiar las casas 9111 9111 \n",
600
+ "9 mantiene el orden de la casa 9701 9701 \n",
601
+ "10 trabaja en la lecheria con las bacas y terneros 9701 9701 \n",
602
+ "11 aplicar liquidos ala plantas 9211 9211 \n",
603
+ "... ... ... ... \n",
604
+ "113962 Рассылал снимки в журналы, получал за это гоно... 3431 3431 \n",
605
+ "114114 У него есть всой магазин где он работает. 5221 5221 \n",
606
+ "114295 держал перши 2659 2659 \n",
607
+ "114317 Супер-герой 9705 9705 \n",
608
+ "114371 Ставит заборы дачникам и не только 7111 7111 \n",
609
+ "\n",
610
+ " LANGUAGE \n",
611
+ "0 es \n",
612
+ "5 es \n",
613
+ "9 es \n",
614
+ "10 es \n",
615
+ "11 es \n",
616
+ "... ... \n",
617
+ "113962 ru \n",
618
+ "114114 ru \n",
619
+ "114295 ru \n",
620
+ "114317 ru \n",
621
+ "114371 ru \n",
622
+ "\n",
623
+ "[13055 rows x 5 columns]"
624
+ ]
625
+ },
626
+ "execution_count": 62,
627
+ "metadata": {},
628
+ "output_type": "execute_result"
629
+ }
630
+ ],
631
+ "source": [
632
+ "# create a dataframe with samples where ISCO and ISCO_REL are the same\n",
633
+ "isco_rel_df_same = isco_rel_df[isco_rel_df['ISCO'] == isco_rel_df['ISCO_REL']]\n",
634
+ "\n",
635
+ "isco_rel_df_same"
636
+ ]
637
+ },
638
+ {
639
+ "cell_type": "code",
640
+ "execution_count": 63,
641
+ "metadata": {},
642
+ "outputs": [
643
+ {
644
+ "data": {
645
+ "text/html": [
646
+ "<div>\n",
647
+ "<style scoped>\n",
648
+ " .dataframe tbody tr th:only-of-type {\n",
649
+ " vertical-align: middle;\n",
650
+ " }\n",
651
+ "\n",
652
+ " .dataframe tbody tr th {\n",
653
+ " vertical-align: top;\n",
654
+ " }\n",
655
+ "\n",
656
+ " .dataframe thead th {\n",
657
+ " text-align: right;\n",
658
+ " }\n",
659
+ "</style>\n",
660
+ "<table border=\"1\" class=\"dataframe\">\n",
661
+ " <thead>\n",
662
+ " <tr style=\"text-align: right;\">\n",
663
+ " <th></th>\n",
664
+ " <th>JOB</th>\n",
665
+ " <th>DUTIES</th>\n",
666
+ " <th>ISCO</th>\n",
667
+ " <th>ISCO_REL</th>\n",
668
+ " <th>LANGUAGE</th>\n",
669
+ " </tr>\n",
670
+ " </thead>\n",
671
+ " <tbody>\n",
672
+ " <tr>\n",
673
+ " <th>4</th>\n",
674
+ " <td>Asistente judirica</td>\n",
675
+ " <td>gestionar casos de fiscalia</td>\n",
676
+ " <td>3342</td>\n",
677
+ " <td>3411</td>\n",
678
+ " <td>es</td>\n",
679
+ " </tr>\n",
680
+ " <tr>\n",
681
+ " <th>8</th>\n",
682
+ " <td>lechera</td>\n",
683
+ " <td>saca leche</td>\n",
684
+ " <td>9212</td>\n",
685
+ " <td>9211</td>\n",
686
+ " <td>es</td>\n",
687
+ " </tr>\n",
688
+ " <tr>\n",
689
+ " <th>14</th>\n",
690
+ " <td>Mi madre es dueña de casa</td>\n",
691
+ " <td>Realiza todos los quehaceres del hogar, y trab...</td>\n",
692
+ " <td>9111</td>\n",
693
+ " <td>9701</td>\n",
694
+ " <td>es</td>\n",
695
+ " </tr>\n",
696
+ " <tr>\n",
697
+ " <th>34</th>\n",
698
+ " <td>algricultura</td>\n",
699
+ " <td>algricultura</td>\n",
700
+ " <td>9705</td>\n",
701
+ " <td>9211</td>\n",
702
+ " <td>es</td>\n",
703
+ " </tr>\n",
704
+ " <tr>\n",
705
+ " <th>38</th>\n",
706
+ " <td>en la agricultura</td>\n",
707
+ " <td>produce alimentos de vegetacion</td>\n",
708
+ " <td>633</td>\n",
709
+ " <td>9211</td>\n",
710
+ " <td>es</td>\n",
711
+ " </tr>\n",
712
+ " <tr>\n",
713
+ " <th>...</th>\n",
714
+ " <td>...</td>\n",
715
+ " <td>...</td>\n",
716
+ " <td>...</td>\n",
717
+ " <td>...</td>\n",
718
+ " <td>...</td>\n",
719
+ " </tr>\n",
720
+ " <tr>\n",
721
+ " <th>111656</th>\n",
722
+ " <td>gerente de ventas</td>\n",
723
+ " <td>ropa</td>\n",
724
+ " <td>5222</td>\n",
725
+ " <td>1221</td>\n",
726
+ " <td>es</td>\n",
727
+ " </tr>\n",
728
+ " <tr>\n",
729
+ " <th>111700</th>\n",
730
+ " <td>policia jubilado</td>\n",
731
+ " <td>capitan</td>\n",
732
+ " <td>5412</td>\n",
733
+ " <td>9703</td>\n",
734
+ " <td>es</td>\n",
735
+ " </tr>\n",
736
+ " <tr>\n",
737
+ " <th>111792</th>\n",
738
+ " <td>Vendiendo comida</td>\n",
739
+ " <td>Mi padrastro vende comida</td>\n",
740
+ " <td>5223</td>\n",
741
+ " <td>5212</td>\n",
742
+ " <td>es</td>\n",
743
+ " </tr>\n",
744
+ " <tr>\n",
745
+ " <th>112817</th>\n",
746
+ " <td>Собственник ювелирного магазина</td>\n",
747
+ " <td>Продавал ювелирные изделия</td>\n",
748
+ " <td>7313</td>\n",
749
+ " <td>5221</td>\n",
750
+ " <td>ru</td>\n",
751
+ " </tr>\n",
752
+ " <tr>\n",
753
+ " <th>113081</th>\n",
754
+ " <td>Предприниматель</td>\n",
755
+ " <td>Вещи продовал (продукты)</td>\n",
756
+ " <td>5221</td>\n",
757
+ " <td>112</td>\n",
758
+ " <td>ru</td>\n",
759
+ " </tr>\n",
760
+ " </tbody>\n",
761
+ "</table>\n",
762
+ "<p>1958 rows × 5 columns</p>\n",
763
+ "</div>"
764
+ ],
765
+ "text/plain": [
766
+ " JOB \\\n",
767
+ "4 Asistente judirica \n",
768
+ "8 lechera \n",
769
+ "14 Mi madre es dueña de casa \n",
770
+ "34 algricultura \n",
771
+ "38 en la agricultura \n",
772
+ "... ... \n",
773
+ "111656 gerente de ventas \n",
774
+ "111700 policia jubilado \n",
775
+ "111792 Vendiendo comida \n",
776
+ "112817 Собственник ювелирного магазина \n",
777
+ "113081 Предприниматель \n",
778
+ "\n",
779
+ " DUTIES ISCO ISCO_REL \\\n",
780
+ "4 gestionar casos de fiscalia 3342 3411 \n",
781
+ "8 saca leche 9212 9211 \n",
782
+ "14 Realiza todos los quehaceres del hogar, y trab... 9111 9701 \n",
783
+ "34 algricultura 9705 9211 \n",
784
+ "38 produce alimentos de vegetacion 633 9211 \n",
785
+ "... ... ... ... \n",
786
+ "111656 ropa 5222 1221 \n",
787
+ "111700 capitan 5412 9703 \n",
788
+ "111792 Mi padrastro vende comida 5223 5212 \n",
789
+ "112817 Продавал ювелирные изделия 7313 5221 \n",
790
+ "113081 Вещи продовал (продукты) 5221 112 \n",
791
+ "\n",
792
+ " LANGUAGE \n",
793
+ "4 es \n",
794
+ "8 es \n",
795
+ "14 es \n",
796
+ "34 es \n",
797
+ "38 es \n",
798
+ "... ... \n",
799
+ "111656 es \n",
800
+ "111700 es \n",
801
+ "111792 es \n",
802
+ "112817 ru \n",
803
+ "113081 ru \n",
804
+ "\n",
805
+ "[1958 rows x 5 columns]"
806
+ ]
807
+ },
808
+ "execution_count": 63,
809
+ "metadata": {},
810
+ "output_type": "execute_result"
811
+ }
812
+ ],
813
+ "source": [
814
+ "# create a dataframe with samples where ISCO and ISCO_REL are different\n",
815
+ "isco_rel_df_diff = isco_rel_df[isco_rel_df['ISCO'] != isco_rel_df['ISCO_REL']]\n",
816
+ "\n",
817
+ "isco_rel_df_diff"
818
+ ]
819
+ },
820
+ {
821
+ "cell_type": "code",
822
+ "execution_count": 64,
823
+ "metadata": {},
824
+ "outputs": [],
825
+ "source": [
826
+ "# Make a list of all values in ISCO and ISCO_REL columns\n",
827
+ "coder1 = list(isco_rel_df['ISCO'])\n",
828
+ "coder2 = list(isco_rel_df['ISCO_REL'])"
829
+ ]
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "execution_count": 66,
834
+ "metadata": {},
835
+ "outputs": [
836
+ {
837
+ "name": "stdout",
838
+ "output_type": "stream",
839
+ "text": [
840
+ "Accuracy: 0.8695796975954173, Hierarchical Precision: 0.9876106194690265, Hierarchical Recall: 0.9911190053285968, Hierarchical F-measure: 0.9893617021276595\n",
841
+ "Evaluation results saved to isco_rel_results.json\n"
842
+ ]
843
+ }
844
+ ],
845
+ "source": [
846
+ "# Compute the hierarchical accuracy\n",
847
+ "reliability_results = hierarchical_accuracy.compute(predictions=coder2, references=coder1)\n",
848
+ "\n",
849
+ "# Save the results to a JSON file\n",
850
+ "with open(\"isco_rel_results.json\", \"w\") as f:\n",
851
+ " json.dump(reliability_results, f)\n",
852
+ "\n",
853
+ "print(\"Evaluation results saved to isco_rel_results.json\")"
854
+ ]
855
+ },
856
+ {
857
+ "cell_type": "markdown",
858
+ "metadata": {},
859
+ "source": [
860
+ "## Giskard model testing"
861
+ ]
862
+ },
863
+ {
864
+ "cell_type": "code",
865
+ "execution_count": 1,
866
+ "metadata": {},
867
+ "outputs": [],
868
+ "source": [
869
+ "import numpy as np\n",
870
+ "import pandas as pd\n",
871
+ "from scipy.special import softmax\n",
872
+ "from datasets import load_dataset\n",
873
+ "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
874
+ "\n",
875
+ "from giskard import Dataset, Model, scan, testing, GiskardClient, Suite"
876
+ ]
877
+ },
878
+ {
879
+ "cell_type": "code",
880
+ "execution_count": 3,
881
+ "metadata": {},
882
+ "outputs": [
883
+ {
884
+ "data": {
885
+ "text/html": [
886
+ "<div>\n",
887
+ "<style scoped>\n",
888
+ " .dataframe tbody tr th:only-of-type {\n",
889
+ " vertical-align: middle;\n",
890
+ " }\n",
891
+ "\n",
892
+ " .dataframe tbody tr th {\n",
893
+ " vertical-align: top;\n",
894
+ " }\n",
895
+ "\n",
896
+ " .dataframe thead th {\n",
897
+ " text-align: right;\n",
898
+ " }\n",
899
+ "</style>\n",
900
+ "<table border=\"1\" class=\"dataframe\">\n",
901
+ " <thead>\n",
902
+ " <tr style=\"text-align: right;\">\n",
903
+ " <th></th>\n",
904
+ " <th>IDSTUD</th>\n",
905
+ " <th>JOB_DUTIES</th>\n",
906
+ " <th>ISCO</th>\n",
907
+ " <th>ISCO_REL</th>\n",
908
+ " <th>ISCO_TITLE</th>\n",
909
+ " <th>ISCO_CODE_TITLE</th>\n",
910
+ " <th>COUNTRY</th>\n",
911
+ " <th>LANGUAGE</th>\n",
912
+ " </tr>\n",
913
+ " </thead>\n",
914
+ " <tbody>\n",
915
+ " <tr>\n",
916
+ " <th>0</th>\n",
917
+ " <td>10670109</td>\n",
918
+ " <td>forældre 1: Han arbejder som med-chef sammen...</td>\n",
919
+ " <td>7412</td>\n",
920
+ " <td>None</td>\n",
921
+ " <td>Electrical Mechanics and Fitters</td>\n",
922
+ " <td>7412 Electrical Mechanics and Fitters</td>\n",
923
+ " <td>DNK</td>\n",
924
+ " <td>da</td>\n",
925
+ " </tr>\n",
926
+ " <tr>\n",
927
+ " <th>1</th>\n",
928
+ " <td>10130106</td>\n",
929
+ " <td>asistente de parbulo y basica. ayudaba en la e...</td>\n",
930
+ " <td>5312</td>\n",
931
+ " <td>5312</td>\n",
932
+ " <td>Teachers' Aides</td>\n",
933
+ " <td>5312 Teachers' Aides</td>\n",
934
+ " <td>CHL</td>\n",
935
+ " <td>es</td>\n",
936
+ " </tr>\n",
937
+ " <tr>\n",
938
+ " <th>2</th>\n",
939
+ " <td>10740120</td>\n",
940
+ " <td>trabajaba en el campo como capatas. aveces cui...</td>\n",
941
+ " <td>6121</td>\n",
942
+ " <td>None</td>\n",
943
+ " <td>Livestock and Dairy Producers</td>\n",
944
+ " <td>6121 Livestock and Dairy Producers</td>\n",
945
+ " <td>URY</td>\n",
946
+ " <td>es</td>\n",
947
+ " </tr>\n",
948
+ " <tr>\n",
949
+ " <th>3</th>\n",
950
+ " <td>10170109</td>\n",
951
+ " <td>gas abastible. vende gas abastible</td>\n",
952
+ " <td>9621</td>\n",
953
+ " <td>5243</td>\n",
954
+ " <td>Messengers, Package Deliverers and Luggage Por...</td>\n",
955
+ " <td>9621 Messengers, Package Deliverers and Luggag...</td>\n",
956
+ " <td>CHL</td>\n",
957
+ " <td>es</td>\n",
958
+ " </tr>\n",
959
+ " <tr>\n",
960
+ " <th>4</th>\n",
961
+ " <td>11480109</td>\n",
962
+ " <td>jordbruk. sår potatis tar upp potatis plogar h...</td>\n",
963
+ " <td>6111</td>\n",
964
+ " <td>6111</td>\n",
965
+ " <td>Field Crop and Vegetable Growers</td>\n",
966
+ " <td>6111 Field Crop and Vegetable Growers</td>\n",
967
+ " <td>FIN</td>\n",
968
+ " <td>sv</td>\n",
969
+ " </tr>\n",
970
+ " <tr>\n",
971
+ " <th>...</th>\n",
972
+ " <td>...</td>\n",
973
+ " <td>...</td>\n",
974
+ " <td>...</td>\n",
975
+ " <td>...</td>\n",
976
+ " <td>...</td>\n",
977
+ " <td>...</td>\n",
978
+ " <td>...</td>\n",
979
+ " <td>...</td>\n",
980
+ " </tr>\n",
981
+ " <tr>\n",
982
+ " <th>495</th>\n",
983
+ " <td>11780107</td>\n",
984
+ " <td>acountent mannager|she mannages calls for jobs...</td>\n",
985
+ " <td>1211</td>\n",
986
+ " <td>9998</td>\n",
987
+ " <td>Finance Managers</td>\n",
988
+ " <td>1211 Finance Managers</td>\n",
989
+ " <td>AUS</td>\n",
990
+ " <td>en</td>\n",
991
+ " </tr>\n",
992
+ " <tr>\n",
993
+ " <th>496</th>\n",
994
+ " <td>10850104</td>\n",
995
+ " <td>geometra/muratore. proggetta case e le restaura</td>\n",
996
+ " <td>3112</td>\n",
997
+ " <td>3112</td>\n",
998
+ " <td>Civil Engineering Technicians</td>\n",
999
+ " <td>3112 Civil Engineering Technicians</td>\n",
1000
+ " <td>ITA</td>\n",
1001
+ " <td>it</td>\n",
1002
+ " </tr>\n",
1003
+ " <tr>\n",
1004
+ " <th>497</th>\n",
1005
+ " <td>11460111</td>\n",
1006
+ " <td>fa parte della misericordia. Trasporta i malat...</td>\n",
1007
+ " <td>3258</td>\n",
1008
+ " <td>3258</td>\n",
1009
+ " <td>Ambulance Workers</td>\n",
1010
+ " <td>3258 Ambulance Workers</td>\n",
1011
+ " <td>ITA</td>\n",
1012
+ " <td>it</td>\n",
1013
+ " </tr>\n",
1014
+ " <tr>\n",
1015
+ " <th>498</th>\n",
1016
+ " <td>10340111</td>\n",
1017
+ " <td>사회복지사. 회사에서 복지원 관리</td>\n",
1018
+ " <td>2635</td>\n",
1019
+ " <td>2635</td>\n",
1020
+ " <td>Social Work and Counselling Professionals</td>\n",
1021
+ " <td>2635 Social Work and Counselling Professionals</td>\n",
1022
+ " <td>KOR</td>\n",
1023
+ " <td>ko</td>\n",
1024
+ " </tr>\n",
1025
+ " <tr>\n",
1026
+ " <th>499</th>\n",
1027
+ " <td>10370105</td>\n",
1028
+ " <td>자영업. 가게를 운영하신다.</td>\n",
1029
+ " <td>5221</td>\n",
1030
+ " <td>None</td>\n",
1031
+ " <td>Shopkeepers</td>\n",
1032
+ " <td>5221 Shopkeepers</td>\n",
1033
+ " <td>KOR</td>\n",
1034
+ " <td>ko</td>\n",
1035
+ " </tr>\n",
1036
+ " </tbody>\n",
1037
+ "</table>\n",
1038
+ "<p>500 rows × 8 columns</p>\n",
1039
+ "</div>"
1040
+ ],
1041
+ "text/plain": [
1042
+ " IDSTUD JOB_DUTIES ISCO \\\n",
1043
+ "0 10670109 forældre 1: Han arbejder som med-chef sammen... 7412 \n",
1044
+ "1 10130106 asistente de parbulo y basica. ayudaba en la e... 5312 \n",
1045
+ "2 10740120 trabajaba en el campo como capatas. aveces cui... 6121 \n",
1046
+ "3 10170109 gas abastible. vende gas abastible 9621 \n",
1047
+ "4 11480109 jordbruk. sår potatis tar upp potatis plogar h... 6111 \n",
1048
+ ".. ... ... ... \n",
1049
+ "495 11780107 acountent mannager|she mannages calls for jobs... 1211 \n",
1050
+ "496 10850104 geometra/muratore. proggetta case e le restaura 3112 \n",
1051
+ "497 11460111 fa parte della misericordia. Trasporta i malat... 3258 \n",
1052
+ "498 10340111 사회복지사. 회사에서 복지원 관리 2635 \n",
1053
+ "499 10370105 자영업. 가게를 운영하신다. 5221 \n",
1054
+ "\n",
1055
+ " ISCO_REL ISCO_TITLE \\\n",
1056
+ "0 None Electrical Mechanics and Fitters \n",
1057
+ "1 5312 Teachers' Aides \n",
1058
+ "2 None Livestock and Dairy Producers \n",
1059
+ "3 5243 Messengers, Package Deliverers and Luggage Por... \n",
1060
+ "4 6111 Field Crop and Vegetable Growers \n",
1061
+ ".. ... ... \n",
1062
+ "495 9998 Finance Managers \n",
1063
+ "496 3112 Civil Engineering Technicians \n",
1064
+ "497 3258 Ambulance Workers \n",
1065
+ "498 2635 Social Work and Counselling Professionals \n",
1066
+ "499 None Shopkeepers \n",
1067
+ "\n",
1068
+ " ISCO_CODE_TITLE COUNTRY LANGUAGE \n",
1069
+ "0 7412 Electrical Mechanics and Fitters DNK da \n",
1070
+ "1 5312 Teachers' Aides CHL es \n",
1071
+ "2 6121 Livestock and Dairy Producers URY es \n",
1072
+ "3 9621 Messengers, Package Deliverers and Luggag... CHL es \n",
1073
+ "4 6111 Field Crop and Vegetable Growers FIN sv \n",
1074
+ ".. ... ... ... \n",
1075
+ "495 1211 Finance Managers AUS en \n",
1076
+ "496 3112 Civil Engineering Technicians ITA it \n",
1077
+ "497 3258 Ambulance Workers ITA it \n",
1078
+ "498 2635 Social Work and Counselling Professionals KOR ko \n",
1079
+ "499 5221 Shopkeepers KOR ko \n",
1080
+ "\n",
1081
+ "[500 rows x 8 columns]"
1082
+ ]
1083
+ },
1084
+ "execution_count": 3,
1085
+ "metadata": {},
1086
+ "output_type": "execute_result"
1087
+ }
1088
+ ],
1089
+ "source": [
1090
+ "MODEL_NAME = \"ICILS/XLM-R-ISCO\"\n",
1091
+ "# DATASET_CONFIG = {\"path\": \"tweet_eval\", \"name\": \"sentiment\", \"split\": \"validation\"}\n",
1092
+ "TEXT_COLUMN = \"JOB_DUTIES\"\n",
1093
+ "TARGET_COLUMN = \"ISCO_CODE_TITLE\"\n",
1094
+ "\n",
1095
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
1096
+ "model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)\n",
1097
+ "\n",
1098
+ "label2id: dict = model.config.label2id\n",
1099
+ "id2label: dict = model.config.id2label\n",
1100
+ "# LABEL_MAPPING = id2label.items()\n",
1101
+ "\n",
1102
+ "# raw_data = load_dataset(**DATASET_CONFIG).to_pandas().iloc[:500]\n",
1103
+ "raw_data = load_dataset(\"ICILS/multilingual_parental_occupations\", split=\"test\").to_pandas().iloc[:500]\n",
1104
+ "# raw_data = raw_data.replace({\"ISCO_CODE_TITLE\": LABEL_MAPPING})\n",
1105
+ "raw_data[\"ISCO\"] = raw_data[\"ISCO\"].astype(str)\n",
1106
+ "raw_data[\"ISCO_REL\"] = raw_data[\"ISCO_REL\"].astype(str)\n",
1107
+ "\n",
1108
+ "raw_data"
1109
+ ]
1110
+ },
1111
+ {
1112
+ "cell_type": "code",
1113
+ "execution_count": 4,
1114
+ "metadata": {},
1115
+ "outputs": [
1116
+ {
1117
+ "name": "stdout",
1118
+ "output_type": "stream",
1119
+ "text": [
1120
+ "2024-03-15 01:07:06,923 pid:166193 MainThread giskard.datasets.base INFO Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.\n",
1121
+ "2024-03-15 01:07:06,925 pid:166193 MainThread giskard.models.automodel INFO Your 'prediction_function' is successfully wrapped by Giskard's 'PredictionFunctionModel' wrapper class.\n"
1122
+ ]
1123
+ },
1124
+ {
1125
+ "name": "stderr",
1126
+ "output_type": "stream",
1127
+ "text": [
1128
+ "/home/dux/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/giskard/datasets/base/__init__.py:466: UserWarning: The column ISCO is declared as numeric but has 'object' as data type. To avoid potential future issues, make sure to cast this column to the correct data type.\n",
1129
+ " warning(\n"
1130
+ ]
1131
+ }
1132
+ ],
1133
+ "source": [
1134
+ "giskard_dataset = Dataset(\n",
1135
+ " df=raw_data, # A pandas.DataFrame that contains the raw data (before all the pre-processing steps) and the actual ground truth variable (target).\n",
1136
+ " target=TARGET_COLUMN, # Ground truth variable.\n",
1137
+ " name=\"ISCO-08 Parental Occupation Corpus\", # Optional.\n",
1138
+ ")\n",
1139
+ "\n",
1140
+ "def prediction_function(df: pd.DataFrame) -> np.ndarray:\n",
1141
+ " encoded_input = tokenizer(list(df[TEXT_COLUMN]), padding=True, return_tensors=\"pt\")\n",
1142
+ " output = model(**encoded_input)\n",
1143
+ " return softmax(output[\"logits\"].detach().numpy(), axis=1)\n",
1144
+ "\n",
1145
+ "\n",
1146
+ "giskard_model = Model(\n",
1147
+ " model=prediction_function, # A prediction function that encapsulates all the data pre-processing steps\n",
1148
+ " model_type=\"classification\", # Either regression, classification or text_generation.\n",
1149
+ " name=\"XLM-R ISCO\", # Optional\n",
1150
+ " classification_labels=list(label2id.keys()), # Their order MUST be identical to the prediction_function's output order\n",
1151
+ " feature_names=[TEXT_COLUMN], # Default: all columns of your dataset\n",
1152
+ ")"
1153
+ ]
1154
+ },
1155
+ {
1156
+ "cell_type": "code",
1157
+ "execution_count": 5,
1158
+ "metadata": {},
1159
+ "outputs": [
1160
+ {
1161
+ "name": "stdout",
1162
+ "output_type": "stream",
1163
+ "text": [
1164
+ "2024-03-15 01:07:10,228 pid:166193 MainThread giskard.datasets.base INFO Casting dataframe columns from {'JOB_DUTIES': 'object'} to {'JOB_DUTIES': 'object'}\n"
1165
+ ]
1166
+ },
1167
+ {
1168
+ "name": "stdout",
1169
+ "output_type": "stream",
1170
+ "text": [
1171
+ "2024-03-15 01:07:12,838 pid:166193 MainThread giskard.utils.logging_utils INFO Predicted dataset with shape (10, 8) executed in 0:00:02.617399\n",
1172
+ "2024-03-15 01:07:12,848 pid:166193 MainThread giskard.datasets.base INFO Casting dataframe columns from {'JOB_DUTIES': 'object'} to {'JOB_DUTIES': 'object'}\n",
1173
+ "2024-03-15 01:07:13,007 pid:166193 MainThread giskard.utils.logging_utils INFO Predicted dataset with shape (1, 8) executed in 0:00:00.166843\n",
1174
+ "2024-03-15 01:07:13,015 pid:166193 MainThread giskard.datasets.base INFO Casting dataframe columns from {'JOB_DUTIES': 'object'} to {'JOB_DUTIES': 'object'}\n",
1175
+ "2024-03-15 01:07:13,017 pid:166193 MainThread giskard.utils.logging_utils INFO Predicted dataset with shape (10, 8) executed in 0:00:00.009517\n",
1176
+ "2024-03-15 01:07:13,029 pid:166193 MainThread giskard.datasets.base INFO Casting dataframe columns from {'JOB_DUTIES': 'object'} to {'JOB_DUTIES': 'object'}\n"
1177
+ ]
1178
+ },
1179
+ {
1180
+ "ename": "",
1181
+ "evalue": "",
1182
+ "output_type": "error",
1183
+ "traceback": [
1184
+ "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
1185
+ "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
1186
+ "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
1187
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1188
+ ]
1189
+ }
1190
+ ],
1191
+ "source": [
1192
+ "results = scan(giskard_model, giskard_dataset)"
1193
+ ]
1194
+ },
1195
+ {
1196
+ "cell_type": "code",
1197
+ "execution_count": null,
1198
+ "metadata": {},
1199
+ "outputs": [
1200
+ {
1201
+ "ename": "NameError",
1202
+ "evalue": "name 'results' is not defined",
1203
+ "output_type": "error",
1204
+ "traceback": [
1205
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1206
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
1207
+ "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m display(\u001b[43mresults\u001b[49m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Save it to a file\u001b[39;00m\n\u001b[1;32m 4\u001b[0m results\u001b[38;5;241m.\u001b[39mto_html(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscan_report.html\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
1208
+ "\u001b[0;31mNameError\u001b[0m: name 'results' is not defined"
1209
+ ]
1210
+ }
1211
+ ],
1212
+ "source": [
1213
+ "display(results)\n",
1214
+ "\n",
1215
+ "# Save it to a file\n",
1216
+ "results.to_html(\"scan_report.html\")"
1217
+ ]
1218
+ },
1219
+ {
1220
+ "cell_type": "code",
1221
+ "execution_count": 2,
1222
+ "metadata": {},
1223
+ "outputs": [
1224
+ {
1225
+ "ename": "GiskardError",
1226
+ "evalue": "No details or messages available.",
1227
+ "output_type": "error",
1228
+ "traceback": [
1229
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1230
+ "\u001b[0;31mGiskardError\u001b[0m Traceback (most recent call last)",
1231
+ "Cell \u001b[0;32mIn[2], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m project_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxlmr_isco\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# Create a giskard client to communicate with Giskard\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m client \u001b[38;5;241m=\u001b[39m \u001b[43mGiskardClient\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n",
1232
+ "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/giskard/client/giskard_client.py:153\u001b[0m, in \u001b[0;36mGiskardClient.__init__\u001b[0;34m(self, url, key, hf_token)\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m hf_token:\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_session\u001b[38;5;241m.\u001b[39mcookies[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mspaces-jwt\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m hf_token\n\u001b[0;32m--> 153\u001b[0m server_settings: ServerInfo \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_server_info\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 155\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m server_settings\u001b[38;5;241m.\u001b[39mserverVersion \u001b[38;5;241m!=\u001b[39m giskard\u001b[38;5;241m.\u001b[39m__version__:\n\u001b[1;32m 156\u001b[0m warning(\n\u001b[1;32m 157\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour giskard client version (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mgiskard\u001b[38;5;241m.\u001b[39m__version__\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) does not match the hub version \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mserver_settings\u001b[38;5;241m.\u001b[39mserverVersion\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m). \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease upgrade your client to the latest version. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 160\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpip install \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgiskard[hub]>=2.0.0b\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m -U\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 161\u001b[0m )\n",
1233
+ "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/giskard/client/giskard_client.py:417\u001b[0m, in \u001b[0;36mGiskardClient.get_server_info\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 416\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_server_info\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m ServerInfo:\n\u001b[0;32m--> 417\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/public-api/ml-worker-connect\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 418\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 419\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ServerInfo\u001b[38;5;241m.\u001b[39mparse_obj(resp\u001b[38;5;241m.\u001b[39mjson())\n",
1234
+ "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/requests/sessions.py:602\u001b[0m, in \u001b[0;36mSession.get\u001b[0;34m(self, url, **kwargs)\u001b[0m\n\u001b[1;32m 594\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Sends a GET request. Returns :class:`Response` object.\u001b[39;00m\n\u001b[1;32m 595\u001b[0m \n\u001b[1;32m 596\u001b[0m \u001b[38;5;124;03m:param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;124;03m:param \\*\\*kwargs: Optional arguments that ``request`` takes.\u001b[39;00m\n\u001b[1;32m 598\u001b[0m \u001b[38;5;124;03m:rtype: requests.Response\u001b[39;00m\n\u001b[1;32m 599\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 601\u001b[0m kwargs\u001b[38;5;241m.\u001b[39msetdefault(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 602\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mGET\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
1235
+ "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/requests_toolbelt/sessions.py:76\u001b[0m, in \u001b[0;36mBaseUrlSession.request\u001b[0;34m(self, method, url, *args, **kwargs)\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Send the request after generating the complete URL.\"\"\"\u001b[39;00m\n\u001b[1;32m 75\u001b[0m url \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcreate_url(url)\n\u001b[0;32m---> 76\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mBaseUrlSession\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 77\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 78\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
1236
+ "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 584\u001b[0m send_kwargs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 585\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m: timeout,\n\u001b[1;32m 586\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m: allow_redirects,\n\u001b[1;32m 587\u001b[0m }\n\u001b[1;32m 588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
1237
+ "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/requests/sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 700\u001b[0m start \u001b[38;5;241m=\u001b[39m preferred_clock()\n\u001b[1;32m 702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m 706\u001b[0m elapsed \u001b[38;5;241m=\u001b[39m preferred_clock() \u001b[38;5;241m-\u001b[39m start\n",
1238
+ "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/requests/adapters.py:538\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 536\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m--> 538\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbuild_response\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresp\u001b[49m\u001b[43m)\u001b[49m\n",
1239
+ "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/giskard/client/giskard_client.py:107\u001b[0m, in \u001b[0;36mErrorHandlingAdapter.build_response\u001b[0;34m(self, req, resp)\u001b[0m\n\u001b[1;32m 105\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msuper\u001b[39m(ErrorHandlingAdapter, \u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mbuild_response(req, resp)\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _get_status(resp) \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m400\u001b[39m:\n\u001b[0;32m--> 107\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m explain_error(resp)\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
1240
+ "\u001b[0;31mGiskardError\u001b[0m: No details or messages available."
1241
+ ]
1242
+ }
1243
+ ],
1244
+ "source": [
1245
+ "import giskard\n",
1246
+ "from datasets import load_dataset\n",
1247
+ "\n",
1248
+ "dataset = load_dataset(\"ICILS/multilingual_parental_occupations\", split=\"test\")\n",
1249
+ "\n",
1250
+ "# Replace this with your own data & model creation.\n",
1251
+ "# df = giskard.demo.titanic_df()\n",
1252
+ "df = dataset\n",
1253
+ "demo_data_preprocessing_function, demo_sklearn_model = giskard.demo.titanic_pipeline()\n",
1254
+ "\n",
1255
+ "# Wrap your Pandas DataFrame\n",
1256
+ "giskard_dataset = giskard.Dataset(df=df,\n",
1257
+ " target=\"ISCO_CODE_TITLE\",\n",
1258
+ " name=\"ISCO-08 Parental Occupation Corpus\",\n",
1259
+ " cat_columns=['LANGUAGE', 'COUNTRY'])\n",
1260
+ "\n",
1261
+ "# Wrap your model\n",
1262
+ "def prediction_function(df):\n",
1263
+ " preprocessed_df = demo_data_preprocessing_function(df)\n",
1264
+ " return demo_sklearn_model.predict_proba(preprocessed_df)\n",
1265
+ "\n",
1266
+ "giskard_model = giskard.Model(model=prediction_function,\n",
1267
+ " model_type=\"classification\",\n",
1268
+ " name=\"Titanic model\",\n",
1269
+ " classification_labels=demo_sklearn_model.classes_,\n",
1270
+ " feature_names=['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])\n",
1271
+ "\n",
1272
+ "# Then apply the scan\n",
1273
+ "results = giskard.scan(giskard_model, giskard_dataset)\n",
1274
+ "\n",
1275
+ "\n",
1276
+ "# Create a Giskard client\n",
1277
+ "client = giskard.GiskardClient(\n",
1278
+ " url=\"https://danieldux-giskard.hf.space\", # URL of your Giskard instance\n",
1279
+ " key=\"<Generate your API Key on the Giskard Hub settings page first>\")\n",
1280
+ "\n",
1281
  "\n",
1282
+ "# Upload an automatically created test suite to the current project ✉️\n",
1283
+ "results.generate_test_suite(\"Test suite created by scan\").upload(client, \"xlmr_isco\")\n"
1284
  ]
1285
  }
1286
  ],