amirali1985 commited on
Commit
de47b00
·
verified ·
1 Parent(s): e744a2a

Upload add_sub_baseline_100K_1L2H256d/metrics.json with huggingface_hub

Browse files
add_sub_baseline_100K_1L2H256d/metrics.json CHANGED
@@ -2582,502 +2582,567 @@
2582
  "K": null,
2583
  "mode": "sft",
2584
  "n_digits": 6,
2585
- "n_per_split": 100
2586
  },
2587
  "splits": {
2588
  "add_S0": {
2589
- "full_accuracy": 0.74,
2590
- "n_examples": 100,
 
2591
  "per_subtask": {
2592
  "SA": {
2593
- "accuracy": 0.9603305785123967,
2594
- "count": 605
2595
  },
2596
  "SS": {
2597
- "accuracy": 0.9473684210526315,
2598
- "count": 95
2599
  }
2600
  }
2601
  },
2602
  "add_S1": {
2603
- "full_accuracy": 0.59,
2604
- "n_examples": 100,
 
2605
  "per_subtask": {
2606
  "SA": {
2607
- "accuracy": 0.9509803921568627,
2608
- "count": 204
2609
  },
2610
  "SC": {
2611
- "accuracy": 0.9289940828402367,
2612
- "count": 169
2613
  },
2614
  "SS": {
2615
- "accuracy": 0.9354838709677419,
2616
- "count": 31
2617
  },
2618
  "UC": {
2619
- "accuracy": 0.8986486486486487,
2620
- "count": 296
2621
  }
2622
  }
2623
  },
2624
  "add_S2": {
2625
- "full_accuracy": 0.33,
2626
- "n_examples": 100,
 
2627
  "per_subtask": {
2628
  "SA": {
2629
- "accuracy": 0.950920245398773,
2630
- "count": 163
2631
  },
2632
  "SC": {
2633
- "accuracy": 0.8846153846153846,
2634
- "count": 130
2635
  },
2636
  "SS": {
2637
- "accuracy": 0.8620689655172413,
2638
- "count": 87
2639
  },
2640
  "UC": {
2641
- "accuracy": 0.7093596059113301,
2642
- "count": 203
2643
  },
2644
  "US": {
2645
- "accuracy": 0.9487179487179487,
2646
- "count": 117
2647
  }
2648
  }
2649
  },
2650
  "add_S3": {
2651
- "full_accuracy": 0.17,
2652
- "n_examples": 100,
 
2653
  "per_subtask": {
2654
  "SA": {
2655
- "accuracy": 0.9669421487603306,
2656
- "count": 121
2657
  },
2658
  "SC": {
2659
- "accuracy": 0.9173553719008265,
2660
- "count": 121
2661
  },
2662
  "SS": {
2663
- "accuracy": 0.9387755102040817,
2664
- "count": 49
2665
  },
2666
  "UC": {
2667
- "accuracy": 0.6021505376344086,
2668
- "count": 186
2669
  },
2670
  "US": {
2671
- "accuracy": 0.6502242152466368,
2672
- "count": 223
2673
  }
2674
  }
2675
  },
2676
  "add_S4": {
2677
- "full_accuracy": 0.13,
2678
- "n_examples": 100,
 
2679
  "per_subtask": {
2680
  "SA": {
2681
- "accuracy": 0.9807692307692307,
2682
- "count": 104
2683
  },
2684
  "SC": {
2685
- "accuracy": 0.8962264150943396,
2686
- "count": 106
2687
  },
2688
  "SS": {
2689
- "accuracy": 0.9565217391304348,
2690
- "count": 23
2691
  },
2692
  "UC": {
2693
- "accuracy": 0.6,
2694
- "count": 160
2695
  },
2696
  "US": {
2697
- "accuracy": 0.4364820846905538,
2698
- "count": 307
2699
  }
2700
  }
2701
  },
2702
  "add_S5": {
2703
- "full_accuracy": 0.16,
2704
- "n_examples": 100,
 
2705
  "per_subtask": {
2706
  "SA": {
2707
  "accuracy": 1.0,
2708
- "count": 100
2709
  },
2710
  "SC": {
2711
- "accuracy": 0.98,
2712
- "count": 100
2713
  },
2714
  "UC": {
2715
- "accuracy": 0.37,
2716
- "count": 100
2717
  },
2718
  "US": {
2719
- "accuracy": 0.2825,
2720
- "count": 400
2721
  }
2722
  }
2723
  },
2724
  "add_S6": {
2725
- "full_accuracy": 0.36,
2726
- "n_examples": 100,
 
2727
  "per_subtask": {
2728
  "SC": {
2729
  "accuracy": 1.0,
2730
- "count": 100
2731
  },
2732
  "UC": {
2733
- "accuracy": 0.42,
2734
- "count": 100
2735
  },
2736
  "US": {
2737
- "accuracy": 0.456,
2738
- "count": 500
2739
  }
2740
  }
2741
  },
2742
  "add_random": {
2743
- "full_accuracy": 0.67,
 
2744
  "n_examples": 200,
2745
  "per_subtask": {
2746
  "SA": {
2747
- "accuracy": 0.9642058165548099,
2748
- "count": 447
2749
  },
2750
  "SC": {
2751
- "accuracy": 0.95625,
2752
- "count": 320
2753
  },
2754
  "SS": {
2755
- "accuracy": 0.9464285714285714,
2756
- "count": 56
2757
  },
2758
  "UC": {
2759
- "accuracy": 0.9262759924385633,
2760
- "count": 529
2761
  },
2762
  "US": {
2763
- "accuracy": 0.9166666666666666,
2764
- "count": 48
2765
  }
2766
  }
2767
  },
2768
- "add_C3": {
2769
- "full_accuracy": 0.39,
2770
- "n_examples": 100,
 
2771
  "per_subtask": {
2772
  "SA": {
2773
- "accuracy": 0.9966666666666667,
2774
- "count": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2775
  },
2776
  "SC": {
2777
  "accuracy": 1.0,
2778
- "count": 100
2779
  },
2780
  "UC": {
2781
- "accuracy": 0.6632124352331606,
2782
- "count": 193
2783
  },
2784
  "US": {
2785
- "accuracy": 0.6728971962616822,
2786
- "count": 107
2787
  }
2788
  }
2789
  },
2790
- "add_C4": {
2791
- "full_accuracy": 0.32,
2792
- "n_examples": 100,
 
2793
  "per_subtask": {
2794
  "SA": {
2795
- "accuracy": 0.995,
2796
- "count": 200
2797
  },
2798
  "SC": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2799
  "accuracy": 0.99,
2800
  "count": 100
2801
  },
 
 
 
 
2802
  "UC": {
2803
- "accuracy": 0.72265625,
2804
- "count": 256
2805
  },
2806
  "US": {
2807
- "accuracy": 0.7013888888888888,
2808
- "count": 144
2809
  }
2810
  }
2811
  },
2812
  "add_C5": {
2813
- "full_accuracy": 0.31,
2814
- "n_examples": 100,
 
2815
  "per_subtask": {
2816
  "SA": {
2817
  "accuracy": 1.0,
2818
- "count": 100
2819
  },
2820
  "SC": {
2821
  "accuracy": 1.0,
2822
- "count": 100
2823
  },
2824
  "UC": {
2825
- "accuracy": 0.7549019607843137,
2826
- "count": 306
2827
  },
2828
  "US": {
2829
- "accuracy": 0.7422680412371134,
2830
- "count": 194
2831
  }
2832
  }
2833
  },
2834
  "add_C6": {
2835
- "full_accuracy": 0.31,
2836
- "n_examples": 100,
 
2837
  "per_subtask": {
2838
  "SC": {
2839
  "accuracy": 1.0,
2840
- "count": 100
2841
  },
2842
  "UC": {
2843
- "accuracy": 0.7513661202185792,
2844
- "count": 366
2845
  },
2846
  "US": {
2847
- "accuracy": 0.7863247863247863,
2848
- "count": 234
2849
  }
2850
  }
2851
  },
2852
  "sub_M0": {
2853
- "full_accuracy": 0.68,
2854
- "n_examples": 100,
 
2855
  "per_subtask": {
2856
  "MD": {
2857
- "accuracy": 0.9467554076539102,
2858
- "count": 601
2859
  },
2860
  "ME": {
2861
- "accuracy": 0.9494949494949495,
2862
- "count": 99
2863
  }
2864
  }
2865
  },
2866
  "sub_M1": {
2867
- "full_accuracy": 0.49,
2868
- "n_examples": 100,
 
2869
  "per_subtask": {
2870
  "MD": {
2871
- "accuracy": 0.967741935483871,
2872
- "count": 279
2873
  },
2874
  "MB": {
2875
- "accuracy": 0.9655172413793104,
2876
- "count": 145
2877
  },
2878
  "ME": {
2879
  "accuracy": 1.0,
2880
- "count": 24
2881
  },
2882
  "UB": {
2883
- "accuracy": 0.8253968253968254,
2884
- "count": 252
2885
  }
2886
  }
2887
  },
2888
  "sub_M2": {
2889
- "full_accuracy": 0.22,
2890
- "n_examples": 100,
 
2891
  "per_subtask": {
2892
  "MD": {
2893
- "accuracy": 0.971830985915493,
2894
- "count": 213
2895
  },
2896
  "MB": {
2897
- "accuracy": 0.9734513274336283,
2898
- "count": 113
2899
  },
2900
  "ME": {
2901
- "accuracy": 1.0,
2902
- "count": 85
2903
  },
2904
  "UB": {
2905
- "accuracy": 0.5414364640883977,
2906
- "count": 181
2907
  },
2908
  "UD": {
2909
- "accuracy": 0.8425925925925926,
2910
- "count": 108
2911
  }
2912
  }
2913
  },
2914
  "sub_M3": {
2915
- "full_accuracy": 0.05,
2916
- "n_examples": 100,
 
2917
  "per_subtask": {
2918
  "MD": {
2919
- "accuracy": 0.994413407821229,
2920
- "count": 179
2921
  },
2922
  "MB": {
2923
- "accuracy": 0.912621359223301,
2924
- "count": 103
2925
  },
2926
  "ME": {
2927
- "accuracy": 1.0,
2928
- "count": 56
2929
  },
2930
  "UB": {
2931
- "accuracy": 0.4697986577181208,
2932
- "count": 149
2933
  },
2934
  "UD": {
2935
- "accuracy": 0.5117370892018779,
2936
- "count": 213
2937
  }
2938
  }
2939
  },
2940
  "sub_M4": {
2941
- "full_accuracy": 0.07,
2942
- "n_examples": 100,
 
2943
  "per_subtask": {
2944
  "MD": {
2945
  "accuracy": 0.98,
2946
- "count": 200
2947
  },
2948
  "MB": {
2949
- "accuracy": 0.95,
2950
- "count": 100
2951
  },
2952
  "UB": {
2953
- "accuracy": 0.53,
2954
- "count": 100
2955
  },
2956
  "UD": {
2957
  "accuracy": 0.29333333333333333,
2958
- "count": 300
2959
  }
2960
  }
2961
  },
2962
  "sub_M5": {
2963
- "full_accuracy": 0.07,
2964
- "n_examples": 100,
 
2965
  "per_subtask": {
2966
  "MD": {
2967
  "accuracy": 1.0,
2968
- "count": 100
2969
  },
2970
  "MB": {
2971
  "accuracy": 1.0,
2972
- "count": 100
2973
  },
2974
  "UB": {
2975
- "accuracy": 0.46,
2976
- "count": 100
2977
  },
2978
  "UD": {
2979
- "accuracy": 0.25,
2980
- "count": 400
2981
  }
2982
  }
2983
  },
2984
  "sub_random": {
2985
- "full_accuracy": 0.515,
 
2986
  "n_examples": 200,
2987
  "per_subtask": {
2988
  "MD": {
2989
- "accuracy": 0.955,
2990
- "count": 600
2991
  },
2992
  "MB": {
2993
- "accuracy": 0.951310861423221,
2994
- "count": 267
2995
  },
2996
  "ME": {
2997
- "accuracy": 0.9811320754716981,
2998
  "count": 53
2999
  },
3000
  "UB": {
3001
- "accuracy": 0.8291571753986332,
3002
- "count": 439
3003
  },
3004
  "UD": {
3005
- "accuracy": 0.7804878048780488,
3006
- "count": 41
3007
  }
3008
  }
3009
  },
3010
  "sub_B3": {
3011
- "full_accuracy": 0.19,
3012
- "n_examples": 100,
 
3013
  "per_subtask": {
3014
  "MD": {
3015
- "accuracy": 0.9766666666666667,
3016
- "count": 300
3017
  },
3018
  "MB": {
3019
  "accuracy": 0.96,
3020
- "count": 100
3021
  },
3022
  "UB": {
3023
- "accuracy": 0.6345177664974619,
3024
- "count": 197
3025
  },
3026
  "UD": {
3027
- "accuracy": 0.5728155339805825,
3028
- "count": 103
3029
  }
3030
  }
3031
  },
3032
  "sub_B4": {
3033
- "full_accuracy": 0.01,
3034
- "n_examples": 100,
 
3035
  "per_subtask": {
3036
  "MD": {
3037
- "accuracy": 0.985,
3038
- "count": 200
3039
  },
3040
  "MB": {
3041
- "accuracy": 0.96,
3042
- "count": 100
3043
  },
3044
  "UB": {
3045
- "accuracy": 0.5182186234817814,
3046
- "count": 247
3047
  },
3048
  "UD": {
3049
- "accuracy": 0.5228758169934641,
3050
- "count": 153
3051
  }
3052
  }
3053
  },
3054
  "sub_B5": {
3055
- "full_accuracy": 0.05,
3056
- "n_examples": 100,
 
3057
  "per_subtask": {
3058
  "MD": {
3059
  "accuracy": 1.0,
3060
- "count": 100
3061
  },
3062
  "MB": {
3063
  "accuracy": 1.0,
3064
- "count": 100
3065
  },
3066
  "UB": {
3067
- "accuracy": 0.5973154362416108,
3068
- "count": 298
3069
  },
3070
  "UD": {
3071
- "accuracy": 0.5445544554455446,
3072
- "count": 202
3073
  }
3074
  }
3075
  }
3076
  },
3077
  "summary": {
3078
- "overall_accuracy": 0.33375,
3079
- "total_examples": 2400,
3080
- "n_splits": 22
 
3081
  }
3082
  }
3083
  }
 
2582
  "K": null,
2583
  "mode": "sft",
2584
  "n_digits": 6,
2585
+ "n_per_split": 50
2586
  },
2587
  "splits": {
2588
  "add_S0": {
2589
+ "full_accuracy": 0.68,
2590
+ "digit_accuracy": 0.9485714285714286,
2591
+ "n_examples": 50,
2592
  "per_subtask": {
2593
  "SA": {
2594
+ "accuracy": 0.9457627118644067,
2595
+ "count": 295
2596
  },
2597
  "SS": {
2598
+ "accuracy": 0.9636363636363636,
2599
+ "count": 55
2600
  }
2601
  }
2602
  },
2603
  "add_S1": {
2604
+ "full_accuracy": 0.76,
2605
+ "digit_accuracy": 0.9542857142857143,
2606
+ "n_examples": 50,
2607
  "per_subtask": {
2608
  "SA": {
2609
+ "accuracy": 0.9761904761904762,
2610
+ "count": 126
2611
  },
2612
  "SC": {
2613
+ "accuracy": 0.9240506329113924,
2614
+ "count": 79
2615
  },
2616
  "SS": {
2617
+ "accuracy": 1.0,
2618
+ "count": 21
2619
  },
2620
  "UC": {
2621
+ "accuracy": 0.9435483870967742,
2622
+ "count": 124
2623
  }
2624
  }
2625
  },
2626
  "add_S2": {
2627
+ "full_accuracy": 0.32,
2628
+ "digit_accuracy": 0.8542857142857143,
2629
+ "n_examples": 50,
2630
  "per_subtask": {
2631
  "SA": {
2632
+ "accuracy": 0.9466666666666667,
2633
+ "count": 75
2634
  },
2635
  "SC": {
2636
+ "accuracy": 0.8548387096774194,
2637
+ "count": 62
2638
  },
2639
  "SS": {
2640
+ "accuracy": 0.8205128205128205,
2641
+ "count": 39
2642
  },
2643
  "UC": {
2644
+ "accuracy": 0.7747747747747747,
2645
+ "count": 111
2646
  },
2647
  "US": {
2648
+ "accuracy": 0.9047619047619048,
2649
+ "count": 63
2650
  }
2651
  }
2652
  },
2653
  "add_S3": {
2654
+ "full_accuracy": 0.18,
2655
+ "digit_accuracy": 0.7857142857142857,
2656
+ "n_examples": 50,
2657
  "per_subtask": {
2658
  "SA": {
2659
+ "accuracy": 0.9333333333333333,
2660
+ "count": 60
2661
  },
2662
  "SC": {
2663
+ "accuracy": 0.9298245614035088,
2664
+ "count": 57
2665
  },
2666
  "SS": {
2667
+ "accuracy": 0.8947368421052632,
2668
+ "count": 19
2669
  },
2670
  "UC": {
2671
+ "accuracy": 0.6923076923076923,
2672
+ "count": 104
2673
  },
2674
  "US": {
2675
+ "accuracy": 0.7,
2676
+ "count": 110
2677
  }
2678
  }
2679
  },
2680
  "add_S4": {
2681
+ "full_accuracy": 0.14,
2682
+ "digit_accuracy": 0.6371428571428571,
2683
+ "n_examples": 50,
2684
  "per_subtask": {
2685
  "SA": {
2686
+ "accuracy": 1.0,
2687
+ "count": 48
2688
  },
2689
  "SC": {
2690
+ "accuracy": 0.9423076923076923,
2691
+ "count": 52
2692
  },
2693
  "SS": {
2694
+ "accuracy": 0.8571428571428571,
2695
+ "count": 7
2696
  },
2697
  "UC": {
2698
+ "accuracy": 0.5168539325842697,
2699
+ "count": 89
2700
  },
2701
  "US": {
2702
+ "accuracy": 0.4805194805194805,
2703
+ "count": 154
2704
  }
2705
  }
2706
  },
2707
  "add_S5": {
2708
+ "full_accuracy": 0.08,
2709
+ "digit_accuracy": 0.4257142857142857,
2710
+ "n_examples": 50,
2711
  "per_subtask": {
2712
  "SA": {
2713
  "accuracy": 1.0,
2714
+ "count": 50
2715
  },
2716
  "SC": {
2717
+ "accuracy": 1.0,
2718
+ "count": 50
2719
  },
2720
  "UC": {
2721
+ "accuracy": 0.24,
2722
+ "count": 50
2723
  },
2724
  "US": {
2725
+ "accuracy": 0.185,
2726
+ "count": 200
2727
  }
2728
  }
2729
  },
2730
  "add_S6": {
2731
+ "full_accuracy": 0.4,
2732
+ "digit_accuracy": 0.52,
2733
+ "n_examples": 50,
2734
  "per_subtask": {
2735
  "SC": {
2736
  "accuracy": 1.0,
2737
+ "count": 50
2738
  },
2739
  "UC": {
2740
+ "accuracy": 0.4,
2741
+ "count": 50
2742
  },
2743
  "US": {
2744
+ "accuracy": 0.448,
2745
+ "count": 250
2746
  }
2747
  }
2748
  },
2749
  "add_random": {
2750
+ "full_accuracy": 0.62,
2751
+ "digit_accuracy": 0.9321428571428572,
2752
  "n_examples": 200,
2753
  "per_subtask": {
2754
  "SA": {
2755
+ "accuracy": 0.9767981438515081,
2756
+ "count": 431
2757
  },
2758
  "SC": {
2759
+ "accuracy": 0.9588607594936709,
2760
+ "count": 316
2761
  },
2762
  "SS": {
2763
+ "accuracy": 1.0,
2764
+ "count": 39
2765
  },
2766
  "UC": {
2767
+ "accuracy": 0.8892857142857142,
2768
+ "count": 560
2769
  },
2770
  "US": {
2771
+ "accuracy": 0.8148148148148148,
2772
+ "count": 54
2773
  }
2774
  }
2775
  },
2776
+ "add_C1": {
2777
+ "full_accuracy": 0.86,
2778
+ "digit_accuracy": 0.98,
2779
+ "n_examples": 50,
2780
  "per_subtask": {
2781
  "SA": {
2782
+ "accuracy": 0.992,
2783
+ "count": 250
2784
+ },
2785
+ "SC": {
2786
+ "accuracy": 0.98,
2787
+ "count": 50
2788
+ },
2789
+ "UC": {
2790
+ "accuracy": 0.92,
2791
+ "count": 50
2792
+ }
2793
+ }
2794
+ },
2795
+ "add_C2": {
2796
+ "full_accuracy": 0.66,
2797
+ "digit_accuracy": 0.9485714285714286,
2798
+ "n_examples": 50,
2799
+ "per_subtask": {
2800
+ "SA": {
2801
+ "accuracy": 0.985,
2802
+ "count": 200
2803
  },
2804
  "SC": {
2805
  "accuracy": 1.0,
2806
+ "count": 50
2807
  },
2808
  "UC": {
2809
+ "accuracy": 0.8433734939759037,
2810
+ "count": 83
2811
  },
2812
  "US": {
2813
+ "accuracy": 0.8823529411764706,
2814
+ "count": 17
2815
  }
2816
  }
2817
  },
2818
+ "add_C3": {
2819
+ "full_accuracy": 0.36,
2820
+ "digit_accuracy": 0.8628571428571429,
2821
+ "n_examples": 50,
2822
  "per_subtask": {
2823
  "SA": {
2824
+ "accuracy": 0.9933333333333333,
2825
+ "count": 150
2826
  },
2827
  "SC": {
2828
+ "accuracy": 1.0,
2829
+ "count": 50
2830
+ },
2831
+ "UC": {
2832
+ "accuracy": 0.68,
2833
+ "count": 100
2834
+ },
2835
+ "US": {
2836
+ "accuracy": 0.7,
2837
+ "count": 50
2838
+ }
2839
+ }
2840
+ },
2841
+ "add_C4": {
2842
+ "full_accuracy": 0.3,
2843
+ "digit_accuracy": 0.8542857142857143,
2844
+ "n_examples": 50,
2845
+ "per_subtask": {
2846
+ "SA": {
2847
  "accuracy": 0.99,
2848
  "count": 100
2849
  },
2850
+ "SC": {
2851
+ "accuracy": 1.0,
2852
+ "count": 50
2853
+ },
2854
  "UC": {
2855
+ "accuracy": 0.7121212121212122,
2856
+ "count": 132
2857
  },
2858
  "US": {
2859
+ "accuracy": 0.8235294117647058,
2860
+ "count": 68
2861
  }
2862
  }
2863
  },
2864
  "add_C5": {
2865
+ "full_accuracy": 0.26,
2866
+ "digit_accuracy": 0.7828571428571428,
2867
+ "n_examples": 50,
2868
  "per_subtask": {
2869
  "SA": {
2870
  "accuracy": 1.0,
2871
+ "count": 50
2872
  },
2873
  "SC": {
2874
  "accuracy": 1.0,
2875
+ "count": 50
2876
  },
2877
  "UC": {
2878
+ "accuracy": 0.7465753424657534,
2879
+ "count": 146
2880
  },
2881
  "US": {
2882
+ "accuracy": 0.625,
2883
+ "count": 104
2884
  }
2885
  }
2886
  },
2887
  "add_C6": {
2888
+ "full_accuracy": 0.3,
2889
+ "digit_accuracy": 0.7971428571428572,
2890
+ "n_examples": 50,
2891
  "per_subtask": {
2892
  "SC": {
2893
  "accuracy": 1.0,
2894
+ "count": 50
2895
  },
2896
  "UC": {
2897
+ "accuracy": 0.7724867724867724,
2898
+ "count": 189
2899
  },
2900
  "US": {
2901
+ "accuracy": 0.7477477477477478,
2902
+ "count": 111
2903
  }
2904
  }
2905
  },
2906
  "sub_M0": {
2907
+ "full_accuracy": 0.8,
2908
+ "digit_accuracy": 0.9714285714285714,
2909
+ "n_examples": 50,
2910
  "per_subtask": {
2911
  "MD": {
2912
+ "accuracy": 0.966996699669967,
2913
+ "count": 303
2914
  },
2915
  "ME": {
2916
+ "accuracy": 1.0,
2917
+ "count": 47
2918
  }
2919
  }
2920
  },
2921
  "sub_M1": {
2922
+ "full_accuracy": 0.58,
2923
+ "digit_accuracy": 0.9314285714285714,
2924
+ "n_examples": 50,
2925
  "per_subtask": {
2926
  "MD": {
2927
+ "accuracy": 0.9858156028368794,
2928
+ "count": 141
2929
  },
2930
  "MB": {
2931
+ "accuracy": 0.9305555555555556,
2932
+ "count": 72
2933
  },
2934
  "ME": {
2935
  "accuracy": 1.0,
2936
+ "count": 18
2937
  },
2938
  "UB": {
2939
+ "accuracy": 0.8571428571428571,
2940
+ "count": 119
2941
  }
2942
  }
2943
  },
2944
  "sub_M2": {
2945
+ "full_accuracy": 0.14,
2946
+ "digit_accuracy": 0.8228571428571428,
2947
+ "n_examples": 50,
2948
  "per_subtask": {
2949
  "MD": {
2950
+ "accuracy": 0.9553571428571429,
2951
+ "count": 112
2952
  },
2953
  "MB": {
2954
+ "accuracy": 0.8679245283018868,
2955
+ "count": 53
2956
  },
2957
  "ME": {
2958
+ "accuracy": 0.9787234042553191,
2959
+ "count": 47
2960
  },
2961
  "UB": {
2962
+ "accuracy": 0.5529411764705883,
2963
+ "count": 85
2964
  },
2965
  "UD": {
2966
+ "accuracy": 0.7924528301886793,
2967
+ "count": 53
2968
  }
2969
  }
2970
  },
2971
  "sub_M3": {
2972
+ "full_accuracy": 0.06,
2973
+ "digit_accuracy": 0.7228571428571429,
2974
+ "n_examples": 50,
2975
  "per_subtask": {
2976
  "MD": {
2977
+ "accuracy": 0.9690721649484536,
2978
+ "count": 97
2979
  },
2980
  "MB": {
2981
+ "accuracy": 0.9803921568627451,
2982
+ "count": 51
2983
  },
2984
  "ME": {
2985
+ "accuracy": 0.9629629629629629,
2986
+ "count": 27
2987
  },
2988
  "UB": {
2989
+ "accuracy": 0.5405405405405406,
2990
+ "count": 74
2991
  },
2992
  "UD": {
2993
+ "accuracy": 0.42574257425742573,
2994
+ "count": 101
2995
  }
2996
  }
2997
  },
2998
  "sub_M4": {
2999
+ "full_accuracy": 0.06,
3000
+ "digit_accuracy": 0.6228571428571429,
3001
+ "n_examples": 50,
3002
  "per_subtask": {
3003
  "MD": {
3004
  "accuracy": 0.98,
3005
+ "count": 100
3006
  },
3007
  "MB": {
3008
+ "accuracy": 0.98,
3009
+ "count": 50
3010
  },
3011
  "UB": {
3012
+ "accuracy": 0.54,
3013
+ "count": 50
3014
  },
3015
  "UD": {
3016
  "accuracy": 0.29333333333333333,
3017
+ "count": 150
3018
  }
3019
  }
3020
  },
3021
  "sub_M5": {
3022
+ "full_accuracy": 0.06,
3023
+ "digit_accuracy": 0.5114285714285715,
3024
+ "n_examples": 50,
3025
  "per_subtask": {
3026
  "MD": {
3027
  "accuracy": 1.0,
3028
+ "count": 50
3029
  },
3030
  "MB": {
3031
  "accuracy": 1.0,
3032
+ "count": 50
3033
  },
3034
  "UB": {
3035
+ "accuracy": 0.38,
3036
+ "count": 50
3037
  },
3038
  "UD": {
3039
+ "accuracy": 0.3,
3040
+ "count": 200
3041
  }
3042
  }
3043
  },
3044
  "sub_random": {
3045
+ "full_accuracy": 0.525,
3046
+ "digit_accuracy": 0.9178571428571428,
3047
  "n_examples": 200,
3048
  "per_subtask": {
3049
  "MD": {
3050
+ "accuracy": 0.9736842105263158,
3051
+ "count": 570
3052
  },
3053
  "MB": {
3054
+ "accuracy": 0.9675090252707581,
3055
+ "count": 277
3056
  },
3057
  "ME": {
3058
+ "accuracy": 1.0,
3059
  "count": 53
3060
  },
3061
  "UB": {
3062
+ "accuracy": 0.8131634819532909,
3063
+ "count": 471
3064
  },
3065
  "UD": {
3066
+ "accuracy": 0.896551724137931,
3067
+ "count": 29
3068
  }
3069
  }
3070
  },
3071
  "sub_B3": {
3072
+ "full_accuracy": 0.26,
3073
+ "digit_accuracy": 0.8285714285714286,
3074
+ "n_examples": 50,
3075
  "per_subtask": {
3076
  "MD": {
3077
+ "accuracy": 0.9866666666666667,
3078
+ "count": 150
3079
  },
3080
  "MB": {
3081
  "accuracy": 0.96,
3082
+ "count": 50
3083
  },
3084
  "UB": {
3085
+ "accuracy": 0.6237623762376238,
3086
+ "count": 101
3087
  },
3088
  "UD": {
3089
+ "accuracy": 0.6326530612244898,
3090
+ "count": 49
3091
  }
3092
  }
3093
  },
3094
  "sub_B4": {
3095
+ "full_accuracy": 0.1,
3096
+ "digit_accuracy": 0.7371428571428571,
3097
+ "n_examples": 50,
3098
  "per_subtask": {
3099
  "MD": {
3100
+ "accuracy": 0.98,
3101
+ "count": 100
3102
  },
3103
  "MB": {
3104
+ "accuracy": 0.94,
3105
+ "count": 50
3106
  },
3107
  "UB": {
3108
+ "accuracy": 0.5867768595041323,
3109
+ "count": 121
3110
  },
3111
  "UD": {
3112
+ "accuracy": 0.5316455696202531,
3113
+ "count": 79
3114
  }
3115
  }
3116
  },
3117
  "sub_B5": {
3118
+ "full_accuracy": 0.16,
3119
+ "digit_accuracy": 0.7028571428571428,
3120
+ "n_examples": 50,
3121
  "per_subtask": {
3122
  "MD": {
3123
  "accuracy": 1.0,
3124
+ "count": 50
3125
  },
3126
  "MB": {
3127
  "accuracy": 1.0,
3128
+ "count": 50
3129
  },
3130
  "UB": {
3131
+ "accuracy": 0.5986842105263158,
3132
+ "count": 152
3133
  },
3134
  "UD": {
3135
+ "accuracy": 0.5612244897959183,
3136
+ "count": 98
3137
  }
3138
  }
3139
  }
3140
  },
3141
  "summary": {
3142
+ "overall_accuracy": 0.4026666666666667,
3143
+ "digit_accuracy": 0.82,
3144
+ "total_examples": 1500,
3145
+ "n_splits": 24
3146
  }
3147
  }
3148
  }