File size: 2,858 Bytes
39950c9
 
 
a01d3ba
 
39950c9
 
9e1ff19
 
 
39950c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a01d3ba
 
 
 
 
 
 
39950c9
a01d3ba
 
39950c9
a01d3ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39950c9
aef1dbe
a01d3ba
 
 
a7bba68
a01d3ba
aef1dbe
a01d3ba
aef1dbe
a01d3ba
a7bba68
 
aef1dbe
 
a01d3ba
aef1dbe
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import functools
import operator

import pandas as pd


def correlations_for_group(group):
    """Correlate every ``*_related`` column with the ``*_independent`` and ``*_aggr`` columns.

    The metric name of a column is the column name with its suffix
    (``_related``, ``_independent`` or ``_aggr``) removed, so metric names
    that themselves contain underscores (e.g. ``bleu_score_related``) are
    handled instead of being truncated at the first underscore.

    Args:
        group: DataFrame whose (string) column names follow the
            ``<metric>_related`` / ``<metric>_independent`` / ``<metric>_aggr``
            naming scheme. Other columns are ignored.

    Returns:
        A ``pd.Series`` indexed by ``rel_<A>_ind_<B>_<method>`` and
        ``rel_<A>_aggr_<B>_<method>`` keys, where ``<method>`` is ``pearson``
        or ``spearman``. For each related metric the ``ind`` keys come first,
        then the ``aggr`` keys. The Series is empty when there is nothing to
        correlate.

    Note:
        Keys are underscore-delimited, so any consumer that parses them must
        account for metric names that contain underscores.
    """

    def metric_names(suffix):
        # Strip only the trailing suffix so the full metric name is preserved.
        return [col[: -len(suffix)] for col in group.columns if col.endswith(suffix)]

    rel_metrics = metric_names("_related")
    # (tag used in the output key, column suffix, metric names) per right-hand side.
    right_sides = (
        ("ind", "_independent", metric_names("_independent")),
        ("aggr", "_aggr", metric_names("_aggr")),
    )

    correlations = {}
    for rel_metric in rel_metrics:
        rel_values = group[f"{rel_metric}_related"]
        for tag, suffix, metrics in right_sides:
            for metric in metrics:
                other_values = group[f"{metric}{suffix}"]
                key = f"rel_{rel_metric}_{tag}_{metric}"
                correlations[f"{key}_pearson"] = rel_values.corr(other_values, method="pearson")
                correlations[f"{key}_spearman"] = rel_values.corr(other_values, method="spearman")
    return pd.Series(correlations)


def split_metrics_string(s):
    """Split a ``rel_<A>_ind_<B>`` or ``rel_<A>_aggr_<B>`` label into ``(A, B)``.

    The label is split on underscores and the first ``ind`` / ``aggr`` token
    found after the first metric token is treated as the separator, so metric
    names containing underscores are returned whole
    (``"rel_bleu_score_ind_human"`` -> ``("bleu_score", "human")``).

    Everything after the separator is taken as the second metric name, so the
    label should not carry a trailing ``_pearson`` / ``_spearman`` suffix.

    Args:
        s: Underscore-delimited label.

    Returns:
        A ``(left_metric, right_metric)`` tuple of strings.

    Raises:
        IndexError: If no separator is found and the label has fewer than
            four underscore-delimited tokens.
    """
    tokens = s.split("_")
    # Start at 2 so the left metric has at least one token; stop before the
    # last token so the right metric is never empty.
    for pos in range(2, len(tokens) - 1):
        if tokens[pos] in ("ind", "aggr"):
            return "_".join(tokens[1:pos]), "_".join(tokens[pos + 1:])
    # No recognizable separator: keep the original positional behavior.
    return tokens[1], tokens[3]


def get_correlations_df(df, right_side):
    """Build a table of Spearman and Pearson correlations for one right-hand side.

    Args:
        df: DataFrame passed unchanged to ``correlations_for_group``.
        right_side: Substring a correlation key must contain to be included
            (presumably ``"ind"`` or ``"aggr"`` -- confirm against callers).

    Returns:
        DataFrame with columns ``spearman`` and ``pearson``, sorted by metric
        label and indexed by a two-level MultiIndex named
        ``("relative", "independent")`` produced by ``split_metrics_string``.
    """
    raw = correlations_for_group(df)

    # Dropping the final "_<method>" token collapses the pearson/spearman
    # variants of a pair into a single label.
    labels = {key.rpartition("_")[0] for key in raw.index if right_side in key}

    records = [
        {
            "metrics": label,
            "spearman": raw[f"{label}_spearman"],
            "pearson": raw[f"{label}_pearson"],
        }
        for label in labels
    ]

    table = pd.DataFrame.from_records(data=records, index="metrics").sort_index()
    table.index = pd.MultiIndex.from_tuples(
        [split_metrics_string(label) for label in table.index],
        names=["relative", "independent"],
    )
    return table


def get_correlations_for_groups(df, right_side):
    """Compute correlation tables for the whole frame and for each flag subset.

    Rows where both ``end_to_start`` and ``start_to_end`` are False are
    included in every subset. The subsets are labelled ``golden``,
    ``golden+s2e``, ``golden+e2s`` and ``golden+e2s+s2e`` according to which
    flag values are selected in addition to those rows.

    Args:
        df: DataFrame with ``end_to_start`` and ``start_to_end`` columns, plus
            whatever ``get_correlations_df`` needs.
        right_side: Forwarded to ``get_correlations_df``.

    Returns:
        The per-group tables concatenated side by side (``axis=1``), keyed by
        ``all`` followed by the four group labels.
    """
    tables = {"all": get_correlations_df(df, right_side=right_side)}

    e2s_flags = df["end_to_start"]
    s2e_flags = df["start_to_end"]
    # The both-False rows belong to every group, so build that mask once.
    golden_rows = e2s_flags.eq(False) & s2e_flags.eq(False)

    for e2s, s2e in ((False, False), (False, True), (True, False), (True, True)):
        label = "golden" + ("+e2s" if e2s else "") + ("+s2e" if s2e else "")
        selected = (e2s_flags.eq(e2s) & s2e_flags.eq(s2e)) | golden_rows
        tables[label] = get_correlations_df(df[selected], right_side=right_side)

    return pd.concat(tables, axis=1)