File size: 5,894 Bytes
9d1ee0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import random
from collections import OrderedDict


def get_dict_first_item(dict_obj):
    """Return the first ``(key, value)`` pair of ``dict_obj`` in iteration order.

    Returns:
        A ``(key, value)`` tuple, or ``None`` when ``dict_obj`` is empty.
    """
    key_iter = iter(dict_obj)
    try:
        first_key = next(key_iter)
    except StopIteration:
        # Nothing to iterate over: mirror the implicit None of an empty loop.
        return None
    return first_key, dict_obj[first_key]


def sort_dict(dict_obj, key=None, reverse=False):
    """Return an ``OrderedDict`` holding the items of ``dict_obj`` in sorted order.

    Args:
        dict_obj: mapping whose ``(key, value)`` items are sorted.
        key: optional function applied to each ``(key, value)`` tuple to
            produce the sort key; ``None`` compares the tuples directly.
        reverse: when True, sort in descending order.
    """
    ordered_pairs = list(dict_obj.items())
    ordered_pairs.sort(key=key, reverse=reverse)
    return OrderedDict(ordered_pairs)


def create_multidict(key_list, value_list):
    """Build a multidict (dict of lists) from parallel key and value lists.

    Values that share a key are collected, in input order, into that key's
    list. Both lists must have the same length.
    """
    assert len(key_list) == len(value_list)
    grouped = {}
    for name, item in zip(key_list, value_list):
        if name in grouped:
            grouped[name].append(item)
        else:
            grouped[name] = [item]
    return grouped


def convert_multidict_to_list(multidict_obj):
    """Flatten a multidict into parallel key and value lists.

    Each key is repeated once per value it holds, so the two returned lists
    have equal length and line up position by position.

    Returns:
        A ``(key_list, value_list)`` tuple.
    """
    flat_keys = []
    flat_values = []
    for name, members in multidict_obj.items():
        flat_keys.extend([name] * len(members))
        flat_values.extend(members)
    return flat_keys, flat_values


def convert_multidict_to_records(multidict_obj, key_map=None, raise_if_key_error=True):
    """Render a multidict as a list of ``'value,key'`` record strings.

    Args:
        multidict_obj: dict mapping each key to an iterable of values.
        key_map: optional mapping used to translate each key before it is
            written into the records. ``None`` writes the keys unchanged.
        raise_if_key_error: when True, a key missing from ``key_map`` raises
            ``KeyError``; when False, the original key is used as a fallback.

    Returns:
        A list with one ``'{value},{key}'`` string per value.
    """
    records = []
    for key, values in multidict_obj.items():
        if key_map is None:
            label = key
        elif raise_if_key_error:
            label = key_map[key]
        else:
            label = key_map.get(key, key)
        records.extend('{},{}'.format(value, label) for value in values)
    return records
    
    
def sample_multidict(multidict_obj, num_keys, num_per_key=None):
    """Randomly pick a subset of keys (and optionally of values) from a multidict.

    Args:
        multidict_obj: dict mapping each key to a list of values.
        num_keys: number of keys to draw; capped at the number of keys present.
        num_per_key: when given, number of values to draw per selected key,
            capped at the number of values that key holds. When ``None``, the
            selected keys keep their original value lists (the same list
            objects, not copies).

    Returns:
        A new dict containing the sampled keys and values.
    """
    key_count = min(num_keys, len(multidict_obj))
    chosen_keys = random.sample(list(multidict_obj), key_count)
    sampled = {}
    for key in chosen_keys:
        candidates = multidict_obj[key]
        if num_per_key is None:
            sampled[key] = candidates
        else:
            take = min(num_per_key, len(candidates))
            sampled[key] = random.sample(candidates, take)
    return sampled
    
    
def split_multidict_on_key(multidict_obj, split_ratio, use_shuffle=False):
    """Partition the keys of a multidict into consecutive groups.

    Args:
        multidict_obj: dict to split; each key lands in exactly one part.
        split_ratio: list or tuple of relative part sizes, e.g. ``[8, 1, 1]``.
        use_shuffle: when True, shuffle the key order before slicing.

    Returns:
        A list with one dict per entry of ``split_ratio``. Part boundaries are
        the rounded cumulative fractions of the key count.
    """
    assert isinstance(multidict_obj, dict)
    assert isinstance(split_ratio, (list, tuple))

    ratio_total = float(sum(split_ratio))
    fractions = [part / ratio_total for part in split_ratio]
    cumulative = [sum(fractions[:end]) for end in range(len(fractions) + 1)]
    key_total = len(multidict_obj)
    boundaries = [int(round(key_total * share)) for share in cumulative]

    keys = list(multidict_obj)
    if use_shuffle:
        random.shuffle(keys)

    parts = []
    # Consecutive boundary pairs delimit the slice of keys owned by each part.
    for start, stop in zip(boundaries, boundaries[1:]):
        parts.append({key: multidict_obj[key] for key in keys[start:stop]})
    return parts
    
    
def split_multidict_on_value(multidict_obj, split_ratio, use_shuffle=False):
    """Partition the values under every key of a multidict.

    Every returned part contains all keys of ``multidict_obj``; each key's
    values are divided between the parts according to ``split_ratio``.

    Args:
        multidict_obj: dict mapping each key to a sliceable sequence of values.
        split_ratio: list or tuple of relative part sizes, e.g. ``[8, 1, 1]``.
        use_shuffle: when True, shuffle a copy of each value list before
            slicing; the input lists are left untouched.

    Returns:
        A list with one dict per entry of ``split_ratio``.
    """
    assert isinstance(multidict_obj, dict)
    assert isinstance(split_ratio, (list, tuple))

    ratio_total = float(sum(split_ratio))
    fractions = [part / ratio_total for part in split_ratio]
    cumulative = [sum(fractions[:end]) for end in range(len(fractions) + 1)]
    parts = [{} for _ in range(len(split_ratio))]

    for key, values in multidict_obj.items():
        boundaries = [int(round(len(values) * share)) for share in cumulative]
        # Work on a copy so shuffling never reorders the caller's data.
        pool = values[:]
        if use_shuffle:
            random.shuffle(pool)
        for part, start, stop in zip(parts, boundaries, boundaries[1:]):
            part[key] = pool[start:stop]
    return parts
    
    
def get_multidict_info(multidict_obj, with_print=False, desc=None):
    """Compute size statistics for a multidict.

    Args:
        multidict_obj: dict mapping each key to a sized collection of values.
        with_print: when True, also print the statistics, one per line.
        desc: label prefixed to every printed line; ``'<unknown>'`` is used
            when it is empty or ``None``.

    Returns:
        A dict with ``num_keys``, ``num_values``, ``max_values_per_key``,
        ``min_values_per_key`` and ``avg_values_per_key``. For an empty
        multidict every statistic is 0.
    """
    num_list = [len(val) for val in multidict_obj.values()]
    num_keys = len(num_list)
    num_values = sum(num_list)
    if num_keys == 0:
        # max()/min() raise ValueError on an empty sequence, so the empty
        # case has to be handled before they are called.
        max_values_per_key = 0
        min_values_per_key = 0
        avg_values_per_key = 0
    else:
        max_values_per_key = max(num_list)
        min_values_per_key = min(num_list)
        avg_values_per_key = num_values / num_keys
    info = {
        'num_keys': num_keys,
        'num_values': num_values,
        'max_values_per_key': max_values_per_key,
        'min_values_per_key': min_values_per_key,
        'avg_values_per_key': avg_values_per_key,
    }
    if with_print:
        desc = desc or '<unknown>'
        print('{} key number:    {}'.format(desc, info['num_keys']))
        print('{} value number:    {}'.format(desc, info['num_values']))
        print('{} max number per-key: {}'.format(desc, info['max_values_per_key']))
        print('{} min number per-key: {}'.format(desc, info['min_values_per_key']))
        print('{} avg number per-key: {:.2f}'.format(desc, info['avg_values_per_key']))
    return info
    

def filter_multidict_by_number(multidict_obj, lower, upper=None):
    """Keep only the keys whose number of values lies within a range.

    Args:
        multidict_obj: dict mapping each key to a sized collection of values.
        lower: minimum number of values (inclusive) a key must hold.
        upper: maximum number of values (inclusive); ``None`` means no upper
            bound. Must not be smaller than ``lower``.

    Returns:
        A new dict with the qualifying keys; value objects are not copied.
    """
    bounded = upper is not None
    if bounded:
        assert lower <= upper, 'lower must not be greater than upper'

    kept = {}
    for key, values in multidict_obj.items():
        size = len(values)
        if bounded:
            in_range = lower <= size <= upper
        else:
            in_range = lower <= size
        if in_range:
            kept[key] = values
    return kept
        
        
def sort_multidict_by_number(multidict_obj, num_keys_to_keep=None, reverse=True):
    """Order the keys of a multidict by how many values each one holds.

    Args:
        multidict_obj: dict mapping each key to a sized collection of values.
        num_keys_to_keep: keep only this many leading keys after sorting;
            ``None`` keeps all of them. Values larger than the number of keys
            are capped.
        reverse: when True (the default), keys with the most values come
            first (descending order); when False, keys with the fewest values
            come first.

    Returns:
        An ``OrderedDict`` with the retained keys in sorted order.
    """
    total = len(multidict_obj)
    limit = total if num_keys_to_keep is None else min(num_keys_to_keep, total)

    ranked = list(multidict_obj.items())
    ranked.sort(key=lambda pair: len(pair[1]), reverse=reverse)

    # Clamp at zero so a negative limit keeps nothing instead of slicing
    # from the end of the list.
    return OrderedDict(ranked[:max(limit, 0)])

    
def merge_multidict(*mdicts):
    """Combine any number of multidicts into one new multidict.

    Values for a key that appears in several inputs are concatenated in
    argument order. The inputs and their value lists are not modified.
    """
    merged = {}
    for mdict in mdicts:
        for key, values in mdict.items():
            if key not in merged:
                # Start from a fresh list so the inputs are never aliased.
                merged[key] = []
            merged[key].extend(values)
    return merged
    
    
def invert_multidict(multidict_obj):
    """Swap the roles of keys and values in a multidict.

    Every value becomes a key of the result, mapped to the list of original
    keys under which it appeared (one entry per occurrence, in iteration
    order).
    """
    inverted = {}
    for key, values in multidict_obj.items():
        for item in values:
            if item in inverted:
                inverted[item].append(key)
            else:
                inverted[item] = [key]
    return inverted