File size: 3,366 Bytes
290c238
2cfb891
290c238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2cfb891
 
 
290c238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2cfb891
 
 
290c238
 
 
 
2cfb891
 
290c238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2cfb891
290c238
 
2cfb891
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import itertools
import json


class TaxonomicNode:
    """
    One named level of a taxonomic hierarchy, holding its child levels.

    Attributes:
        name: the label of this level (a single component of a taxonomic name).
        index: the integer assigned to this node when it was created.
        root: the shared object whose ``size`` attribute is used as the running
            counter for assigning indices to newly created nodes.
    """

    __slots__ = ("name", "index", "root", "_children")

    def __init__(self, name, index, root):
        self.name = name
        self.index = index
        self.root = root
        # Maps a child's name to its TaxonomicNode.
        self._children = {}

    def add(self, name):
        """
        Insert the path ``name`` (a sequence of labels) below this node.

        Missing nodes along the path are created; each new node takes the
        current ``root.size`` as its index and then ``root.size`` is incremented.
        Existing nodes are reused. An empty (or falsy) ``name`` is a no-op.

        Returns:
            None.
        """
        if not name:
            return

        node = self
        for part in name:
            if part not in node._children:
                node._children[part] = TaxonomicNode(
                    part, node.root.size, node.root
                )
                node.root.size += 1
            node = node._children[part]

    def children(self, name):
        """
        Return the direct children of the node reached by following ``name``.

        Args:
            name: sequence of labels to follow starting from this node's
                children; empty (or falsy) means this node itself.

        Returns:
            A set of ``(child_name, child_index)`` tuples, or an empty set if
            any label along the path is unknown.
        """
        node = self
        for part in name or ():
            if part not in node._children:
                return set()
            node = node._children[part]

        return {(child.name, child.index) for child in node._children.values()}

    def __iter__(self):
        """
        Yield ``(name, index)`` for this node and then every descendant.

        Descendant names are prefixed with this node's name and a single space,
        so deeper nodes yield the space-joined path starting at this node.
        """
        yield self.name, self.index

        for child in self._children.values():
            for name, index in child:
                yield f"{self.name} {name}", index

    @classmethod
    def from_dict(cls, dct, root):
        """
        Rebuild a node (and, recursively, its subtree) from a dict.

        Args:
            dct: mapping with keys ``"name"``, ``"index"`` and ``"children"``
                (an iterable of dicts of the same shape).
            root: the object stored as ``root`` on every rebuilt node.
        """
        node = cls(dct["name"], dct["index"], root)
        node._children = {
            child["name"]: cls.from_dict(child, root) for child in dct["children"]
        }
        return node


class TaxonomicTree:
    """
    Lookup structure for taxonomic names and the levels directly below them.

    Every name inserted into the tree is assigned an integer index; ``size``
    is the counter used to hand out those indices and is what ``len()`` reports.
    """

    def __init__(self):
        # Top-level nodes keyed by their name.
        self.kingdoms = {}
        # Next index to assign to a newly created node.
        self.size = 0

    def add(self, name: list[str]):
        """
        Insert the path ``name`` into the tree; an empty name is ignored.

        The first label selects (or creates) a top-level node, and the
        remaining labels are handed to that node's ``add``.
        """
        if name:
            head = name[0]
            tail = name[1:]
            if head not in self.kingdoms:
                # The new node takes the current size as its index.
                self.kingdoms[head] = TaxonomicNode(head, self.size, self)
                self.size += 1

            self.kingdoms[head].add(tail)

    def children(self, name=None):
        """
        Return ``(name, index)`` pairs for the levels directly below ``name``.

        With no name (or an empty one) the top-level nodes are returned. An
        unknown first label yields an empty set.
        """
        if name:
            head = name[0]
            tail = name[1:]
            if head in self.kingdoms:
                return self.kingdoms[head].children(tail)
            return set()

        return {(kingdom.name, kingdom.index) for kingdom in self.kingdoms.values()}

    def __iter__(self):
        """Yield every item produced by iterating each top-level node in turn."""
        for kingdom in self.kingdoms.values():
            for entry in kingdom:
                yield entry

    def __len__(self):
        """Return the current value of the index counter ``size``."""
        return self.size

    @classmethod
    def from_dict(cls, dct):
        """
        Rebuild a tree from a dict with ``"kingdoms"`` and ``"size"`` keys.

        Each entry of ``dct["kingdoms"]`` is passed to
        ``TaxonomicNode.from_dict`` with the new tree as its root.
        """
        tree = cls()
        restored = {}
        for entry in dct["kingdoms"]:
            restored[entry["name"]] = TaxonomicNode.from_dict(entry, tree)
        tree.kingdoms = restored
        tree.size = dct["size"]
        return tree


class TaxonomicJsonEncoder(json.JSONEncoder):
    """
    JSON encoder that knows how to serialize TaxonomicTree and TaxonomicNode.

    Nodes become ``{"name", "index", "children"}`` dicts and trees become
    ``{"kingdoms", "size"}`` dicts; nested nodes are emitted as objects and are
    encoded in turn through this same hook.
    """

    def default(self, obj):
        """
        Return a JSON-serializable stand-in for ``obj``.

        Raises:
            TypeError: (from the base class) if ``obj`` is neither a
                TaxonomicNode nor a TaxonomicTree.
        """
        if isinstance(obj, TaxonomicNode):
            return {
                "name": obj.name,
                "index": obj.index,
                "children": list(obj._children.values()),
            }
        if isinstance(obj, TaxonomicTree):
            return {
                "kingdoms": list(obj.kingdoms.values()),
                "size": obj.size,
            }
        # Delegate to the base implementation so unsupported types raise the
        # standard "not JSON serializable" TypeError. super() already binds
        # self, so only obj is passed.
        return super().default(obj)


def batched(iterable, n):
    """
    Split ``iterable`` into consecutive batches of at most ``n`` items.

    Each batch is yielded as ``zip(*batch)``, i.e. the batch transposed: for an
    iterable of tuples, every yielded zip iterates over per-position tuples of
    that batch. The final batch may hold fewer than ``n`` items.

    Because this is a generator, the ``ValueError`` for ``n < 1`` is raised
    when iteration starts, not when ``batched`` is called.
    """
    # batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        raise ValueError("n must be at least one")

    source = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(source, n))
        if not chunk:
            # Source exhausted.
            return
        yield zip(*chunk)