File size: 3,284 Bytes
705d528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import json
import itertools


class TaxonomicNode:
    """
    A single named entry in a taxonomic hierarchy.

    Nodes form a trie keyed by successive name components. Every node stores
    the integer index it was created with; indices for new nodes are taken
    from the ``size`` counter on the shared ``root`` object, which is
    incremented each time a node is created.
    """

    __slots__ = ("name", "index", "root", "_children")

    def __init__(self, name, index, root):
        self.name = name
        self.index = index
        # Object owning the ``size`` counter used to number newly created nodes.
        self.root = root
        # Maps a child's name to its TaxonomicNode.
        self._children = {}

    def add(self, name) -> None:
        """
        Insert the path ``name`` below this node, creating missing nodes.

        Args:
            name: Sequence of name components, relative to this node. An
                empty (falsy) value is a no-op.
        """
        if not name:
            return

        first, rest = name[0], name[1:]
        if first not in self._children:
            # The new node takes the current counter value as its index,
            # then the counter moves on.
            self._children[first] = TaxonomicNode(first, self.root.size, self.root)
            self.root.size += 1

        self._children[first].add(rest)

    def children(self, name) -> set:
        """
        Return the direct children of the node reached by following ``name``.

        Args:
            name: Sequence of name components, relative to this node. A falsy
                value selects this node itself.

        Returns:
            A set of ``(child_name, child_index)`` tuples; an empty set when
            the path does not exist.
        """
        if not name:
            return {(child.name, child.index) for child in self._children.values()}

        first, rest = name[0], name[1:]
        if first not in self._children:
            return set()

        return self._children[first].children(rest)

    def __iter__(self):
        """
        Yield ``(full_name, index)`` for this node and all of its descendants.

        Descendant names are prefixed with this node's name and a single
        space, so the name accumulates along the path from this node down.
        """
        yield self.name, self.index

        for child in self._children.values():
            for name, index in child:
                yield f"{self.name} {name}", index

    @classmethod
    def from_dict(cls, dct, root):
        """
        Rebuild a node (and, recursively, its subtree) from a dict.

        Args:
            dct: Mapping with the keys ``"name"``, ``"index"`` and
                ``"children"`` (a list of dicts of the same shape).
            root: Object to store as ``root`` on every rebuilt node.

        Returns:
            The reconstructed node.
        """
        node = cls(dct["name"], dct["index"], root)
        node._children = {child["name"]: cls.from_dict(child, root) for child in dct["children"]}
        return node



class TaxonomicTree:
    """
    Lookup structure for taxonomic names and their descendants.

    Top-level entries ("kingdoms") are stored by name, and ``size`` is the
    counter from which every newly created node receives its integer index.
    """

    def __init__(self):
        self.kingdoms = {}
        self.size = 0

    def add(self, name: list[str]):
        """Insert the full path ``name``, creating any nodes that are missing."""
        if not name:
            return

        head = name[0]
        try:
            node = self.kingdoms[head]
        except KeyError:
            # Unknown kingdom: number it with the current counter value.
            node = self.kingdoms[head] = TaxonomicNode(head, self.size, self)
            self.size += 1

        node.add(name[1:])

    def children(self, name=None):
        """
        Return ``(name, index)`` pairs for the direct children of ``name``.

        With no (or an empty) ``name`` the kingdoms themselves are returned.
        A path that does not exist gives an empty set.
        """
        if name:
            node = self.kingdoms.get(name[0])
            if node is None:
                return set()
            return node.children(name[1:])

        return {(node.name, node.index) for node in self.kingdoms.values()}

    def __iter__(self):
        """Yield every ``(full_name, index)`` pair, one kingdom subtree at a time."""
        for node in self.kingdoms.values():
            yield from node

    @classmethod
    def from_dict(cls, dct):
        """Rebuild a tree from a dict with ``"kingdoms"`` and ``"size"`` keys."""
        tree = cls()
        kingdoms = {}
        for entry in dct["kingdoms"]:
            kingdoms[entry["name"]] = TaxonomicNode.from_dict(entry, tree)
        tree.kingdoms = kingdoms
        tree.size = dct["size"]
        return tree


class TaxonomicJsonEncoder(json.JSONEncoder):
    """
    JSON encoder that knows how to serialize TaxonomicTree and TaxonomicNode.

    The emitted dicts use the keys ``"name"``, ``"index"``, ``"children"``
    for nodes and ``"kingdoms"``, ``"size"`` for trees. Nested nodes are
    left as objects in the lists so the encoder visits them in turn.
    """

    def default(self, obj):
        """
        Return a JSON-serializable stand-in for ``obj``.

        Raises:
            TypeError: Raised by the base class when ``obj`` is neither a
                TaxonomicNode nor a TaxonomicTree.
        """
        if isinstance(obj, TaxonomicNode):
            return {
                "name": obj.name,
                "index": obj.index,
                "children": list(obj._children.values()),
            }
        if isinstance(obj, TaxonomicTree):
            return {
                "kingdoms": list(obj.kingdoms.values()),
                "size": obj.size,
            }
        # super() is already bound to self; passing self again made this call
        # fail with an arity TypeError instead of the standard
        # "not JSON serializable" error from the base class.
        return super().default(obj)



def batched(iterable, n):
    """
    Split ``iterable`` into chunks of up to ``n`` items and yield each one transposed.

    Each chunk is yielded as ``zip(*chunk)``, not as the chunk itself, so the
    items must themselves be iterable. For example, a chunk of
    ``(name, index)`` pairs comes out as a zip iterator producing a tuple of
    names followed by a tuple of indices. The last chunk may be shorter
    than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than one (raised when iteration starts).
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(source, n))
        if not chunk:
            return
        yield zip(*chunk)