bioclip-demo / lib.py
Samuel Stevens
try hierarchical averaging
a33c93d
"""
Mostly a TaxonomicTree class that implements a taxonomy and some helpers for easily
walking and looking in the tree.
A tree is an arrangement of TaxonomicNodes.
"""
import itertools
import json
class TaxonomicNode:
__slots__ = ("name", "index", "root", "_children")
def __init__(self, name, index, root):
self.name = name
self.index = index
self.root = root
self._children = {}
def add(self, name):
added = 0
if not name:
return added
first, rest = name[0], name[1:]
if first not in self._children:
self._children[first] = TaxonomicNode(first, self.root.size, self.root)
self.root.size += 1
self._children[first].add(rest)
def children(self, name):
if not name:
return set((child.name, child.index) for child in self._children.values())
first, rest = name[0], name[1:]
if first not in self._children:
return set()
return self._children[first].children(rest)
def descendants(self, prefix=None):
"""Iterates over all values in the subtree that match prefix."""
if not prefix:
yield (self.name,), self.index
for child in self._children.values():
for name, i in child.descendants():
yield (self.name, *name), i
return
first, rest = prefix[0], prefix[1:]
if first not in self._children:
return
for name, i in self._children[first].descendants(rest):
yield (self.name, *name), i
def values(self):
"""Iterates over all (name, i) pairs in the tree."""
yield (self.name,), self.index
for child in self._children.values():
for name, index in child.values():
yield (self.name, *name), index
@classmethod
def from_dict(cls, dct, root):
node = cls(dct["name"], dct["index"], root)
node._children = {
child["name"]: cls.from_dict(child, root) for child in dct["children"]
}
return node
class TaxonomicTree:
"""
Efficient structure for finding taxonomic names and their descendants.
Also returns an integer index i for each possible name.
"""
def __init__(self):
self.kingdoms = {}
self.size = 0
def add(self, name: list[str]):
if not name:
return
first, rest = name[0], name[1:]
if first not in self.kingdoms:
self.kingdoms[first] = TaxonomicNode(first, self.size, self)
self.size += 1
self.kingdoms[first].add(rest)
def children(self, name=None):
if not name:
return set(
(kingdom.name, kingdom.index) for kingdom in self.kingdoms.values()
)
first, rest = name[0], name[1:]
if first not in self.kingdoms:
return set()
return self.kingdoms[first].children(rest)
def descendants(self, prefix=None):
"""Iterates over all values in the tree that match prefix."""
if not prefix:
# Give them all the subnodes
for kingdom in self.kingdoms.values():
yield from kingdom.descendants()
return
first, rest = prefix[0], prefix[1:]
if first not in self.kingdoms:
return
yield from self.kingdoms[first].descendants(rest)
def values(self):
"""Iterates over all (name, i) pairs in the tree."""
for kingdom in self.kingdoms.values():
yield from kingdom.values()
def __len__(self):
return self.size
@classmethod
def from_dict(cls, dct):
tree = cls()
tree.kingdoms = {
kingdom["name"]: TaxonomicNode.from_dict(kingdom, tree)
for kingdom in dct["kingdoms"]
}
tree.size = dct["size"]
return tree
class TaxonomicJsonEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, TaxonomicNode):
return {
"name": obj.name,
"index": obj.index,
"children": list(obj._children.values()),
}
elif isinstance(obj, TaxonomicTree):
return {
"kingdoms": list(obj.kingdoms.values()),
"size": obj.size,
}
else:
super().default(self, obj)
def batched(iterable, n):
# batched('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError("n must be at least one")
it = iter(iterable)
while batch := tuple(itertools.islice(it, n)):
yield zip(*batch)