File size: 1,310 Bytes
a3ee979
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from pathlib import Path

import pyarrow as pa
import numpy as np


class ArrowMetadataProvider:
    """Serve row metadata from Arrow IPC files, addressed by contiguous row ids.

    Every regular file found (recursively) under the given folder is
    memory-mapped and read with the Arrow IPC file reader. The per-file tables
    are concatenated in sorted path order, so an id is the row position in the
    concatenated table.

    Code taken from:
        https://github.dev/rom1504/clip-retrieval
    """

    def __init__(self, arrow_folder):
        """Load all files below ``arrow_folder`` into a single table.

        Args:
            arrow_folder: Directory that is searched recursively; every regular
                file in it is opened as an Arrow IPC file.
        """
        # Sorting the paths keeps the row ids stable from one run to the next.
        arrow_files = [str(a) for a in sorted(Path(arrow_folder).glob("**/*")) if a.is_file()]
        self.table = pa.concat_tables(
            [
                pa.ipc.RecordBatchFileReader(pa.memory_map(arrow_file, "r")).read_all()
                for arrow_file in arrow_files
            ]
        )

    def get(self, ids, cols=None):
        """Return the metadata rows for ``ids`` as a list of dictionaries.

        Args:
            ids: Iterable of row positions in the concatenated table. Rows are
                returned in the order the ids are given.
            cols: Optional iterable of column names to keep. Names that are not
                in the table schema are ignored. ``None`` keeps every column.

        Returns:
            A list with one ``{column: value}`` dictionary per selected row, or
            an empty list when ``ids`` is empty.
        """
        # Materialise once so generators work and emptiness can be checked.
        ids = list(ids)
        if not ids:
            # pa.concat_tables rejects an empty list, so answer directly.
            return []

        schema_names = self.table.schema.names
        if cols is None:
            cols = schema_names
        else:
            requested = set(cols)
            # Keep schema order so the output column order is deterministic
            # (a plain set intersection yields an arbitrary order).
            cols = [name for name in dict.fromkeys(schema_names) if name in requested]

        rows = pa.concat_tables([self.table[i:(i + 1)] for i in ids])
        return rows.select(cols).to_pandas().to_dict("records")


def meta_to_dict(meta):
    """Convert a metadata mapping into a dictionary of plain Python values.

    Args:
        meta: Mapping of field name to value (anything exposing ``.items()``).

    Returns:
        A new ``dict`` with the same keys where ``bytes`` values are decoded to
        ``str``, NumPy scalars (and one-element arrays) are unwrapped with
        ``.item()``, and other NumPy arrays are converted with ``.tolist()``.
        All remaining values are passed through unchanged.
    """
    output = {}
    for key, value in meta.items():
        if isinstance(value, bytes):
            value = value.decode()
        elif isinstance(value, np.ndarray) and value.size != 1:
            # .item() only works on exactly one element; empty or longer
            # arrays would raise ValueError, so convert them to lists.
            value = value.tolist()
        elif type(value).__module__ == np.__name__:
            value = value.item()
        output[key] = value
    return output