File size: 7,285 Bytes
1380717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: https://opensource.org/license/bsd-3-clause/
"""Module containing a database to deal with packs"""
from gitdb.db.base import (
    FileDBBase,
    ObjectDBR,
    CachingDB
)

from gitdb.util import LazyMixin

from gitdb.exc import (
    BadObject,
    UnsupportedOperation,
    AmbiguousObjectName
)

from gitdb.pack import PackEntity

from functools import reduce

import os
import glob

__all__ = ('PackedDB', )

#{ Utilities


class PackedDB(FileDBBase, ObjectDBR, CachingDB, LazyMixin):

    """A database operating on a set of object packs"""

    # sort the priority list every N queries
    # Higher values are better, performance tests don't show this has
    # any effect, but it should have one
    _sort_interval = 500

    def __init__(self, root_path):
        super().__init__(root_path)
        # list of lists with three items:
        # * hits - number of times the pack was hit with a request
        # * entity - Pack entity instance
        # * sha_to_index - PackIndexFile.sha_to_index method for direct cache query
        # self._entities = list()       # lazy loaded list
        self._hit_count = 0             # amount of hits
        self._st_mtime = 0              # last modification data of our root path

    def _set_cache_(self, attr):
        if attr == '_entities':
            self._entities = list()
            self.update_cache(force=True)
        # END handle entities initialization

    def _sort_entities(self):
        self._entities.sort(key=lambda l: l[0], reverse=True)

    def _pack_info(self, sha):
        """:return: tuple(entity, index) for an item at the given sha
        :param sha: 20 or 40 byte sha
        :raise BadObject:
        **Note:** This method is not thread-safe, but may be hit in multi-threaded
            operation. The worst thing that can happen though is a counter that
            was not incremented, or the list being in wrong order. So we safe
            the time for locking here, lets see how that goes"""
        # presort ?
        if self._hit_count % self._sort_interval == 0:
            self._sort_entities()
        # END update sorting

        for item in self._entities:
            index = item[2](sha)
            if index is not None:
                item[0] += 1            # one hit for you
                self._hit_count += 1    # general hit count
                return (item[1], index)
            # END index found in pack
        # END for each item

        # no hit, see whether we have to update packs
        # NOTE: considering packs don't change very often, we safe this call
        # and leave it to the super-caller to trigger that
        raise BadObject(sha)

    #{ Object DB Read

    def has_object(self, sha):
        try:
            self._pack_info(sha)
            return True
        except BadObject:
            return False
        # END exception handling

    def info(self, sha):
        entity, index = self._pack_info(sha)
        return entity.info_at_index(index)

    def stream(self, sha):
        entity, index = self._pack_info(sha)
        return entity.stream_at_index(index)

    def sha_iter(self):
        for entity in self.entities():
            index = entity.index()
            sha_by_index = index.sha
            for index in range(index.size()):
                yield sha_by_index(index)
            # END for each index
        # END for each entity

    def size(self):
        sizes = [item[1].index().size() for item in self._entities]
        return reduce(lambda x, y: x + y, sizes, 0)

    #} END object db read

    #{ object db write

    def store(self, istream):
        """Storing individual objects is not feasible as a pack is designed to
        hold multiple objects. Writing or rewriting packs for single objects is
        inefficient"""
        raise UnsupportedOperation()

    #} END object db write

    #{ Interface

    def update_cache(self, force=False):
        """
        Update our cache with the actually existing packs on disk. Add new ones,
        and remove deleted ones. We keep the unchanged ones

        :param force: If True, the cache will be updated even though the directory
            does not appear to have changed according to its modification timestamp.
        :return: True if the packs have been updated so there is new information,
            False if there was no change to the pack database"""
        stat = os.stat(self.root_path())
        if not force and stat.st_mtime <= self._st_mtime:
            return False
        # END abort early on no change
        self._st_mtime = stat.st_mtime

        # packs are supposed to be prefixed with pack- by git-convention
        # get all pack files, figure out what changed
        pack_files = set(glob.glob(os.path.join(self.root_path(), "pack-*.pack")))
        our_pack_files = {item[1].pack().path() for item in self._entities}

        # new packs
        for pack_file in (pack_files - our_pack_files):
            # init the hit-counter/priority with the size, a good measure for hit-
            # probability. Its implemented so that only 12 bytes will be read
            entity = PackEntity(pack_file)
            self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index])
        # END for each new packfile

        # removed packs
        for pack_file in (our_pack_files - pack_files):
            del_index = -1
            for i, item in enumerate(self._entities):
                if item[1].pack().path() == pack_file:
                    del_index = i
                    break
                # END found index
            # END for each entity
            assert del_index != -1
            del(self._entities[del_index])
        # END for each removed pack

        # reinitialize prioritiess
        self._sort_entities()
        return True

    def entities(self):
        """:return: list of pack entities operated upon by this database"""
        return [item[1] for item in self._entities]

    def partial_to_complete_sha(self, partial_binsha, canonical_length):
        """:return: 20 byte sha as inferred by the given partial binary sha
        :param partial_binsha: binary sha with less than 20 bytes
        :param canonical_length: length of the corresponding canonical representation.
            It is required as binary sha's cannot display whether the original hex sha
            had an odd or even number of characters
        :raise AmbiguousObjectName:
        :raise BadObject: """
        candidate = None
        for item in self._entities:
            item_index = item[1].index().partial_sha_to_index(partial_binsha, canonical_length)
            if item_index is not None:
                sha = item[1].index().sha(item_index)
                if candidate and candidate != sha:
                    raise AmbiguousObjectName(partial_binsha)
                candidate = sha
            # END handle full sha could be found
        # END for each entity

        if candidate:
            return candidate

        # still not found ?
        raise BadObject(partial_binsha)

    #} END interface