Skip to content

Commit f07f203

Browse files
hannes-ucsc authored and
achave11-ucsc committed
Add ability to partition sets of UUIDs by infix (#7528)
1 parent 5f1deaa commit f07f203

File tree

2 files changed

+101
-27
lines changed

2 files changed

+101
-27
lines changed

src/azul/indexer/__init__.py

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -588,10 +588,3 @@ class BundlePartition(UUIDPartition):
588588

589589
def divisions(self, num_entities: int) -> int:
590590
return math.ceil(num_entities / self.max_partition_size)
591-
592-
def __attrs_post_init__(self):
593-
super().__attrs_post_init__()
594-
# Most bits in a v4 or v5 UUID are pseudo-random, including the leading
595-
# 32 bits but those are followed by a couple of deterministic ones.
596-
# For simplicity, we'll limit ourselves to 2 ** 32 leaf partitions.
597-
assert self.prefix_length <= 32, R('Too many partitions', self.prefix_length)

src/azul/uuids.py

Lines changed: 101 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
11
from hashlib import (
22
sha1,
33
)
4+
from itertools import (
5+
accumulate,
6+
)
47
import math
58
from typing import (
69
Any,
@@ -18,6 +21,7 @@
1821

1922
from azul import (
2023
R,
24+
cached_property,
2125
)
2226
from azul.types import (
2327
JSON,
@@ -181,43 +185,94 @@ class UUIDPartition(metaclass=UUIDPartitionMeta):
181185
#:
182186
prefix: int
183187

188+
#: The canonical string representation of UUIDs has five groups of
189+
#: hexadecimal digits separated by dashes. The first group is eight digits
190+
#: long, the last group twelve and the three groups in between are four
191+
#: digits long. The first and the last group are best suited for a random
192+
#: distribution of v4 and v5 UUIDs across partitions. By default, UUID
193+
#: partitions use the first group.
194+
#:
195+
group: int = 0
196+
184197
#: The partition that includes all UUIDs. Since this attribute holds an
185198
#: instance of this class, we can't initialize it here, but have to do so in
186199
#: the metaclass constructor.
187200
#:
188201
root: ClassVar[Self]
189202

203+
#: The width of each group in bits.
204+
#:
205+
group_lengths: ClassVar[tuple[int, ...]]
206+
group_lengths = tuple(4 * n for n in [8, 4, 4, 4, 12])
207+
208+
#: For each group, the number of bits that precede that group in the
209+
#: binary, 128-bit-wide representation of a UUID, i.e., the offset of the
210+
#: group's most significant bit from the most significant bit of the UUID.
211+
#:
212+
group_shifts: ClassVar[tuple[int, ...]]
213+
group_shifts = tuple(accumulate(group_lengths[:-1], initial=0))
214+
190215
def __attrs_post_init__(self):
191216
"""
192217
>>> UUIDPartition(prefix_length=0, prefix=1)
193218
... # doctest: +NORMALIZE_WHITESPACE
194219
Traceback (most recent call last):
195220
...
196221
AssertionError: R('If prefix length is 0, the prefix must be, too',
197-
UUIDPartition(prefix_length=0, prefix=1))
222+
UUIDPartition(prefix_length=0, prefix=1, group=0))
198223
199224
>>> UUIDPartition(prefix_length=1, prefix=3)
200225
... # doctest: +NORMALIZE_WHITESPACE
201226
Traceback (most recent call last):
202227
...
203228
AssertionError: R('Prefix has extra high-order bits set',
204-
UUIDPartition(prefix_length=1, prefix=3))
229+
UUIDPartition(prefix_length=1, prefix=3, group=0))
230+
231+
>>> UUIDPartition(prefix_length=1, prefix=0, group=5)
232+
... # doctest: +NORMALIZE_WHITESPACE
233+
Traceback (most recent call last):
234+
...
235+
AssertionError: R('Invalid group',
236+
UUIDPartition(prefix_length=1, prefix=0, group=5))
237+
238+
>>> UUIDPartition(prefix_length=1, prefix=0, group=-1)
239+
... # doctest: +NORMALIZE_WHITESPACE
240+
Traceback (most recent call last):
241+
...
242+
AssertionError: R('Invalid group',
243+
UUIDPartition(prefix_length=1, prefix=0, group=-1))
205244
245+
>>> UUIDPartition(prefix_length=49, prefix=0, group=4)
246+
Traceback (most recent call last):
247+
...
248+
AssertionError: R('Length of prefix exceeds that of group', 49, 48)
249+
250+
>>> UUIDPartition(prefix_length=17, prefix=0, group=1)
251+
Traceback (most recent call last):
252+
...
253+
AssertionError: R('Length of prefix exceeds that of group', 17, 16)
206254
"""
207255
assert self.prefix_length != 0 or self.prefix == 0, R(
208256
'If prefix length is 0, the prefix must be, too', self)
257+
assert 0 <= self.group < len(self.group_shifts), R(
258+
'Invalid group', self)
259+
group_length = self.group_lengths[self.group]
260+
assert self.prefix_length <= group_length, R(
261+
'Length of prefix exceeds that of group', self.prefix_length, group_length)
209262
assert 0 <= self.prefix < 2 ** self.prefix_length, R(
210263
'Prefix has extra high-order bits set', self)
211264

212265
@classmethod
213266
def from_json(cls, json: JSON) -> Self:
214267
return cls(prefix_length=json_int(json['prefix_length']),
215-
prefix=json_int(json['prefix']))
268+
prefix=json_int(json['prefix']),
269+
group=json_int(json['group']))
216270

217271
def to_json(self) -> MutableJSON:
218272
return {
219273
'prefix_length': self.prefix_length,
220-
'prefix': self.prefix
274+
'prefix': self.prefix,
275+
'group': self.group
221276
}
222277

223278
def contains(self, member: UUID) -> bool:
@@ -229,10 +284,30 @@ def contains(self, member: UUID) -> bool:
229284
True
230285
>>> p.contains(UUID('ffd4524e-14c4-41d7-9071-6cadab09d75c'))
231286
True
287+
288+
>>> p = UUIDPartition(prefix_length=5, prefix=0b0110_0, group=4)
289+
>>> p.contains(UUID('fdd4524e-14c4-41d7-9071-66adab09d75c'))
290+
True
291+
>>> p.contains(UUID('fdd4524e-14c4-41d7-9071-67adab09d75c'))
292+
True
293+
>>> p.contains(UUID('fdd4524e-14c4-41d7-9071-68adab09d75c'))
294+
False
295+
296+
>>> p = UUIDPartition(prefix_length=48, prefix=0x68adab09d75c, group=4)
297+
>>> p.contains(UUID('fdd4524e-14c4-41d7-9071-68adab09d75c'))
298+
True
299+
>>> p.contains(UUID('fdd4524e-14c4-41d7-9071-68adab09d75d'))
300+
False
232301
"""
233-
# UUIDs are 128 bit integers
234-
shift = 128 - self.prefix_length
235-
return member.int >> shift == self.prefix
302+
mask, shift = self._mask_and_shift
303+
return (member.int & mask) >> shift == self.prefix
304+
305+
@cached_property
306+
def _mask_and_shift(self) -> tuple[int, int]:
307+
group_shift = self.group_shifts[self.group]
308+
shift = 128 - self.prefix_length - group_shift
309+
mask = (1 << (128 - group_shift)) - 1
310+
return mask, shift
236311

237312
def divide(self, num_divisions: int) -> list[Self]:
238313
"""
@@ -250,18 +325,24 @@ def divide(self, num_divisions: int) -> list[Self]:
250325
251326
>>> sorted(UUIDPartition.root.divide(3))
252327
... # doctest: +NORMALIZE_WHITESPACE
253-
[UUIDPartition(prefix_length=2, prefix=0),
254-
UUIDPartition(prefix_length=2, prefix=1),
255-
UUIDPartition(prefix_length=2, prefix=2),
256-
UUIDPartition(prefix_length=2, prefix=3)]
328+
[UUIDPartition(prefix_length=2, prefix=0, group=0),
329+
UUIDPartition(prefix_length=2, prefix=1, group=0),
330+
UUIDPartition(prefix_length=2, prefix=2, group=0),
331+
UUIDPartition(prefix_length=2, prefix=3, group=0)]
332+
333+
>>> UUIDPartition(prefix_length=2, prefix=0, group=4).divide(2)
334+
... # doctest: +NORMALIZE_WHITESPACE
335+
[UUIDPartition(prefix_length=3, prefix=0, group=4),
336+
UUIDPartition(prefix_length=3, prefix=1, group=4)]
257337
"""
258338
assert num_divisions > 0, R('Number of divisions must be 1 or more')
259339
prefix_length = math.ceil(math.log2(num_divisions))
260340
num_divisions = 2 ** prefix_length
261341
cls = type(self)
262342
return [
263343
cls(prefix_length=self.prefix_length + prefix_length,
264-
prefix=(self.prefix << prefix_length) + prefix)
344+
prefix=(self.prefix << prefix_length) + prefix,
345+
group=self.group)
265346
for prefix in range(num_divisions)
266347
]
267348

@@ -273,28 +354,28 @@ def __str__(self) -> str:
273354
returned by this function.
274355
275356
>>> str(UUIDPartition.root)
276-
'-'
357+
'-@0'
277358
278359
0b1111_1110 == 0xfe
279360
0b1111_1111 == 0xff
280-
>>> str(UUIDPartition(prefix_length=7, prefix=0b1111_111))
281-
'fe-ff'
361+
>>> str(UUIDPartition(prefix_length=7, prefix=0b1111_111, group=4))
362+
'fe-ff@4'
282363
283364
Leading zeroes in the high and low end of the range:
284365
285366
0b0000_1110 == 0x0e
286367
0b0000_1111 == 0x0f
287-
>>> str(UUIDPartition(prefix_length=7, prefix=0b0000_111))
288-
'0e-0f'
368+
>>> str(UUIDPartition(prefix_length=7, prefix=0b0000_111, group=4))
369+
'0e-0f@4'
289370
290371
A partition twice as big (a binary prefix that's one bit shorter):
291372
292373
0b0000_1100 = 0x0c
293374
0b0000_1101 = 0x0d
294375
0b0000_1110 = 0x0e
295376
0b0000_1111 = 0x0f
296-
>>> str(UUIDPartition(prefix_length=6, prefix=0b0000_11))
297-
'0c-0f'
377+
>>> str(UUIDPartition(prefix_length=6, prefix=0b0000_11, group=4))
378+
'0c-0f@4'
298379
"""
299380
shift = 4 - self.prefix_length % 4 # shift to align at nibble boundary
300381
all_ones = (1 << shift) - 1
@@ -306,7 +387,7 @@ def __str__(self) -> str:
306387
def hex(i):
307388
return format(i, f'0{hex_len}x')[:hex_len]
308389

309-
return '-'.join(map(hex, (lo, hi)))
390+
return f'{hex(lo)}-{hex(hi)}@{self.group}'
310391

311392

312393
def uuid5_for_bytes(namespace: UUID, name: bytes) -> UUID:

0 commit comments

Comments
 (0)