Skip to content

Commit cae4c4d

Browse files
committed
[3/N][VQueues] Partition Store Implementation
1 parent 92253d3 commit cae4c4d

File tree

15 files changed

+1529
-4
lines changed

15 files changed

+1529
-4
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/partition-store/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ restate-types = { workspace = true }
2323

2424
ahash = { workspace = true }
2525
anyhow = { workspace = true }
26+
bilrost = { workspace = true }
2627
bytes = { workspace = true }
2728
bytestring = { workspace = true }
2829
codederror = { workspace = true }

crates/partition-store/src/keys.rs

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@ use anyhow::anyhow;
1414
use bytes::{Buf, BufMut, Bytes, BytesMut};
1515
use bytestring::ByteString;
1616
use prost::encoding::encoded_len_varint;
17+
use rocksdb::MergeOperands;
1718
use strum::EnumIter;
19+
use tracing::{error, trace};
20+
21+
use restate_types::clock::UniqueTimestamp;
1822

1923
/// Every table key needs to have a key kind. This allows to multiplex different keys in the same
2024
/// column family and to evolve a key if necessary.
@@ -42,6 +46,17 @@ pub enum KeyKind {
4246
State,
4347
Timers,
4448
Promise,
49+
// VQueues --> owned by restate-vqueues
50+
//
51+
// todo: split this into empty and non-empty, or add the status in the key prefix
52+
// for instance, make this VQueueStatus (S | VQueueId)
53+
// or have empty_vqueues that carry the empty_since in their key prefix. Note that
54+
// doing so would require that we know the empty_since when we attempt to delete it
55+
VQueueActive,
56+
VQueueInbox,
57+
VQueueMeta,
58+
// Resources' canonical key(s)
59+
VQueueEntryState,
4560
}
4661

4762
impl KeyKind {
@@ -86,6 +101,13 @@ impl KeyKind {
86101
KeyKind::State => b"st",
87102
KeyKind::Timers => b"ti",
88103
KeyKind::Promise => b"pr",
104+
// ** VQueues ** //
105+
// VQueues own all keys that start with b"q".
106+
KeyKind::VQueueActive => b"qa",
107+
KeyKind::VQueueInbox => b"qi",
108+
KeyKind::VQueueMeta => b"qm",
109+
// Queue Entry State (canonical state of vqueue entries)
110+
KeyKind::VQueueEntryState => b"qe",
89111
}
90112
}
91113

@@ -115,6 +137,11 @@ impl KeyKind {
115137
b"st" => Some(KeyKind::State),
116138
b"ti" => Some(KeyKind::Timers),
117139
b"pr" => Some(KeyKind::Promise),
140+
// VQueues own all keys that start with b"q"
141+
b"qa" => Some(KeyKind::VQueueActive),
142+
b"qi" => Some(KeyKind::VQueueInbox),
143+
b"qm" => Some(KeyKind::VQueueMeta),
144+
b"qe" => Some(KeyKind::VQueueEntryState),
118145
_ => None,
119146
}
120147
}
@@ -134,6 +161,50 @@ impl KeyKind {
134161
Self::from_bytes(&bytes)
135162
.ok_or_else(|| StorageError::Generic(anyhow::anyhow!("unknown key kind: {:x?}", bytes)))
136163
}
164+
165+
// Rocksdb merge operator function (full merge)
166+
pub fn full_merge(
167+
key: &[u8],
168+
existing_val: Option<&[u8]>,
169+
operands: &MergeOperands,
170+
) -> Option<Vec<u8>> {
171+
let mut kind_buf = key;
172+
let kind = match KeyKind::deserialize(&mut kind_buf) {
173+
Ok(kind) => kind,
174+
Err(e) => {
175+
error!("Cannot apply merge operator; {e}");
176+
return None;
177+
}
178+
};
179+
trace!(?kind, "full merge");
180+
181+
match kind {
182+
KeyKind::VQueueMeta => vqueue_meta_merge::full_merge(key, existing_val, operands),
183+
_ => None,
184+
}
185+
}
186+
187+
// Rocksdb merge operator function (partial merge)
188+
pub fn partial_merge(
189+
key: &[u8],
190+
_unused: Option<&[u8]>,
191+
operands: &MergeOperands,
192+
) -> Option<Vec<u8>> {
193+
let mut kind_buf = key;
194+
let kind = match KeyKind::deserialize(&mut kind_buf) {
195+
Ok(kind) => kind,
196+
Err(e) => {
197+
error!("Cannot apply merge operator; {e}");
198+
return None;
199+
}
200+
};
201+
trace!(?kind, "partial merge");
202+
203+
match kind {
204+
KeyKind::VQueueMeta => vqueue_meta_merge::partial_merge(key, operands),
205+
_ => None,
206+
}
207+
}
137208
}
138209

139210
pub trait TableKey: Sized + std::fmt::Debug + Send + 'static {
@@ -350,6 +421,7 @@ macro_rules! define_table_key {
350421

351422
use crate::PaddedPartitionId;
352423
use crate::TableKind;
424+
use crate::vqueue_table::vqueue_meta_merge;
353425
pub(crate) use define_table_key;
354426
use restate_storage_api::StorageError;
355427
use restate_storage_api::deduplication_table::ProducerId;
@@ -411,6 +483,41 @@ impl KeyCodec for PaddedPartitionId {
411483
}
412484
}
413485

486+
impl KeyCodec for UniqueTimestamp {
487+
fn encode<B: BufMut>(&self, target: &mut B) {
488+
// store u64 in big-endian order to support byte-wise increment operation. See `crate::scan::try_increment`.
489+
target.put_u64(self.as_u64());
490+
}
491+
492+
fn decode<B: Buf>(source: &mut B) -> crate::Result<Self> {
493+
UniqueTimestamp::try_from(source.get_u64()).map_err(|e| StorageError::Conversion(e.into()))
494+
}
495+
496+
fn serialized_length(&self) -> usize {
497+
std::mem::size_of::<Self>()
498+
}
499+
}
500+
501+
impl<const L: usize> KeyCodec for [u8; L] {
502+
fn encode<B: BufMut>(&self, target: &mut B) {
503+
// stores the array as is.
504+
target.put_slice(self.as_ref());
505+
}
506+
507+
fn decode<B: Buf>(source: &mut B) -> crate::Result<Self> {
508+
if source.remaining() < L {
509+
return Err(StorageError::DataIntegrityError);
510+
}
511+
let mut buf = [0u8; L];
512+
source.copy_to_slice(&mut buf);
513+
Ok(buf)
514+
}
515+
516+
fn serialized_length(&self) -> usize {
517+
L
518+
}
519+
}
520+
414521
impl KeyCodec for u64 {
415522
fn encode<B: BufMut>(&self, target: &mut B) {
416523
// store u64 in big-endian order to support byte-wise increment operation. See `crate::scan::try_increment`.

crates/partition-store/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ pub mod service_status_table;
3232
pub mod snapshots;
3333
pub mod state_table;
3434
pub mod timer_table;
35+
pub mod vqueue_table;
3536

3637
#[cfg(test)]
3738
mod tests;

crates/partition-store/src/partition_db.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ use restate_types::config::Configuration;
2828
use restate_types::logs::Lsn;
2929
use restate_types::partitions::{CfName, Partition};
3030

31+
use crate::TableKind;
3132
use crate::durable_lsn_tracking::{AppliedLsnCollectorFactory, DurableLsnEventListener};
33+
use crate::keys::KeyKind;
3234
use crate::memory::MemoryBudget;
3335
use crate::snapshots::LocalPartitionSnapshot;
3436

@@ -80,6 +82,10 @@ impl PartitionDb {
8082
&self.cf.0
8183
}
8284

85+
pub(crate) fn table_cf_handle(&self, _table_kind: TableKind) -> &Arc<BoundColumnFamily<'_>> {
86+
self.cf_handle()
87+
}
88+
8389
pub fn cf_names(&self) -> Vec<SmartString> {
8490
vec![self.meta.cf_name().into_inner()]
8591
}
@@ -515,6 +521,12 @@ impl CfConfigurator for RocksConfigurator<AllDataCf> {
515521
Some(global_cache),
516522
);
517523
cf_options.set_block_based_table_factory(&block_options);
524+
cf_options.set_merge_operator(
525+
"PartitionMerge",
526+
KeyKind::full_merge,
527+
KeyKind::partial_merge,
528+
);
529+
cf_options.set_max_successive_merges(100);
518530

519531
// Actually, we would love to use CappedPrefixExtractor but unfortunately it's neither exposed
520532
// in the C API nor the rust binding. That's okay and we can change it later.

crates/partition-store/src/partition_store.rs

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ pub enum TableKind {
139139
Journal,
140140
JournalEvent,
141141
Promise,
142+
VQueue,
142143
}
143144

144145
impl TableKind {
@@ -162,6 +163,12 @@ impl TableKind {
162163
],
163164
Self::JournalEvent => &[KeyKind::JournalEvent],
164165
Self::Promise => &[KeyKind::Promise],
166+
Self::VQueue => &[
167+
KeyKind::VQueueMeta,
168+
KeyKind::VQueueInbox,
169+
KeyKind::VQueueActive,
170+
KeyKind::VQueueEntryState,
171+
],
165172
}
166173
}
167174

@@ -250,8 +257,8 @@ impl PartitionStore {
250257
self.db.partition().key_range.contains(&key)
251258
}
252259

253-
fn table_handle(&self, _table_kind: TableKind) -> &Arc<BoundColumnFamily<'_>> {
254-
self.db.cf_handle()
260+
pub(crate) fn table_handle(&self, table_kind: TableKind) -> &Arc<BoundColumnFamily<'_>> {
261+
self.db.table_cf_handle(table_kind)
255262
}
256263

257264
fn new_prefix_iterator_opts(&self, _key_kind: KeyKind, prefix: Bytes) -> ReadOptions {
@@ -822,6 +829,34 @@ impl PartitionStoreTransaction<'_> {
822829
opts
823830
}
824831

832+
#[inline]
833+
pub fn raw_put_cf(
834+
&mut self,
835+
_key_kind: KeyKind,
836+
key: impl AsRef<[u8]>,
837+
value: impl AsRef<[u8]>,
838+
) {
839+
self.write_batch_with_index
840+
.put_cf(self.data_cf_handle, key, value);
841+
}
842+
843+
#[inline]
844+
pub fn raw_merge_cf(
845+
&mut self,
846+
_key_kind: KeyKind,
847+
key: impl AsRef<[u8]>,
848+
value: impl AsRef<[u8]>,
849+
) {
850+
self.write_batch_with_index
851+
.merge_cf(self.data_cf_handle, key, value);
852+
}
853+
854+
#[inline]
855+
pub fn raw_delete_cf(&mut self, _key_kind: KeyKind, key: impl AsRef<[u8]>) {
856+
self.write_batch_with_index
857+
.delete_cf(self.data_cf_handle, key);
858+
}
859+
825860
pub(crate) fn prefix_iterator(
826861
&self,
827862
table: TableKind,

0 commit comments

Comments
 (0)