diff --git a/Cargo.lock b/Cargo.lock index a384d7601f..c92b6f74d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6872,6 +6872,7 @@ dependencies = [ "restate-bifrost", "restate-core", "restate-errors", + "restate-ingestion-client", "restate-metadata-store", "restate-service-client", "restate-service-protocol", @@ -7268,6 +7269,27 @@ dependencies = [ "tower 0.5.2", ] +[[package]] +name = "restate-ingestion-client" +version = "1.6.0-dev" +dependencies = [ + "arc-swap", + "bytes", + "dashmap", + "futures", + "googletest", + "pin-project-lite", + "restate-core", + "restate-types", + "restate-workspace-hack", + "test-log", + "thiserror 2.0.17", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", +] + [[package]] name = "restate-ingress-http" version = "1.6.0-dev" @@ -7325,13 +7347,14 @@ dependencies = [ "base64 0.22.1", "bytes", "derive_more", + "futures", "metrics", "opentelemetry", "opentelemetry_sdk", "parking_lot", "rdkafka", - "restate-bifrost", "restate-core", + "restate-ingestion-client", "restate-serde-util", "restate-storage-api", "restate-timer-queue", @@ -7344,6 +7367,7 @@ dependencies = [ "tokio", "tracing", "tracing-opentelemetry", + "xxhash-rust", ] [[package]] @@ -7683,6 +7707,7 @@ dependencies = [ "restate-bifrost", "restate-core", "restate-futures-util", + "restate-ingestion-client", "restate-ingress-http", "restate-log-server", "restate-metadata-providers", @@ -7696,6 +7721,7 @@ dependencies = [ "restate-storage-query-datafusion", "restate-tracing-instrumentation", "restate-types", + "restate-wal-protocol", "restate-worker", "restate-workspace-hack", "rust-rocksdb", @@ -8338,6 +8364,7 @@ dependencies = [ "restate-bifrost", "restate-core", "restate-errors", + "restate-ingestion-client", "restate-ingress-http", "restate-ingress-kafka", "restate-invoker-api", @@ -8518,6 +8545,7 @@ dependencies = [ "typenum", "ulid", "uuid", + "xxhash-rust", "zerocopy 0.7.35", "zeroize", "zstd 0.13.2", @@ -11302,8 +11330,8 @@ dependencies = [ "anyhow", "reqwest", "restate-admin", - "restate-bifrost", "restate-core", + "restate-ingestion-client", "restate-metadata-server", "restate-service-client", "restate-service-protocol", diff --git a/Cargo.toml b/Cargo.toml index 24d0e13f3e..17a61ab4b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -86,6 +86,7 @@ restate-utoipa = { path = "crates/utoipa" } restate-vqueues = { path = "crates/vqueues" } restate-wal-protocol = { path = "crates/wal-protocol" } restate-worker = { path = "crates/worker" } +restate-ingestion-client = { path = "crates/ingestion-client" } # this workspace-hack package is overridden by a patch below to use workspace-hack subdir when building in this repo # outside this repo, the crates.io restate-workspace-hack (an empty package) will be used instead diff --git a/crates/admin/Cargo.toml b/crates/admin/Cargo.toml index 9227ef9510..7b2eeb1af2 100644 --- a/crates/admin/Cargo.toml +++ b/crates/admin/Cargo.toml @@ -22,6 +22,7 @@ restate-admin-rest-model = { workspace = true, features = ["schema"] } restate-bifrost = { workspace = true, features = ["local-loglet", "replicated-loglet"] } restate-core = { workspace = true } restate-errors = { workspace = true } +restate-ingestion-client = { workspace = true } restate-metadata-store = { workspace = true } restate-service-client = { workspace = true } restate-service-protocol = { workspace = true, features = ["discovery"] } diff --git a/crates/admin/src/rest_api/deployments.rs b/crates/admin/src/rest_api/deployments.rs index ba6a750958..02e70dbd0a 100644 --- 
a/crates/admin/src/rest_api/deployments.rs +++ b/crates/admin/src/rest_api/deployments.rs @@ -58,8 +58,8 @@ use serde::Deserialize; from_type = "MetaApiError", ) )] -pub async fn create_deployment( - State(state): State>, +pub async fn create_deployment( + State(state): State>, Extension(version): Extension, #[request_body(required = true)] Json(payload): Json, ) -> Result @@ -188,8 +188,8 @@ where schema = "std::string::String" )) )] -pub async fn get_deployment( - State(state): State>, +pub async fn get_deployment( + State(state): State>, Path(deployment_id): Path, ) -> Result, MetaApiError> where @@ -210,8 +210,8 @@ where operation_id = "list_deployments", tags = "deployment" )] -pub async fn list_deployments( - State(state): State>, +pub async fn list_deployments( + State(state): State>, ) -> Json where Metadata: MetadataService, @@ -267,8 +267,8 @@ pub struct DeleteDeploymentParams { from_type = "MetaApiError", ) )] -pub async fn delete_deployment( - State(state): State>, +pub async fn delete_deployment( + State(state): State>, Path(deployment_id): Path, Query(DeleteDeploymentParams { force }): Query, ) -> Result @@ -302,8 +302,8 @@ where schema = "std::string::String" )) )] -pub async fn update_deployment( - State(state): State>, +pub async fn update_deployment( + State(state): State>, Extension(version): Extension, method: Method, Path(deployment_id): Path, diff --git a/crates/admin/src/rest_api/handlers.rs b/crates/admin/src/rest_api/handlers.rs index 7624d95934..9750d095cf 100644 --- a/crates/admin/src/rest_api/handlers.rs +++ b/crates/admin/src/rest_api/handlers.rs @@ -30,8 +30,8 @@ use restate_types::schema::service::HandlerMetadata; schema = "std::string::String" )) )] -pub async fn list_service_handlers( - State(state): State>, +pub async fn list_service_handlers( + State(state): State>, Path(service_name): Path, ) -> Result, MetaApiError> where @@ -62,8 +62,8 @@ where ) ) )] -pub async fn get_service_handler( - State(state): State>, +pub async fn get_service_handler( + State(state): State>, Path((service_name, handler_name)): Path<(String, String)>, ) -> Result, MetaApiError> where diff --git a/crates/admin/src/rest_api/invocations.rs b/crates/admin/src/rest_api/invocations.rs index 42446a69c7..43518e7d62 100644 --- a/crates/admin/src/rest_api/invocations.rs +++ b/crates/admin/src/rest_api/invocations.rs @@ -8,121 +8,23 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
-use super::error::*; -use crate::generate_meta_api_error; -use crate::rest_api::create_envelope_header; -use crate::state::AdminServiceState; use axum::Json; use axum::extract::{Path, Query, State}; use axum::http::StatusCode; use okapi_operation::*; +use serde::Deserialize; + use restate_admin_rest_model::invocations::RestartAsNewInvocationResponse; -use restate_types::identifiers::{ - DeploymentId, InvocationId, PartitionProcessorRpcRequestId, WithPartitionKey, -}; +use restate_types::identifiers::{DeploymentId, InvocationId, PartitionProcessorRpcRequestId}; use restate_types::invocation::client::{ self, CancelInvocationResponse, InvocationClient, KillInvocationResponse, PauseInvocationResponse, PurgeInvocationResponse, ResumeInvocationResponse, }; -use restate_types::invocation::{InvocationTermination, PurgeInvocationRequest, TerminationFlavor}; use restate_types::journal_v2::EntryIndex; -use restate_wal_protocol::{Command, Envelope}; -use serde::Deserialize; -use std::sync::Arc; -use tracing::warn; -#[derive(Debug, Default, Deserialize, JsonSchema)] -pub enum DeletionMode { - #[default] - #[serde(alias = "cancel")] - Cancel, - #[serde(alias = "kill")] - Kill, - #[serde(alias = "purge")] - Purge, -} -#[derive(Debug, Default, Deserialize, JsonSchema)] -pub struct DeleteInvocationParams { - pub mode: Option, -} - -/// Terminate an invocation -#[openapi( - summary = "Delete an invocation", - deprecated = true, - description = "Use kill_invocation/cancel_invocation/purge_invocation instead.", - operation_id = "delete_invocation", - tags = "invocation", - parameters( - path( - name = "invocation_id", - description = "Invocation identifier.", - schema = "std::string::String" - ), - query( - name = "mode", - description = "If cancel, it will gracefully terminate the invocation. \ - If kill, it will terminate the invocation with a hard stop. 
\ - If purge, it will only cleanup the response for completed invocations, and leave unaffected an in-flight invocation.", - required = false, - style = "simple", - allow_empty_value = false, - schema = "DeletionMode", - ) - ), - responses( - ignore_return_type = true, - response( - status = "202", - description = "Accepted", - content = "okapi_operation::Empty", - ), - from_type = "MetaApiError", - ) -)] -pub async fn delete_invocation( - State(state): State>, - Path(invocation_id): Path, - Query(DeleteInvocationParams { mode }): Query, -) -> Result { - let invocation_id = invocation_id - .parse::() - .map_err(|e| MetaApiError::InvalidField("invocation_id", e.to_string()))?; - - let cmd = match mode.unwrap_or_default() { - DeletionMode::Cancel => Command::TerminateInvocation(InvocationTermination { - invocation_id, - flavor: TerminationFlavor::Cancel, - response_sink: None, - }), - DeletionMode::Kill => Command::TerminateInvocation(InvocationTermination { - invocation_id, - flavor: TerminationFlavor::Kill, - response_sink: None, - }), - DeletionMode::Purge => Command::PurgeInvocation(PurgeInvocationRequest { - invocation_id, - response_sink: None, - }), - }; - - let partition_key = invocation_id.partition_key(); - - let result = restate_bifrost::append_to_bifrost( - &state.bifrost, - Arc::new(Envelope::new(create_envelope_header(partition_key), cmd)), - ) - .await; - - if let Err(err) = result { - warn!("Could not append invocation termination command to Bifrost: {err}"); - Err(MetaApiError::Internal( - "Failed sending invocation termination to the cluster.".to_owned(), - )) - } else { - Ok(StatusCode::ACCEPTED) - } -} +use super::error::*; +use crate::generate_meta_api_error; +use crate::state::AdminServiceState; generate_meta_api_error!(KillInvocationError: [InvocationNotFoundError, InvocationClientError, InvalidFieldError, InvocationWasAlreadyCompletedError]); @@ -139,8 +41,8 @@ generate_meta_api_error!(KillInvocationError: [InvocationNotFoundError, Invocati schema = "std::string::String" )) )] -pub async fn kill_invocation( - State(state): State>, +pub async fn kill_invocation( + State(state): State>, Path(invocation_id): Path, ) -> Result<(), KillInvocationError> where @@ -199,8 +101,8 @@ generate_meta_api_error!(CancelInvocationError: [InvocationNotFoundError, Invoca from_type = "CancelInvocationError", ) )] -pub async fn cancel_invocation( - State(state): State>, +pub async fn cancel_invocation( + State(state): State>, Path(invocation_id): Path, ) -> Result where @@ -241,8 +143,8 @@ generate_meta_api_error!(PurgeInvocationError: [InvocationNotFoundError, Invocat schema = "std::string::String" )) )] -pub async fn purge_invocation( - State(state): State>, +pub async fn purge_invocation( + State(state): State>, Path(invocation_id): Path, ) -> Result<(), PurgeInvocationError> where @@ -284,8 +186,8 @@ generate_meta_api_error!(PurgeJournalError: [InvocationNotFoundError, Invocation schema = "std::string::String" )) )] -pub async fn purge_journal( - State(state): State>, +pub async fn purge_journal( + State(state): State>, Path(invocation_id): Path, ) -> Result<(), PurgeJournalError> where @@ -398,8 +300,8 @@ generate_meta_api_error!(RestartInvocationError: [ ), ) )] -pub async fn restart_as_new_invocation( - State(state): State>, +pub async fn restart_as_new_invocation( + State(state): State>, Path(invocation_id): Path, Query(RestartAsNewInvocationQueryParams { from, deployment }): Query< RestartAsNewInvocationQueryParams, @@ -510,8 +412,8 @@ 
generate_meta_api_error!(ResumeInvocationError: [ ) ) )] -pub async fn resume_invocation( - State(state): State>, +pub async fn resume_invocation( + State(state): State>, Path(invocation_id): Path, Query(ResumeInvocationQueryParams { deployment }): Query, ) -> Result<(), ResumeInvocationError> @@ -596,8 +498,8 @@ generate_meta_api_error!(PauseInvocationError: [ from_type = "PauseInvocationError", ) )] -pub async fn pause_invocation( - State(state): State>, +pub async fn pause_invocation( + State(state): State>, Path(invocation_id): Path, ) -> Result where diff --git a/crates/admin/src/rest_api/mod.rs b/crates/admin/src/rest_api/mod.rs index 3dda5102e6..191534723f 100644 --- a/crates/admin/src/rest_api/mod.rs +++ b/crates/admin/src/rest_api/mod.rs @@ -23,6 +23,7 @@ mod version; use okapi_operation::axum_integration::{delete, get, patch, post}; use okapi_operation::okapi::openapi3::{ExternalDocs, Tag}; use okapi_operation::*; +use restate_core::network::TransportConnect; use restate_types::identifiers::PartitionKey; use restate_types::invocation::client::InvocationClient; use restate_types::schema::registry::{DiscoveryClient, MetadataService, TelemetryClient}; @@ -32,14 +33,15 @@ use crate::state::AdminServiceState; pub use version::{MAX_ADMIN_API_VERSION, MIN_ADMIN_API_VERSION}; -pub fn create_router( - state: AdminServiceState, +pub fn create_router( + state: AdminServiceState, ) -> axum::Router<()> where Metadata: MetadataService + Send + Sync + Clone + 'static, Discovery: DiscoveryClient + Send + Sync + Clone + 'static, Telemetry: TelemetryClient + Send + Sync + Clone + 'static, Invocations: InvocationClient + Send + Sync + Clone + 'static, + Transport: TransportConnect, { let mut router = axum_integration::Router::new() .route( @@ -91,10 +93,6 @@ where "/services/{service}/handlers/{handler}", get(openapi_handler!(handlers::get_service_handler)), ) - .route( - "/invocations/{invocation_id}", - delete(openapi_handler!(invocations::delete_invocation)), - ) .route( "/invocations/{invocation_id}/kill", patch(openapi_handler!(invocations::kill_invocation)), diff --git a/crates/admin/src/rest_api/services.rs b/crates/admin/src/rest_api/services.rs index 93664e7b7f..2c45a96ab8 100644 --- a/crates/admin/src/rest_api/services.rs +++ b/crates/admin/src/rest_api/services.rs @@ -8,7 +8,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
-use std::sync::Arc; use tracing::{debug, warn}; use axum::Json; @@ -20,6 +19,7 @@ use okapi_operation::*; use restate_admin_rest_model::services::ListServicesResponse; use restate_admin_rest_model::services::*; use restate_core::TaskCenter; +use restate_core::network::TransportConnect; use restate_errors::warn_it; use restate_types::config::Configuration; use restate_types::identifiers::{ServiceId, WithPartitionKey}; @@ -40,8 +40,8 @@ use crate::state::AdminServiceState; operation_id = "list_services", tags = "service" )] -pub async fn list_services( - State(state): State>, +pub async fn list_services( + State(state): State>, ) -> Result, MetaApiError> where Metadata: MetadataService, @@ -63,8 +63,8 @@ where schema = "std::string::String" )) )] -pub async fn get_service( - State(state): State>, +pub async fn get_service( + State(state): State>, Path(service_name): Path, ) -> Result, MetaApiError> where @@ -98,8 +98,8 @@ where from_type = "MetaApiError", ) )] -pub async fn get_service_openapi( - State(state): State>, +pub async fn get_service_openapi( + State(state): State>, Path(service_name): Path, ) -> Result, MetaApiError> where @@ -131,8 +131,8 @@ where schema = "std::string::String" )) )] -pub async fn modify_service( - State(state): State>, +pub async fn modify_service( + State(state): State>, Path(service_name): Path, #[request_body(required = true)] Json(ModifyServiceRequest { public, @@ -196,8 +196,8 @@ where from_type = "MetaApiError", ) )] -pub async fn modify_service_state( - State(state): State>, +pub async fn modify_service_state( + State(state): State>, Path(service_name): Path, #[request_body(required = true)] Json(ModifyServiceStateRequest { version, @@ -207,6 +207,7 @@ pub async fn modify_service_state( ) -> Result where Metadata: MetadataService, + Transport: TransportConnect, { if let Some(svc) = state.schema_registry.get_service(&service_name) { if !svc.ty.has_state() { @@ -236,16 +237,23 @@ where state: new_state, }; - let result = restate_bifrost::append_to_bifrost( - &state.bifrost, - Arc::new(Envelope::new( - create_envelope_header(partition_key), - Command::PatchState(patch_state), - )), - ) - .await; + let envelope = Envelope::new( + create_envelope_header(partition_key), + Command::PatchState(patch_state), + ); + + let result = state + .ingestion_client + .ingest(partition_key, envelope) + .await + .map_err(|err| { + warn!("Could not append state patching command to Bifrost: {err}"); + MetaApiError::Internal( + "Failed sending state patching command to the cluster.".to_owned(), + ) + })?; - if let Err(err) = result { + if let Err(err) = result.await { warn!("Could not append state patching command to Bifrost: {err}"); Err(MetaApiError::Internal( "Failed sending state patching command to the cluster.".to_owned(), diff --git a/crates/admin/src/rest_api/subscriptions.rs b/crates/admin/src/rest_api/subscriptions.rs index d6b5dac634..0159fa84d3 100644 --- a/crates/admin/src/rest_api/subscriptions.rs +++ b/crates/admin/src/rest_api/subscriptions.rs @@ -42,8 +42,8 @@ use restate_types::schema::registry::MetadataService; from_type = "MetaApiError", ) )] -pub async fn create_subscription( - State(state): State>, +pub async fn create_subscription( + State(state): State>, #[request_body(required = true)] Json(payload): Json, ) -> Result where @@ -77,8 +77,8 @@ where schema = "std::string::String" )) )] -pub async fn get_subscription( - State(state): State>, +pub async fn get_subscription( + State(state): State>, Path(subscription_id): Path, ) -> Result, MetaApiError> 
where @@ -117,8 +117,8 @@ where ) ) )] -pub async fn list_subscriptions( - State(state): State>, +pub async fn list_subscriptions( + State(state): State>, Query(ListSubscriptionsParams { sink, source }): Query, ) -> Json where @@ -168,8 +168,8 @@ where from_type = "MetaApiError", ) )] -pub async fn delete_subscription( - State(state): State>, +pub async fn delete_subscription( + State(state): State>, Path(subscription_id): Path, ) -> Result where diff --git a/crates/admin/src/service.rs b/crates/admin/src/service.rs index 30ee35533d..edbdf0f43a 100644 --- a/crates/admin/src/service.rs +++ b/crates/admin/src/service.rs @@ -12,14 +12,15 @@ use std::time::Duration; use axum::error_handling::HandleErrorLayer; use http::{Request, Response, StatusCode}; +use restate_ingestion_client::IngestionClient; +use restate_wal_protocol::Envelope; use tower::ServiceBuilder; use tower_http::classify::ServerErrorsFailureClass; use tower_http::trace::TraceLayer; use tracing::{Span, debug, error, info, info_span}; use restate_admin_rest_model::version::AdminApiVersion; -use restate_bifrost::Bifrost; -use restate_core::network::net_util; +use restate_core::network::{TransportConnect, net_util}; use restate_core::{MetadataWriter, TaskCenter}; use restate_service_client::HttpClient; use restate_service_protocol::discovery::ServiceDiscovery; @@ -39,9 +40,9 @@ use crate::{rest_api, state}; #[error("could not create the service client: {0}")] pub struct BuildError(#[from] restate_service_client::BuildError); -pub struct AdminService { +pub struct AdminService { listeners: Listeners, - bifrost: Bifrost, + ingestion_client: IngestionClient, schema_registry: SchemaRegistry, invocation_client: Invocations, #[cfg(feature = "storage-query")] @@ -50,21 +51,23 @@ pub struct AdminService { metadata_writer: MetadataWriter, } -impl AdminService +impl + AdminService where Invocations: InvocationClient + Send + Sync + Clone + 'static, + Transport: TransportConnect, { pub fn new( listeners: Listeners, metadata_writer: MetadataWriter, - bifrost: Bifrost, + ingestion_client: IngestionClient, invocation_client: Invocations, service_discovery: ServiceDiscovery, telemetry_http_client: Option, ) -> Self { Self { listeners, - bifrost, + ingestion_client, #[cfg(feature = "metadata-api")] metadata_writer: metadata_writer.clone(), schema_registry: SchemaRegistry::new( @@ -98,7 +101,7 @@ where let rest_state = state::AdminServiceState::new( self.schema_registry, self.invocation_client, - self.bifrost, + self.ingestion_client, ); let router = axum::Router::new(); diff --git a/crates/admin/src/state.rs b/crates/admin/src/state.rs index a08cfedbbc..4a8cbbffaa 100644 --- a/crates/admin/src/state.rs +++ b/crates/admin/src/state.rs @@ -8,28 +8,32 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
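// The `modify_service_state` hunk above replaces the direct `append_to_bifrost` call with the
// new ingestion client and awaits twice. A condensed, hedged sketch of that pattern, using only
// signatures visible in this patch; `enqueue_and_commit`, the `anyhow` error handling, and the
// type-parameter order `IngestionClient<T, Envelope>` are illustrative assumptions, not code
// from the change:
use restate_core::network::TransportConnect;
use restate_ingestion_client::IngestionClient;
use restate_types::identifiers::PartitionKey;
use restate_wal_protocol::Envelope;

async fn enqueue_and_commit<T: TransportConnect>(
    client: &IngestionClient<T, Envelope>,
    partition_key: PartitionKey,
    envelope: Envelope,
) -> anyhow::Result<()> {
    // First await: reserve memory budget and hand the envelope to the per-partition session;
    // this fails fast if the client is closed or the partition key cannot be resolved.
    let commit = client.ingest(partition_key, envelope).await?;
    // Second await: the returned `RecordCommit` resolves once the partition processor has
    // acknowledged the batch containing this record.
    commit.await?;
    Ok(())
}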
-use restate_bifrost::Bifrost; +use restate_core::network::TransportConnect; +use restate_ingestion_client::IngestionClient; use restate_types::schema::registry::SchemaRegistry; +use restate_wal_protocol::Envelope; #[derive(Clone, derive_builder::Builder)] -pub struct AdminServiceState { +pub struct AdminServiceState { pub schema_registry: SchemaRegistry, pub invocation_client: Invocations, - pub bifrost: Bifrost, + pub ingestion_client: IngestionClient, } -impl - AdminServiceState +impl + AdminServiceState +where + Transport: TransportConnect, { pub fn new( schema_registry: SchemaRegistry, invocation_client: Invocations, - bifrost: Bifrost, + ingestion_client: IngestionClient, ) -> Self { Self { schema_registry, invocation_client, - bifrost, + ingestion_client, } } } diff --git a/crates/bifrost/src/background_appender.rs b/crates/bifrost/src/background_appender.rs index 5ad0bbe1db..5d8142d524 100644 --- a/crates/bifrost/src/background_appender.rs +++ b/crates/bifrost/src/background_appender.rs @@ -8,12 +8,10 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use std::sync::Arc; - use futures::FutureExt; use pin_project::pin_project; use restate_types::logs::Record; -use tokio::sync::{Notify, mpsc, oneshot}; +use tokio::sync::{mpsc, oneshot}; use tracing::{trace, warn}; use restate_core::{ShutdownError, TaskCenter, TaskHandle, cancellation_watcher}; @@ -130,8 +128,8 @@ where batch.push(record); notif_buffer.push(tx); } - AppendOperation::Canary(notify) => { - notify.notify_one(); + AppendOperation::Canary(tx) => { + notif_buffer.push(tx); } AppendOperation::MarkAsPreferred => { appender.mark_as_preferred(); @@ -353,23 +351,19 @@ impl LogSender { Ok(CommitToken { rx }) } - /// Wait for previously enqueued records to be committed - /// - /// Not cancellation safe. Every call will attempt to acquire capacity on the channel and send - /// a new message to the appender. - pub async fn notify_committed(&self) -> Result<(), EnqueueError<()>> { + /// Returns a [`CommitToken`] that is resolved once all previously enqueued records are committed. + pub async fn notify_committed(&self) -> Result> { let Ok(permit) = self.tx.reserve().await else { // channel is closed, this should happen the appender is draining or has been darained // already return Err(EnqueueError::Closed(())); }; - let notify = Arc::new(Notify::new()); - let canary = AppendOperation::Canary(notify.clone()); + let (tx, rx) = oneshot::channel(); + let canary = AppendOperation::Canary(tx); permit.send(canary); - notify.notified().await; - Ok(()) + Ok(CommitToken { rx }) } /// Marks this node as a preferred writer for the underlying log @@ -422,7 +416,7 @@ enum AppendOperation { EnqueueWithNotification(Record, oneshot::Sender<()>), // A message denoting a request to be notified when it's processed by the appender. 
// It's used to check if previously enqueued appends have been committed or not - Canary(Arc), + Canary(oneshot::Sender<()>), /// Let's bifrost know that this node is the preferred writer of this log MarkAsPreferred, /// Let's bifrost know that this node might not be the preferred writer of this log diff --git a/crates/bifrost/src/record.rs b/crates/bifrost/src/record.rs index aba7d1aed6..48a71d7136 100644 --- a/crates/bifrost/src/record.rs +++ b/crates/bifrost/src/record.rs @@ -12,6 +12,7 @@ use core::str; use std::marker::PhantomData; use std::sync::Arc; +use bytes::Bytes; use restate_types::logs::{BodyWithKeys, HasRecordKeys, Keys, Lsn, Record}; use restate_types::logs::{LogletOffset, SequenceNumber}; use restate_types::storage::{PolyBytes, StorageDecode, StorageDecodeError, StorageEncode}; @@ -200,29 +201,19 @@ pub struct Gap { pub to: S, } +#[derive(Clone)] pub struct InputRecord { created_at: NanosSinceEpoch, keys: Keys, - body: Arc, + body: PolyBytes, _phantom: PhantomData, } -impl Clone for InputRecord { - fn clone(&self) -> Self { - Self { - created_at: self.created_at, - keys: self.keys.clone(), - body: Arc::clone(&self.body), - _phantom: self._phantom, - } - } -} - // This is a zero-cost transformation. The type is erased at runtime, but the underlying // layout is identical. impl InputRecord { pub fn into_record(self) -> Record { - Record::from_parts(self.created_at, self.keys, PolyBytes::Typed(self.body)) + Record::from_parts(self.created_at, self.keys, self.body) } } @@ -231,7 +222,24 @@ impl InputRecord { Self { created_at, keys, - body, + body: PolyBytes::Typed(body), + _phantom: PhantomData, + } + } + + /// Builds an [`InputRecord`] directly from raw bytes without validating the payload. + /// + /// # Safety + /// Caller must guarantee the bytes are a correctly storage-encoded `T`. 
+ pub unsafe fn from_bytes_unchecked( + created_at: NanosSinceEpoch, + keys: Keys, + body: Bytes, + ) -> Self { + Self { + created_at, + keys, + body: PolyBytes::Bytes(body), _phantom: PhantomData, } } @@ -246,7 +254,7 @@ impl From> for InputRecord { InputRecord { created_at: NanosSinceEpoch::now(), keys: val.record_keys(), - body: val, + body: PolyBytes::Typed(val), _phantom: PhantomData, } } @@ -257,7 +265,7 @@ impl From for InputRecord { InputRecord { created_at: NanosSinceEpoch::now(), keys: Keys::None, - body: Arc::new(val), + body: PolyBytes::Typed(Arc::new(val)), _phantom: PhantomData, } } @@ -268,7 +276,7 @@ impl From<&str> for InputRecord { InputRecord { created_at: NanosSinceEpoch::now(), keys: Keys::None, - body: Arc::new(String::from(val)), + body: PolyBytes::Typed(Arc::new(String::from(val))), _phantom: PhantomData, } } @@ -279,7 +287,7 @@ impl From> for InputRecord { InputRecord { created_at: NanosSinceEpoch::now(), keys: val.record_keys(), - body: Arc::new(val.into_inner()), + body: PolyBytes::Typed(Arc::new(val.into_inner())), _phantom: PhantomData, } } diff --git a/crates/core/src/worker_api/partition_processor_rpc_client.rs b/crates/core/src/worker_api/partition_processor_rpc_client.rs index cc8ba97d70..c85d840533 100644 --- a/crates/core/src/worker_api/partition_processor_rpc_client.rs +++ b/crates/core/src/worker_api/partition_processor_rpc_client.rs @@ -71,10 +71,6 @@ pub enum RpcErrorKind { Busy, #[error("internal error: {0}")] Internal(String), - #[error("partition processor starting")] - Starting, - #[error("partition processor stopping")] - Stopping, } impl PartitionProcessorInvocationClientError { @@ -106,10 +102,8 @@ impl RpcError { match self.source { RpcErrorKind::Connect(_) | RpcErrorKind::NotLeader - | RpcErrorKind::Starting | RpcErrorKind::Busy - | RpcErrorKind::SendFailed - | RpcErrorKind::Stopping => { + | RpcErrorKind::SendFailed => { // These are pre-flight error that we can distinguish, // and for which we know for certain that no message was proposed yet to the log. 
true @@ -143,7 +137,7 @@ impl From for RpcErrorKind { RpcReplyError::ServiceNotFound | RpcReplyError::SortCodeNotFound => Self::NotLeader, RpcReplyError::LoadShedding => Self::Busy, RpcReplyError::ServiceNotReady => Self::Busy, - RpcReplyError::ServiceStopped => Self::Stopping, + RpcReplyError::ServiceStopped => Self::LostLeadership, } } } @@ -154,8 +148,6 @@ impl From for RpcErrorKind { PartitionProcessorRpcError::NotLeader(_) => RpcErrorKind::NotLeader, PartitionProcessorRpcError::LostLeadership(_) => RpcErrorKind::LostLeadership, PartitionProcessorRpcError::Internal(msg) => RpcErrorKind::Internal(msg), - PartitionProcessorRpcError::Starting => RpcErrorKind::Starting, - PartitionProcessorRpcError::Stopping => RpcErrorKind::Stopping, } } } diff --git a/crates/ingestion-client/Cargo.toml b/crates/ingestion-client/Cargo.toml new file mode 100644 index 0000000000..fff26f471f --- /dev/null +++ b/crates/ingestion-client/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "restate-ingestion-client" +version.workspace = true +authors.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +publish = false + +[dependencies] +arc-swap = { workspace = true } +dashmap = { workspace = true } +futures = { workspace = true } +pin-project-lite = { workspace = true } +thiserror = { workspace = true } +tokio-stream = { workspace = true } +tokio-util = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } + +restate-core = { workspace = true } +restate-types = { workspace = true } +restate-workspace-hack = { workspace = true } + +[dev-dependencies] +bytes = { workspace = true } +googletest = { workspace = true } +test-log = { workspace = true } + +restate-core = { workspace = true, features = ["test-util"] } diff --git a/crates/ingestion-client/src/chunks_size.rs b/crates/ingestion-client/src/chunks_size.rs new file mode 100644 index 0000000000..875b402e21 --- /dev/null +++ b/crates/ingestion-client/src/chunks_size.rs @@ -0,0 +1,105 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use tokio_stream::adapters::Fuse; +use tokio_stream::{Stream, StreamExt}; + +use core::pin::Pin; +use core::task::{Context, Poll}; +use pin_project_lite::pin_project; + +pin_project! { + #[must_use = "streams do nothing unless polled"] + #[derive(Debug)] + pub struct ChunksSize { + #[pin] + stream: Fuse, + items: Vec, + size: usize, + cap: usize, // https://github.com/rust-lang/futures-rs/issues/1475 + size_fn: F, + } +} + +impl ChunksSize +where + F: Fn(&S::Item) -> usize, +{ + pub fn new(stream: S, max_size: usize, size_fn: F) -> Self { + ChunksSize { + stream: stream.fuse(), + items: Vec::default(), + size: 0, + cap: max_size, + size_fn, + } + } + + /// Drains the buffered items, returning them without waiting for the timeout or capacity limit. 
+ pub fn into_remainder(self) -> Vec { + self.items + } +} + +impl Stream for ChunksSize +where + F: Fn(&S::Item) -> usize, +{ + type Item = Vec; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let mut me = self.as_mut().project(); + loop { + match me.stream.as_mut().poll_next(cx) { + Poll::Pending if me.items.is_empty() => return Poll::Pending, + Poll::Pending => { + *me.size = 0; + return Poll::Ready(Some(std::mem::take(me.items))); + } + Poll::Ready(Some(item)) => { + let item_size = (me.size_fn)(&item); + + if me.items.is_empty() || *me.size + item_size <= *me.cap { + *me.size += item_size; + me.items.push(item); + } else { + // not empty and adding the item will go over the cap + let items = std::mem::replace(me.items, vec![item]); + *me.size = item_size; + return Poll::Ready(Some(items)); + } + + if *me.size >= *me.cap { + *me.size = 0; + return Poll::Ready(Some(std::mem::take(me.items))); + } + } + Poll::Ready(None) => { + // Returning Some here is only correct because we fuse the inner stream. + let last = if me.items.is_empty() { + None + } else { + Some(std::mem::take(me.items)) + }; + + return Poll::Ready(last); + } + } + } + } + + fn size_hint(&self) -> (usize, Option) { + let chunk_len = if self.items.is_empty() { 0 } else { 1 }; + let (lower, upper) = self.stream.size_hint(); + let lower = (lower / self.cap).saturating_add(chunk_len); + let upper = upper.and_then(|x| x.checked_add(chunk_len)); + (lower, upper) + } +} diff --git a/crates/ingestion-client/src/client.rs b/crates/ingestion-client/src/client.rs new file mode 100644 index 0000000000..da667bcf85 --- /dev/null +++ b/crates/ingestion-client/src/client.rs @@ -0,0 +1,374 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::{ + marker::PhantomData, + num::NonZeroUsize, + sync::{Arc, atomic::AtomicUsize}, +}; + +use tokio::sync::Semaphore; + +use restate_core::{ + network::{Networking, TransportConnect}, + partitions::PartitionRouting, +}; +use restate_types::{ + identifiers::PartitionKey, + live::Live, + logs::{HasRecordKeys, Keys}, + net::ingest::IngestRecord, + partitions::{FindPartition, PartitionTable, PartitionTableError}, + storage::StorageEncode, +}; + +use crate::{RecordCommit, SessionOptions, session::SessionManager}; + +/// Errors that can be observed when interacting with the ingress facade. +#[derive(Debug, thiserror::Error)] +pub enum IngestionError { + #[error("Ingress closed")] + Closed, + #[error(transparent)] + PartitionTableError(#[from] PartitionTableError), +} + +/// High-level ingestion entry point that allocates permits and hands out session handles per partition. +/// [`IngestionClient`] can be cloned and shared across different routines. All users will share the same budget +/// and underlying partition sessions. +#[derive(Clone)] +pub struct IngestionClient { + manager: SessionManager, + partition_table: Live, + // memory budget for inflight invocations. + permits: Arc, + available_permits: Arc, + _phantom: PhantomData, +} + +impl IngestionClient { + /// Builds a new ingestion facade with the provided networking stack, partition metadata, and + /// budget (in bytes) for inflight records. 
+ pub fn new( + networking: Networking, + partition_table: Live, + partition_routing: PartitionRouting, + memory_budget: NonZeroUsize, + opts: Option, + ) -> Self { + Self { + manager: SessionManager::new(networking, partition_routing, opts), + partition_table, + permits: Arc::new(Semaphore::new(memory_budget.get())), + available_permits: Arc::new(AtomicUsize::new(memory_budget.get())), + _phantom: PhantomData, + } + } + + pub fn ensure_enough_permits(&self, required: usize) { + use std::sync::atomic::Ordering; + let mut available = self.available_permits.load(Ordering::Relaxed); + while available < required { + let delta = required - available; + match self.available_permits.compare_exchange( + available, + required, + Ordering::Release, + Ordering::Relaxed, + ) { + Ok(_) => { + self.permits.add_permits(delta); + } + Err(current) => { + available = current; + } + } + } + } +} + +impl IngestionClient +where + T: TransportConnect, + V: StorageEncode, +{ + /// Ingest a record with `partition_key`. + pub async fn ingest( + &self, + partition_key: PartitionKey, + record: impl Into>, + ) -> Result { + let record = record.into().into_record(); + + let size = record.estimate_size(); + self.ensure_enough_permits(size); + + let permit = self + .permits + .clone() + .acquire_many_owned(size as u32) + .await + .map_err(|_| IngestionError::Closed)?; + + let partition_id = self + .partition_table + .pinned() + .find_partition_id(partition_key)?; + + self.manager + .get(partition_id) + .ingest(permit, record) + .map_err(|_| IngestionError::Closed) + } + + /// Once closed, calls to ingest will return [`IngestionError::Closed`]. + /// Inflight records might still get committed. + pub fn close(&self) { + self.permits.close(); + self.manager.close(); + } +} + +pub struct InputRecord { + keys: Keys, + record: T, +} + +impl InputRecord +where + T: StorageEncode, +{ + fn into_record(self) -> IngestRecord { + IngestRecord::from_parts(self.keys, self.record) + } +} + +impl From for InputRecord +where + T: HasRecordKeys + StorageEncode, +{ + fn from(value: T) -> Self { + InputRecord { + keys: value.record_keys(), + record: value, + } + } +} + +impl InputRecord { + #[cfg(test)] + fn from_str(s: impl Into) -> Self { + InputRecord { + keys: Keys::None, + record: s.into(), + } + } +} + +#[cfg(test)] +mod test { + use std::{num::NonZeroUsize, time::Duration}; + + use futures::{FutureExt, StreamExt}; + use googletest::prelude::*; + use test_log::test; + + use restate_core::{ + Metadata, TaskCenter, TestCoreEnvBuilder, + network::{ + BackPressureMode, FailingConnector, Incoming, Rpc, ServiceMessage, ServiceStream, + }, + partitions::PartitionRouting, + }; + use restate_types::{ + Version, + identifiers::{LeaderEpoch, PartitionId}, + net::{ + self, RpcRequest, + ingest::{IngestResponse, ReceivedIngestRequest}, + partition_processor::PartitionLeaderService, + }, + partitions::{ + PartitionTable, + state::{LeadershipState, PartitionReplicaSetStates}, + }, + }; + + use crate::{CommitError, IngestionClient, SessionOptions, client::InputRecord}; + + async fn init_env( + batch_size: usize, + ) -> ( + ServiceStream, + IngestionClient, + ) { + let mut builder = TestCoreEnvBuilder::with_incoming_only_connector() + .add_mock_nodes_config() + .set_partition_table(PartitionTable::with_equally_sized_partitions( + Version::MIN, + 4, + )); + + let partition_replica_set_states = PartitionReplicaSetStates::default(); + for i in 0..4 { + partition_replica_set_states.note_observed_leader( + i.into(), + LeadershipState { + current_leader: 
builder.my_node_id, + current_leader_epoch: LeaderEpoch::INITIAL, + }, + ); + } + + let svc = builder + .router_builder + .register_service::( + 10, + BackPressureMode::PushBack, + ); + + let incoming = svc.start(); + + let env = builder.build().await; + let client = IngestionClient::new( + env.networking, + env.metadata.updateable_partition_table(), + PartitionRouting::new(partition_replica_set_states, TaskCenter::current()), + NonZeroUsize::new(10 * 1024 * 1024).unwrap(), // 10MB + SessionOptions { + batch_size, + ..Default::default() + } + .into(), + ); + + (incoming, client) + } + + async fn must_next( + recv: &mut ServiceStream, + ) -> Incoming> { + let Some(ServiceMessage::Rpc(msg)) = recv.next().await else { + panic!("stream closed"); + }; + + assert_eq!(msg.msg_type(), ReceivedIngestRequest::TYPE); + msg.into_typed() + } + + #[test(restate_core::test)] + async fn test_client_single_record() { + let (mut incoming, client) = init_env(10).await; + + let commit = client + .ingest(0, InputRecord::from_str("hello world")) + .await + .unwrap(); + + let msg = must_next(&mut incoming).await; + let (rx, body) = msg.split(); + assert_that!( + body.records, + all!( + len(eq(1)), + contains(eq(InputRecord::from_str("hello world").into_record())) + ) + ); + + rx.send(IngestResponse::Ack); + + commit.await.expect("to resolve"); + } + + #[test(restate_core::test)] + async fn test_client_single_record_retry() { + let (mut incoming, client) = init_env(10).await; + + let mut commit = client + .ingest(0, InputRecord::from_str("hello world")) + .await + .unwrap(); + + let msg = must_next(&mut incoming).await; + let (rx, _) = msg.split(); + rx.send(IngestResponse::NotLeader { of: 0.into() }); + + assert!((&mut commit).now_or_never().is_none()); + + // ingress will retry automatically so we must receive another message + let msg = must_next(&mut incoming).await; + let (rx, body) = msg.split(); + assert_that!( + body.records, + all!( + len(eq(1)), + contains(eq(InputRecord::from_str("hello world").into_record())) + ) + ); + // lets acknowledge it this time + rx.send(IngestResponse::Ack); + + commit.await.expect("to resolve"); + } + + #[test(restate_core::test)] + async fn test_client_close() { + let (_, client) = init_env(10).await; + + let commit = client + .ingest(0, InputRecord::from_str("hello world")) + .await + .unwrap(); + + client.close(); + + assert!(matches!(commit.await, Err(CommitError::Cancelled))); + } + + #[test(restate_core::test(start_paused = true))] + async fn test_client_dispatch() { + let (mut incoming, client) = init_env(10).await; + + let pt = Metadata::with_current(|p| p.partition_table_snapshot()); + + for p in 0..4 { + let partition_id = PartitionId::from(p); + let partition = pt.get(&partition_id).unwrap(); + client + .ingest( + *partition.key_range.start(), + InputRecord::from_str(format!("partition {p}")), + ) + .await + .unwrap(); + } + + tokio::time::advance(Duration::from_millis(10)).await; // batch timeout + + // what happens is that we still get 4 different message because each targets + // a single partition. 
+ let mut received = vec![]; + for _ in 0..4 { + let msg = must_next(&mut incoming).await; + received.push(msg.sort_code()); + } + + assert_that!( + received, + all!( + len(eq(4)), //4 messages for 4 partitions + contains(eq(Some(0))), + contains(eq(Some(1))), + contains(eq(Some(2))), + contains(eq(Some(3))), + ) + ); + } +} diff --git a/crates/ingestion-client/src/lib.rs b/crates/ingestion-client/src/lib.rs new file mode 100644 index 0000000000..830d43e306 --- /dev/null +++ b/crates/ingestion-client/src/lib.rs @@ -0,0 +1,16 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod chunks_size; +mod client; +mod session; + +pub use client::{IngestionClient, IngestionError}; +pub use session::{CommitError, RecordCommit, SessionOptions}; diff --git a/crates/ingestion-client/src/session.rs b/crates/ingestion-client/src/session.rs new file mode 100644 index 0000000000..065e84a8f6 --- /dev/null +++ b/crates/ingestion-client/src/session.rs @@ -0,0 +1,473 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::{collections::VecDeque, sync::Arc, time::Duration}; + +use dashmap::DashMap; +use futures::{FutureExt, StreamExt, future::OptionFuture, ready}; +use tokio::sync::{OwnedSemaphorePermit, mpsc, oneshot}; +use tokio_stream::wrappers::UnboundedReceiverStream; +use tokio_util::sync::CancellationToken; +use tracing::{debug, trace}; + +use restate_core::{ + TaskCenter, TaskKind, cancellation_token, + network::{ + ConnectError, Connection, ConnectionClosed, NetworkSender, Networking, ReplyRx, Swimlane, + TransportConnect, + }, + partitions::PartitionRouting, +}; +use restate_types::{ + identifiers::PartitionId, + net::ingest::{IngestRecord, IngestRequest, IngestResponse}, + retries::{RetryIter, RetryPolicy}, +}; + +use crate::chunks_size::ChunksSize; + +/// Error returned when attempting to use a session that has already been closed. +#[derive(Clone, Copy, Debug, thiserror::Error)] +#[error("Partition session is closed")] +pub struct SessionClosed; + +/// Commitment failures that can be observed when waiting on [`RecordCommit`]. +#[derive(Debug, Clone, Copy, thiserror::Error)] +pub enum CommitError { + #[error("commit cancelled")] + Cancelled, +} + +/// Future that is resolved to the commit result +/// A [`CommitError::Cancelled`] might be returned +/// if [`crate::Ingress`] is closed while record is in +/// flight. This does not guarantee that the record +/// was not processed or committed. 
+pub struct RecordCommit { + rx: oneshot::Receiver>, +} + +impl Future for RecordCommit { + type Output = Result<(), CommitError>; + + fn poll( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll { + match ready!(self.rx.poll_unpin(cx)) { + Ok(result) => std::task::Poll::Ready(result), + Err(_) => std::task::Poll::Ready(Err(CommitError::Cancelled)), + } + } +} + +impl RecordCommit { + fn new(permit: OwnedSemaphorePermit) -> (Self, RecordCommitResolver) { + let (tx, rx) = oneshot::channel(); + ( + Self { rx }, + RecordCommitResolver { + tx, + _permit: permit, + }, + ) + } +} + +struct RecordCommitResolver { + tx: oneshot::Sender>, + _permit: OwnedSemaphorePermit, +} + +impl RecordCommitResolver { + /// Resolve the [`RecordCommit`] to committed. + pub fn committed(self) { + let _ = self.tx.send(Ok(())); + } + + /// explicitly cancel the RecordCommit + /// If resolver is dropped, the RecordCommit + /// will resolve to [`CommitError::Cancelled`] + #[allow(dead_code)] + pub fn cancelled(self) { + let _ = self.tx.send(Err(CommitError::Cancelled)); + } +} + +struct IngressBatch { + records: Arc<[IngestRecord]>, + resolvers: Vec, + + reply_rx: Option>, +} + +impl IngressBatch { + fn new(batch: impl IntoIterator) -> Self { + let (resolvers, records): (Vec<_>, Vec<_>) = batch.into_iter().unzip(); + let records: Arc<[IngestRecord]> = Arc::from(records); + + Self { + records, + resolvers, + reply_rx: None, + } + } + /// Marks every tracked record in the batch as committed. + fn committed(self) { + for resolver in self.resolvers { + resolver.committed(); + } + } +} + +/// Tunable parameters for batching and networking behaviour of partition sessions. +#[derive(Debug, Clone)] +pub struct SessionOptions { + /// Maximum batch size in `bytes` + pub batch_size: usize, + /// Connection retry policy + /// Retry policy must be infinite (retries forever) + /// If not, the retry will fallback to 2 seconds intervals + pub connect_retry_policy: RetryPolicy, + /// Connection swimlane + pub swimlane: Swimlane, +} + +impl Default for SessionOptions { + fn default() -> Self { + Self { + // The default batch size of 50KB is to avoid + // overwhelming the PP on the hot path. + batch_size: 50 * 1024, // 50 KB + swimlane: Swimlane::IngressData, + connect_retry_policy: RetryPolicy::exponential( + Duration::from_millis(10), + 2.0, + None, + Some(Duration::from_secs(1)), + ), + } + } +} + +/// Cloneable sender that enqueues records for a specific partition session. +#[derive(Clone)] +pub struct SessionHandle { + tx: mpsc::UnboundedSender<(RecordCommitResolver, IngestRecord)>, +} + +impl SessionHandle { + /// Enqueues an ingest request along with the owned permit and returns a future tracking commit outcome. + pub fn ingest( + &self, + permit: OwnedSemaphorePermit, + record: IngestRecord, + ) -> Result { + let (commit, resolver) = RecordCommit::new(permit); + self.tx + .send((resolver, record)) + .map_err(|_| SessionClosed)?; + + Ok(commit) + } +} + +enum SessionState { + Connecting { retry: RetryIter<'static> }, + Connected { connection: Connection }, + Disconnected, + Shutdown, +} + +/// Background task that drives the lifecycle of a single partition connection. 
+pub struct PartitionSession { + partition: PartitionId, + partition_routing: PartitionRouting, + networking: Networking, + opts: SessionOptions, + rx: UnboundedReceiverStream<(RecordCommitResolver, IngestRecord)>, + tx: mpsc::UnboundedSender<(RecordCommitResolver, IngestRecord)>, + inflight: VecDeque, +} + +impl PartitionSession { + fn new( + networking: Networking, + partition_routing: PartitionRouting, + partition: PartitionId, + opts: SessionOptions, + ) -> Self { + let (tx, rx) = mpsc::unbounded_channel(); + let rx = UnboundedReceiverStream::new(rx); + + Self { + partition, + partition_routing, + networking, + opts, + inflight: Default::default(), + rx, + tx, + } + } + + /// Returns a handle that can be used by callers to enqueue new records. + pub fn handle(&self) -> SessionHandle { + SessionHandle { + tx: self.tx.clone(), + } + } +} + +impl PartitionSession +where + T: TransportConnect, +{ + /// Runs the session state machine until shut down, reacting to cancellation and connection errors. + pub async fn start(mut self, cancellation: CancellationToken) { + let mut state = SessionState::Connecting { + retry: self.opts.connect_retry_policy.clone().into_iter(), + }; + + debug!( + partition_id = %self.partition, + "Starting ingress partition session", + ); + + loop { + state = match state { + SessionState::Connecting { retry } => cancellation + .run_until_cancelled(self.connect(retry)) + .await + .unwrap_or(SessionState::Shutdown), + SessionState::Connected { connection } => cancellation + .run_until_cancelled(self.connected(connection)) + .await + .unwrap_or(SessionState::Shutdown), + SessionState::Disconnected => SessionState::Connecting { + retry: self.opts.connect_retry_policy.clone().into_iter(), + }, + SessionState::Shutdown => { + self.rx.close(); + break; + } + } + } + } + + async fn connect(&self, mut retry: RetryIter<'static>) -> SessionState { + let Some(node_id) = self.partition_routing.get_node_by_partition(self.partition) else { + tokio::time::sleep(retry.next().unwrap_or_else(|| Duration::from_secs(2))).await; + + return SessionState::Connecting { retry }; + }; + + let result = self + .networking + .get_connection(node_id, self.opts.swimlane) + .await; + + match result { + Ok(connection) => SessionState::Connected { connection }, + Err(ConnectError::Shutdown(_)) => SessionState::Shutdown, + Err(err) => { + debug!("Failed to connect to node {node_id}: {err}"); + tokio::time::sleep(retry.next().unwrap_or_else(|| Duration::from_secs(1))).await; + SessionState::Connecting { retry } + } + } + } + + /// Re-sends all inflight batches after a connection is restored. + async fn replay(&mut self, connection: &Connection) -> Result<(), ConnectionClosed> { + // todo(azmy): to avoid all the inflight batches again and waste traffic + // maybe test the connection first by sending an empty batch and wait for response + // before proceeding? 
+ + for batch in self.inflight.iter_mut() { + let Some(permit) = connection.reserve().await else { + return Err(ConnectionClosed); + }; + + // resend batch + let reply_rx = permit + .send_rpc( + IngestRequest::from(Arc::clone(&batch.records)), + Some(self.partition.into()), + ) + .expect("encoding version to match"); + batch.reply_rx = Some(reply_rx); + } + + Ok(()) + } + + async fn connected(&mut self, connection: Connection) -> SessionState { + if self.replay(&connection).await.is_err() { + return SessionState::Disconnected; + } + + let mut chunked = ChunksSize::new(&mut self.rx, self.opts.batch_size, |(_, item)| { + item.estimate_size() + }); + + let state = loop { + let head: OptionFuture<_> = self + .inflight + .front_mut() + .and_then(|batch| batch.reply_rx.as_mut()) + .into(); + + tokio::select! { + _ = connection.closed() => { + break SessionState::Disconnected; + } + Some(batch) = chunked.next() => { + let batch = IngressBatch::new(batch); + let records = Arc::clone(&batch.records); + + self.inflight.push_back(batch); + + let Some(permit) = connection.reserve().await else { + break SessionState::Disconnected; + }; + + trace!("Sending ingest batch, len: {}", records.len()); + let reply_rx = permit + .send_rpc(IngestRequest::from(records), Some(self.partition.into())) + .expect("encoding version to match"); + + self.inflight.back_mut().expect("to exist").reply_rx = Some(reply_rx); + } + Some(result) = head => { + match result { + Ok(IngestResponse::Ack) => { + let batch = self.inflight.pop_front().expect("not empty"); + batch.committed(); + } + Ok(response) => { + // Handle any other response code as a connection loss + // and retry all inflight batches. + debug!("Ingest response '{:?}'", response); + break SessionState::Disconnected; + } + Err(_err) => { + // we can assume that for any error + // we need to retry all the inflight batches. + // special case for load shedding we could + // throttle the stream a little bit then + // speed up over a period of time. + + break SessionState::Disconnected; + } + } + } + } + }; + + // state == Disconnected + assert!(matches!(state, SessionState::Disconnected)); + + // don't lose the buffered batch + let remainder = chunked.into_remainder(); + if !remainder.is_empty() { + self.inflight.push_back(IngressBatch::new(remainder)); + } + + state + } +} + +struct SessionManagerInner { + networking: Networking, + partition_routing: PartitionRouting, + opts: SessionOptions, + ctx: CancellationToken, + handles: DashMap, +} + +impl SessionManagerInner +where + T: TransportConnect, +{ + /// Gets or start a new session to partition with given partition id. + /// It guarantees that only one session is started per partition id. + pub fn get(&self, id: PartitionId) -> SessionHandle { + self.handles + .entry(id) + .or_insert_with(|| { + let session = PartitionSession::new( + self.networking.clone(), + self.partition_routing.clone(), + id, + self.opts.clone(), + ); + + let handle = session.handle(); + + //todo(azmy): handle spawn result + let ctx = self.ctx.clone(); + let _ = TaskCenter::spawn( + TaskKind::Background, + "ingestion-partition-session", + async move { + session.start(ctx).await; + Ok(()) + }, + ); + + handle + }) + .value() + .clone() + } +} + +/// Manager that owns all partition sessions and caches their handles. +#[derive(Clone)] +pub struct SessionManager { + inner: Arc>, +} + +impl SessionManager { + /// Creates a new session manager with optional overrides for session behaviour. 
+ pub fn new( + networking: Networking, + partition_routing: PartitionRouting, + opts: Option, + ) -> Self { + let inner = SessionManagerInner { + networking, + partition_routing, + opts: opts.unwrap_or_default(), + handles: Default::default(), + ctx: cancellation_token().child_token(), + }; + + Self { + inner: Arc::new(inner), + } + } +} + +impl SessionManager +where + T: TransportConnect, +{ + /// Returns a handle to the session for the given partition, creating it if needed. + pub fn get(&self, id: PartitionId) -> SessionHandle { + self.inner.get(id) + } + + /// Signals all sessions to shut down and prevents new work from being scheduled. + pub fn close(&self) { + self.inner.ctx.cancel(); + } +} diff --git a/crates/ingress-kafka/Cargo.toml b/crates/ingress-kafka/Cargo.toml index ed90980cbc..19b5b66dcc 100644 --- a/crates/ingress-kafka/Cargo.toml +++ b/crates/ingress-kafka/Cargo.toml @@ -18,8 +18,8 @@ oidc = ["rdkafka/curl-static", "rdkafka/gssapi-vendored"] [dependencies] restate-workspace-hack = { workspace = true } -restate-bifrost = { workspace = true } restate-core = { workspace = true } +restate-ingestion-client = { workspace = true } restate-serde-util = { workspace = true } restate-storage-api = { workspace = true } restate-timer-queue = { workspace = true } @@ -31,6 +31,7 @@ anyhow = { workspace = true } base64 = { workspace = true } bytes = { workspace = true } derive_more = { workspace = true } +futures = { workspace = true } metrics = { workspace = true } opentelemetry = { workspace = true } opentelemetry_sdk = { workspace = true } @@ -45,6 +46,7 @@ thiserror = { workspace = true } tokio = { workspace = true, features = ["sync", "rt"] } tracing = { workspace = true } tracing-opentelemetry = { workspace = true } +xxhash-rust = { workspace = true, features = ["xxh3", "std"] } [dev-dependencies] restate-types = { workspace = true, features = ["test-util"] } diff --git a/crates/ingress-kafka/src/builder.rs b/crates/ingress-kafka/src/builder.rs new file mode 100644 index 0000000000..8aa21c0cd1 --- /dev/null +++ b/crates/ingress-kafka/src/builder.rs @@ -0,0 +1,310 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
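// Hedged tuning sketch for the `SessionOptions` introduced in session.rs above (and passed
// through `SessionManager::new` / `IngestionClient::new`). The 128 KiB batch size and the retry
// parameters below are arbitrary example values, not the patch's defaults (those are 50 KiB and
// a 10ms-to-1s exponential policy on the IngressData swimlane):
use std::time::Duration;

use restate_core::network::Swimlane;
use restate_ingestion_client::SessionOptions;
use restate_types::retries::RetryPolicy;

fn tuned_session_options() -> SessionOptions {
    SessionOptions {
        // Larger batches amortise the per-RPC overhead at the cost of extra latency per record.
        batch_size: 128 * 1024,
        // The policy should effectively retry forever; when it is exhausted, the session falls
        // back to fixed sleeps between reconnect attempts.
        connect_retry_policy: RetryPolicy::exponential(
            Duration::from_millis(10),
            2.0,
            None,
            Some(Duration::from_secs(2)),
        ),
        swimlane: Swimlane::IngressData,
    }
}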
+ +use std::borrow::Borrow; + +use anyhow::bail; +use base64::Engine; +use bytes::Bytes; +use opentelemetry::propagation::{Extractor, TextMapPropagator}; +use opentelemetry::trace::{Span, SpanContext, TraceContextExt}; +use opentelemetry_sdk::propagation::TraceContextPropagator; +use rdkafka::Message; +use rdkafka::message::BorrowedMessage; +use tracing::{info_span, trace}; + +use restate_storage_api::deduplication_table::DedupInformation; +use restate_types::identifiers::{InvocationId, WithPartitionKey, partitioner}; +use restate_types::invocation::{Header, InvocationTarget, ServiceInvocation, SpanRelation}; +use restate_types::live::Live; +use restate_types::schema::Schema; +use restate_types::schema::invocation_target::{DeploymentStatus, InvocationTargetResolver}; +use restate_types::schema::subscriptions::{EventInvocationTargetTemplate, Sink, Subscription}; +use restate_wal_protocol::{Command, Destination, Envelope, Source}; + +use crate::Error; + +#[derive(Clone)] +pub struct EnvelopeBuilder { + subscription: Subscription, + schema: Live, + // avoids creating a new string with each invocation + subscription_id: String, +} + +impl EnvelopeBuilder { + pub fn new(subscription: Subscription, schema: Live) -> Self { + Self { + subscription_id: subscription.id().to_string(), + subscription, + schema, + } + } + + pub fn subscription(&self) -> &Subscription { + &self.subscription + } + + pub fn build( + &mut self, + producer_id: u128, + consumer_group_id: &str, + msg: BorrowedMessage<'_>, + ) -> Result { + // Prepare ingress span + let ingress_span = info_span!( + "kafka_ingress_consume", + otel.name = "kafka_ingress_consume", + messaging.system = "kafka", + messaging.operation = "receive", + messaging.source.name = msg.topic(), + messaging.source.partition = msg.partition(), + messaging.destination.name = %self.subscription.sink(), + restate.subscription.id = %self.subscription.id(), + messaging.consumer.group.name = consumer_group_id + ); + + trace!(parent: &ingress_span, "Building Kafka ingress request"); + + let key = if let Some(k) = msg.key() { + Bytes::copy_from_slice(k) + } else { + Bytes::default() + }; + let payload = if let Some(p) = msg.payload() { + Bytes::copy_from_slice(p) + } else { + Bytes::default() + }; + + let headers = Self::generate_events_attributes(&msg, &self.subscription_id); + let dedup = DedupInformation::producer(producer_id, msg.offset() as u64); + + let invocation = InvocationBuilder::create( + &self.subscription, + producer_id, + self.schema.live_load(), + key, + payload, + headers, + consumer_group_id, + msg.topic(), + msg.partition(), + msg.offset(), + ) + .map_err(|cause| Error::Event { + topic: msg.topic().to_string(), + partition: msg.partition(), + offset: msg.offset(), + cause, + })?; + + Ok(self.wrap_service_invocation_in_envelope(invocation, dedup)) + } + + fn wrap_service_invocation_in_envelope( + &self, + service_invocation: Box, + dedup_information: DedupInformation, + ) -> Envelope { + let header = restate_wal_protocol::Header { + source: Source::Ingress {}, + dest: Destination::Processor { + partition_key: service_invocation.partition_key(), + dedup: Some(dedup_information), + }, + }; + + Envelope::new(header, Command::Invoke(service_invocation)) + } + + fn generate_events_attributes(msg: &impl Message, subscription_id: &str) -> Vec
{ + let mut headers = Vec::with_capacity(6); + headers.push(Header::new("kafka.offset", msg.offset().to_string())); + headers.push(Header::new("kafka.topic", msg.topic())); + headers.push(Header::new("kafka.partition", msg.partition().to_string())); + if let Some(timestamp) = msg.timestamp().to_millis() { + headers.push(Header::new("kafka.timestamp", timestamp.to_string())); + } + headers.push(Header::new("restate.subscription.id", subscription_id)); + + if let Some(key) = msg.key() { + headers.push(Header::new( + "kafka.key", + &*base64::prelude::BASE64_URL_SAFE.encode(key), + )); + } + + headers + } +} + +#[derive(Debug)] +pub struct InvocationBuilder; + +impl InvocationBuilder { + #[allow(clippy::too_many_arguments)] + pub fn create( + subscription: &Subscription, + producer_id: u128, + schema: &Schema, + key: Bytes, + payload: Bytes, + headers: Vec, + consumer_group_id: &str, + topic: &str, + partition: i32, + offset: i64, + ) -> Result, anyhow::Error> { + let Sink::Invocation { + event_invocation_target_template, + } = subscription.sink(); + + let invocation_target = match event_invocation_target_template { + EventInvocationTargetTemplate::Service { name, handler } => { + InvocationTarget::service(name.clone(), handler.clone()) + } + EventInvocationTargetTemplate::VirtualObject { + name, + handler, + handler_ty, + } => InvocationTarget::virtual_object( + name.clone(), + std::str::from_utf8(&key) + .map_err(|e| anyhow::anyhow!("The Kafka record key must be valid UTF-8: {e}"))? + .to_owned(), + handler.clone(), + *handler_ty, + ), + EventInvocationTargetTemplate::Workflow { + name, + handler, + handler_ty, + } => InvocationTarget::workflow( + name.clone(), + std::str::from_utf8(&key) + .map_err(|e| anyhow::anyhow!("The Kafka record key must be valid UTF-8: {e}"))? + .to_owned(), + handler.clone(), + *handler_ty, + ), + }; + + // Compute the retention values + let target = schema + .resolve_latest_invocation_target( + invocation_target.service_name(), + invocation_target.handler_name(), + ) + .ok_or_else(|| anyhow::anyhow!("Service and handler are not registered"))?; + + if let DeploymentStatus::Deprecated(dp_id) = target.deployment_status { + bail!( + "the service {} is exposed by the deprecated deployment {dp_id}, please upgrade the SDK.", + invocation_target.service_name() + ) + } + + let invocation_retention = target.compute_retention(false); + + let seed = KafkaPartitionKeySeed { + producer: &producer_id, + offset: &offset, + }; + + let invocation_id = InvocationId::generate_or_else(&invocation_target, None, || { + partitioner::HashPartitioner::compute_partition_key(seed) + }); + + // Figure out tracing span + let ingress_span_context = prepare_tracing_span( + &invocation_id, + &invocation_target, + &headers, + consumer_group_id, + topic, + partition as i64, + offset, + ); + + // Finally generate service invocation + let mut service_invocation = Box::new(ServiceInvocation::initialize( + invocation_id, + invocation_target, + restate_types::invocation::Source::Subscription(subscription.id()), + )); + service_invocation.with_related_span(SpanRelation::parent(ingress_span_context)); + service_invocation.argument = payload; + service_invocation.headers = headers; + service_invocation.with_retention(invocation_retention); + + Ok(service_invocation) + } +} + +#[derive(Hash)] +/// Hashable seed that yields a deterministic partition key for service invocations, keeping +/// identical invocations on the same partition for deduplication. 
+struct KafkaPartitionKeySeed<'a> { + producer: &'a u128, + offset: &'a i64, +} + +#[allow(clippy::too_many_arguments)] +pub(crate) fn prepare_tracing_span( + invocation_id: &InvocationId, + invocation_target: &InvocationTarget, + headers: &[restate_types::invocation::Header], + consumer_group_name: &str, + topic: &str, + partition: i64, + offset: i64, +) -> SpanContext { + let tracing_context = TraceContextPropagator::new().extract(&HeaderExtractor(headers)); + let inbound_span = tracing_context.span(); + + let relation = if inbound_span.span_context().is_valid() { + SpanRelation::parent(inbound_span.span_context()) + } else { + SpanRelation::None + }; + + let span = restate_tracing_instrumentation::info_invocation_span!( + relation = relation, + prefix = "ingress_kafka", + id = invocation_id, + target = invocation_target, + tags = ( + messaging.system = "kafka", + messaging.consumer.group.name = consumer_group_name.to_owned(), + messaging.operation.type = "process", + messaging.kafka.offset = offset, + messaging.source.partition.id = partition, + messaging.source.name = topic.to_owned() + ) + ); + + span.span_context().clone() +} + +struct HeaderExtractor<'a>(pub &'a [restate_types::invocation::Header]); + +impl Extractor for HeaderExtractor<'_> { + fn get(&self, key: &str) -> Option<&str> { + self.0 + .iter() + .find(|h| h.name.eq_ignore_ascii_case(key)) + .map(|value| value.value.borrow()) + } + + fn keys(&self) -> Vec<&str> { + self.0.iter().map(|h| h.name.borrow()).collect::>() + } +} diff --git a/crates/ingress-kafka/src/consumer_task.rs b/crates/ingress-kafka/src/consumer_task.rs index f3b1300f86..988b907de0 100644 --- a/crates/ingress-kafka/src/consumer_task.rs +++ b/crates/ingress-kafka/src/consumer_task.rs @@ -8,238 +8,66 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
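KafkaPartitionKeySeed above is what makes the generated invocation id deterministic when the target has no key of its own: the (producer id, offset) pair is hashed into a partition key, so a redelivered Kafka record regenerates the same invocation id on the same Restate partition, where the dedup information can filter it. A small illustration of the idea follows; it uses std's DefaultHasher purely as a stand-in, whereas the actual code goes through partitioner::HashPartitioner.

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Stand-in for partitioner::HashPartitioner: hash a #[derive(Hash)] seed
// into a u64 partition key.
#[derive(Hash)]
struct Seed<'a> {
    producer: &'a u128,
    offset: &'a i64,
}

fn partition_key(seed: &Seed<'_>) -> u64 {
    let mut hasher = DefaultHasher::new();
    seed.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    let producer = 42u128;
    let offset = 1337i64;
    let first = partition_key(&Seed { producer: &producer, offset: &offset });
    let redelivered = partition_key(&Seed { producer: &producer, offset: &offset });
    // The same Kafka record (same producer id and offset) always maps to the
    // same partition key, which is what keeps duplicates on one partition.
    assert_eq!(first, redelivered);
}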
-use std::collections::HashMap; +use std::collections::{HashMap, VecDeque}; use std::fmt; use std::sync::{Arc, OnceLock, Weak}; -use crate::dispatcher::{DispatchKafkaEvent, KafkaIngressDispatcher, KafkaIngressEvent}; -use crate::metric_definitions::{KAFKA_INGRESS_CONSUMER_LAG, KAFKA_INGRESS_REQUESTS}; -use base64::Engine; -use bytes::Bytes; +use futures::future::OptionFuture; use metrics::{counter, gauge}; use rdkafka::consumer::stream_consumer::StreamPartitionQueue; use rdkafka::consumer::{ BaseConsumer, CommitMode, Consumer, ConsumerContext, Rebalance, StreamConsumer, }; use rdkafka::error::KafkaError; -use rdkafka::message::BorrowedMessage; use rdkafka::topic_partition_list::TopicPartitionListElem; use rdkafka::types::RDKafkaErrorCode; use rdkafka::{ClientConfig, ClientContext, Message, Statistics}; + +use restate_core::network::TransportConnect; use restate_core::{TaskCenter, TaskHandle, TaskKind, task_center}; -use restate_types::invocation::Header; -use restate_types::live::Live; -use restate_types::message::MessageIndex; -use restate_types::schema::Schema; -use restate_types::schema::subscriptions::{EventInvocationTargetTemplate, Sink, Subscription}; +use restate_ingestion_client::{CommitError, IngestionClient, IngestionError, RecordCommit}; +use restate_types::identifiers::WithPartitionKey; +use restate_wal_protocol::Envelope; use tokio::sync::{mpsc, oneshot}; -use tracing::{Instrument, debug, info, info_span, warn}; - -#[derive(Debug, thiserror::Error)] -pub enum Error { - #[error(transparent)] - Kafka(#[from] KafkaError), - #[error( - "error processing message topic {topic} partition {partition} offset {offset}: {cause}" - )] - Event { - topic: String, - partition: i32, - offset: i64, - #[source] - cause: anyhow::Error, - }, - #[error("ingress dispatcher channel is closed")] - IngressDispatcherClosed, - #[error( - "received a message on the main partition queue for topic {0} partition {1} despite partitioned queues" - )] - UnexpectedMainQueueMessage(String, i32), -} - -type MessageConsumer = StreamConsumer; - -#[derive(Debug, Hash)] -pub struct KafkaDeduplicationId { - consumer_group: String, - topic: String, - partition: i32, -} - -impl fmt::Display for KafkaDeduplicationId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "{}-{}-{}", - self.consumer_group, self.topic, self.partition - ) - } -} - -impl KafkaDeduplicationId { - pub(crate) fn requires_proxying(subscription: &Subscription) -> bool { - // Service event receiver requires proxying because we don't want to scatter deduplication ids (kafka topic/partition offsets) in all the Restate partitions. - matches!( - subscription.sink(), - Sink::Invocation { - event_invocation_target_template: EventInvocationTargetTemplate::Service { .. 
} - }, - ) - } -} - -#[derive(Clone)] -pub struct MessageSender { - subscription: Subscription, - dispatcher: KafkaIngressDispatcher, - schema: Live, - - subscription_id: String, - ingress_request_counter: metrics::Counter, -} - -impl MessageSender { - pub fn new( - subscription: Subscription, - dispatcher: KafkaIngressDispatcher, - schema: Live, - ) -> Self { - Self { - subscription_id: subscription.id().to_string(), - ingress_request_counter: counter!( - KAFKA_INGRESS_REQUESTS, - "subscription" => subscription.id().to_string() - ), - subscription, - dispatcher, - schema, - } - } +use tracing::{debug, info, trace, warn}; - async fn send(&self, consumer_group_id: &str, msg: BorrowedMessage<'_>) -> Result<(), Error> { - // Prepare ingress span - let ingress_span = info_span!( - "kafka_ingress_consume", - otel.name = "kafka_ingress_consume", - messaging.system = "kafka", - messaging.operation = "receive", - messaging.source.name = msg.topic(), - messaging.destination.name = %self.subscription.sink(), - restate.subscription.id = %self.subscription.id(), - messaging.consumer.group.name = consumer_group_id - ); - info!(parent: &ingress_span, "Processing Kafka ingress request"); - - let key = if let Some(k) = msg.key() { - Bytes::copy_from_slice(k) - } else { - Bytes::default() - }; - let payload = if let Some(p) = msg.payload() { - Bytes::copy_from_slice(p) - } else { - Bytes::default() - }; - let headers = Self::generate_events_attributes(&msg, &self.subscription_id); - - let (deduplication_id, deduplication_index) = - Self::generate_deduplication_id(consumer_group_id, &msg); - let req = KafkaIngressEvent::new( - &self.subscription, - self.schema.pinned(), - key, - payload, - deduplication_id, - deduplication_index, - headers, - consumer_group_id, - msg.topic(), - msg.partition(), - msg.offset(), - ) - .map_err(|cause| Error::Event { - topic: msg.topic().to_string(), - partition: msg.partition(), - offset: msg.offset(), - cause, - })?; - - self.ingress_request_counter.increment(1); - - self.dispatcher - .dispatch_kafka_event(req) - .instrument(ingress_span) - .await - .map_err(|_| Error::IngressDispatcherClosed)?; - Ok(()) - } - - fn generate_events_attributes(msg: &impl Message, subscription_id: &str) -> Vec
{ - let mut headers = Vec::with_capacity(6); - headers.push(Header::new("kafka.offset", msg.offset().to_string())); - headers.push(Header::new("kafka.topic", msg.topic())); - headers.push(Header::new("kafka.partition", msg.partition().to_string())); - if let Some(timestamp) = msg.timestamp().to_millis() { - headers.push(Header::new("kafka.timestamp", timestamp.to_string())); - } - headers.push(Header::new( - "restate.subscription.id".to_string(), - subscription_id, - )); - - if let Some(key) = msg.key() { - headers.push(Header::new( - "kafka.key", - &*base64::prelude::BASE64_URL_SAFE.encode(key), - )); - } - - headers - } - - fn generate_deduplication_id( - consumer_group: &str, - msg: &impl Message, - ) -> (KafkaDeduplicationId, MessageIndex) { - ( - KafkaDeduplicationId { - consumer_group: consumer_group.to_owned(), - topic: msg.topic().to_owned(), - partition: msg.partition(), - }, - msg.offset() as u64, - ) - } +use crate::Error; +use crate::builder::EnvelopeBuilder; +use crate::metric_definitions::{KAFKA_INGRESS_CONSUMER_LAG, KAFKA_INGRESS_REQUESTS}; - fn update_consumer_stats(&self, stats: Statistics) { - for topic in stats.topics { - for partition in topic.1.partitions { - let lag = partition.1.consumer_lag as f64; - gauge!( - KAFKA_INGRESS_CONSUMER_LAG, - "subscription" => self.subscription.id().to_string(), - "topic" => topic.0.to_string(), - "partition" => partition.0.to_string() - ) - .set(lag); - } +impl From for Error { + fn from(value: IngestionError) -> Self { + match value { + IngestionError::Closed => Self::IngressClosed, + IngestionError::PartitionTableError(err) => Self::PartitionTableError(err), } } } +type MessageConsumer = StreamConsumer>; #[derive(Clone)] -pub struct ConsumerTask { +pub struct ConsumerTask { client_config: ClientConfig, topics: Vec, - sender: MessageSender, + ingestion: IngestionClient, + builder: EnvelopeBuilder, } -impl ConsumerTask { - pub fn new(client_config: ClientConfig, topics: Vec, sender: MessageSender) -> Self { +impl ConsumerTask +where + T: TransportConnect, +{ + pub fn new( + client_config: ClientConfig, + topics: Vec, + ingestion: IngestionClient, + builder: EnvelopeBuilder, + ) -> Self { Self { client_config, topics, - sender, + ingestion, + builder, } } @@ -251,7 +79,7 @@ impl ConsumerTask { .expect("group.id must be set") .to_string(); debug!( - restate.subscription.id = %self.sender.subscription.id(), + restate.subscription.id = %self.builder.subscription().id(), messaging.consumer.group.name = consumer_group_id, "Starting consumer for topics {:?} with configuration {:?}", self.topics, self.client_config @@ -264,10 +92,11 @@ impl ConsumerTask { consumer: OnceLock::new(), topic_partition_tasks: parking_lot::Mutex::new(HashMap::new()), failures_tx, - sender: self.sender.clone(), + ingestion: self.ingestion.clone(), + builder: self.builder.clone(), consumer_group_id, }; - let consumer: Arc = + let consumer: Arc> = Arc::new(self.client_config.create_with_context(rebalance_context)?); // this OnceLock dance is needed because the rebalance callbacks don't get a handle on the consumer, // which is strange because practically everything you'd want to do with them involves the consumer. 
@@ -303,9 +132,9 @@ impl ConsumerTask { } #[derive(derive_more::Deref)] -struct ConsumerDrop(Arc); +struct ConsumerDrop(Arc>); -impl Drop for ConsumerDrop { +impl Drop for ConsumerDrop { fn drop(&mut self) { debug!( "Stopping consumer with id {}", @@ -332,18 +161,33 @@ impl fmt::Display for TopicPartition { } } -struct RebalanceContext { +struct RebalanceContext { task_center_handle: task_center::Handle, - consumer: OnceLock>, + consumer: OnceLock>>, topic_partition_tasks: parking_lot::Mutex>, failures_tx: mpsc::UnboundedSender, - sender: MessageSender, + ingestion: IngestionClient, + builder: EnvelopeBuilder, consumer_group_id: String, } -impl ClientContext for RebalanceContext { +impl ClientContext for RebalanceContext +where + T: TransportConnect, +{ fn stats(&self, statistics: Statistics) { - self.sender.update_consumer_stats(statistics); + for topic in statistics.topics { + for partition in topic.1.partitions { + let lag = partition.1.consumer_lag as f64; + gauge!( + KAFKA_INGRESS_CONSUMER_LAG, + "subscription" => self.builder.subscription().id().to_string(), + "topic" => topic.0.to_string(), + "partition" => partition.0.to_string() + ) + .set(lag); + } + } } } @@ -358,7 +202,10 @@ impl ClientContext for RebalanceContext { // and their queues are destroyed. Split partition queues will stop working in this case. We should ensure // that they are not polled again after the assign. Then there will be a further rebalance callback after the revoke // and we will set up new split partition streams before the assign. -impl ConsumerContext for RebalanceContext { +impl ConsumerContext for RebalanceContext +where + T: TransportConnect, +{ fn pre_rebalance(&self, _base_consumer: &BaseConsumer, rebalance: &Rebalance<'_>) { let mut topic_partition_tasks = self.topic_partition_tasks.lock(); let consumer = self @@ -375,6 +222,12 @@ impl ConsumerContext for RebalanceContext { Rebalance::Assign(partitions) if partitions.count() > 0 => { for partition in partitions.elements() { let partition: TopicPartition = partition.into(); + info!( + subscription = %self.builder.subscription().id(), + topic = %partition.0, + partition = %partition.1, + "Assigned kafka partition" + ); if let Some(task_id) = topic_partition_tasks.remove(&partition) { // This probably implies a problem in our assumptions, because librdkafka shouldn't be assigning us a partition again without having revoked it. 
@@ -387,8 +240,9 @@ impl ConsumerContext for RebalanceContext { match consumer.split_partition_queue(&partition.0, partition.1) { Some(queue) => { - let task = topic_partition_queue_consumption_loop( - self.sender.clone(), + let task = TopicPartitionConsumptionTask::new( + self.ingestion.clone(), + self.builder.clone(), partition.clone(), queue, Arc::clone(&consumer), @@ -399,7 +253,7 @@ impl ConsumerContext for RebalanceContext { if let Ok(task_handle) = self.task_center_handle.spawn_unmanaged( TaskKind::Ingress, "kafka-partition-ingest", - task, + task.run(), ) { topic_partition_tasks.insert(partition, AbortOnDrop(task_handle)); } else { @@ -418,7 +272,14 @@ impl ConsumerContext for RebalanceContext { } Rebalance::Revoke(partitions) if partitions.count() > 0 => { for partition in partitions.elements() { - let partition = partition.into(); + let partition: TopicPartition = partition.into(); + info!( + subscription = %self.builder.subscription().id(), + topic = %partition.0, + partition = %partition.1, + "Revoked kafka partition" + ); + match topic_partition_tasks.remove(&partition) { Some(task_id) => { debug!( @@ -457,36 +318,159 @@ impl Drop for AbortOnDrop { } } -async fn topic_partition_queue_consumption_loop( - sender: MessageSender, +struct TopicPartitionConsumptionTask +where + T: TransportConnect, + C: ConsumerContext, +{ + ingestion: IngestionClient, + builder: EnvelopeBuilder, topic_partition: TopicPartition, - topic_partition_consumer: StreamPartitionQueue, - consumer: Arc, + topic_partition_consumer: StreamPartitionQueue, + consumer: Arc>, consumer_group_id: String, failed: mpsc::UnboundedSender, -) { - debug!( - restate.subscription.id = %sender.subscription.id(), - messaging.consumer.group.name = consumer_group_id, - "Starting topic '{}' partition '{}' consumption loop", - topic_partition.0, - topic_partition.1 - ); - // this future will be aborted when the partition is no longer needed, so any exit is a failure - let err = loop { - let res = topic_partition_consumer.recv().await; - let msg = match res { - Ok(msg) => msg, - Err(err) => break err.into(), - }; - let offset = msg.offset(); - if let Err(err) = sender.send(&consumer_group_id, msg).await { - break err; +} + +impl TopicPartitionConsumptionTask +where + T: TransportConnect, + C: ConsumerContext, +{ + fn new( + ingestion: IngestionClient, + builder: EnvelopeBuilder, + topic_partition: TopicPartition, + topic_partition_consumer: StreamPartitionQueue, + consumer: Arc>, + consumer_group_id: String, + failed: mpsc::UnboundedSender, + ) -> Self { + Self { + ingestion, + builder, + topic_partition, + topic_partition_consumer, + consumer, + consumer_group_id, + failed, } - if let Err(err) = consumer.store_offset(&topic_partition.0, topic_partition.1, offset) { - break err.into(); + } + + async fn run(mut self) { + // this future will be aborted when the partition is no longer needed, so any exit is a failure + if let Err(err) = self.run_inner().await { + _ = self.failed.send(err); } - }; + } + + async fn run_inner(&mut self) -> Result<(), Error> { + debug!( + restate.subscription.id = %self.builder.subscription().id(), + messaging.consumer.group.name = self.consumer_group_id, + "Starting topic '{}' partition '{}' consumption loop", + self.topic_partition.0, + self.topic_partition.1 + ); + + let producer_id = dedup_producer_id( + &self.consumer_group_id, + &self.topic_partition.0, + self.topic_partition.1, + ); + + let ingress_request_counter = counter!( + KAFKA_INGRESS_REQUESTS, + "subscription" => 
self.builder.subscription().id().to_string(), + "topic" => self.topic_partition.0.to_string(), + "partition" => self.topic_partition.1.to_string(), + ); + + let mut inflight = VecDeque::new(); + + loop { + tokio::select! { + biased; + Some(committed) = Self::head_committed(&mut inflight) => { + let head = inflight.pop_front().expect("to exist"); + if let Err(CommitError::Cancelled) = committed { + return Err(Error::IngressClosed); + } + ingress_request_counter.increment(1); + trace!( + topic=%self.topic_partition.0, kafka_partition=%self.topic_partition.1, offset=%head.offset, + "store kafka offset", + ); + self.consumer.store_offset(&self.topic_partition.0, self.topic_partition.1, head.offset)?; + }, + received = self.topic_partition_consumer.recv() => { + trace!( + topic=%self.topic_partition.0, kafka_partition=%self.topic_partition.1, + "ingesting kafka message" + ); + + let msg = received?; + let offset = msg.offset(); + + let envelope = self.builder.build(producer_id, &self.consumer_group_id, msg)?; + + // while trying to send the message, we also make sure to process committed messages to release permits as soon as + // possible, which gives better chances for sending to + // not block + let mut ingest = std::pin::pin!(self.ingestion.ingest(envelope.partition_key(), envelope)); + + let commit_token = loop { + tokio::select!{ + biased; + token = &mut ingest => { + break token?; + } + Some(committed) = Self::head_committed(&mut inflight) => { + let head = inflight.pop_front().expect("to exist"); + if let Err(CommitError::Cancelled) = committed { + return Err(Error::IngressClosed); + } + ingress_request_counter.increment(1); + trace!( + topic=%self.topic_partition.0, kafka_partition=%self.topic_partition.1, offset=%head.offset, + "store kafka offset", + ); + self.consumer.store_offset(&self.topic_partition.0, self.topic_partition.1, head.offset)?; + } + } + }; + + inflight.push_back(InflightMessage{offset, commit_token}); + } + } + } + } + + #[inline] + pub fn head_committed( + inflight: &mut VecDeque, + ) -> OptionFuture<&mut RecordCommit> { + OptionFuture::from( + inflight + .front_mut() + .map(|i: &mut InflightMessage| &mut i.commit_token), + ) + } +} + +// Do not change. Changing this hasher will create new producer-id which can +// cause duplicates +fn dedup_producer_id(consumer_group: &str, topic: &str, partition: i32) -> u128 { + use std::io::Write; + + let mut hasher = xxhash_rust::xxh3::Xxh3::new(); + // todo(azmy): add cluster name to the hash? + write!(hasher, "{consumer_group}:{topic}:{partition}").unwrap(); + + hasher.digest128() +} - _ = failed.send(err); +struct InflightMessage { + offset: i64, + commit_token: RecordCommit, } diff --git a/crates/ingress-kafka/src/dispatcher.rs b/crates/ingress-kafka/src/dispatcher.rs deleted file mode 100644 index 558c23b448..0000000000 --- a/crates/ingress-kafka/src/dispatcher.rs +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. -// All rights reserved. -// -// Use of this software is governed by the Business Source License -// included in the LICENSE file. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0. 
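The select! loop above interleaves two things: receiving the next record from the partition queue and draining acknowledgements for records that were already ingested. Offsets are only stored for the record at the head of the inflight VecDeque, once its RecordCommit resolves, so a later record can never advance the stored offset past an unacknowledged earlier one. The sketch below models just that head-of-queue ordering, with tokio oneshot channels standing in for RecordCommit; it assumes tokio with the rt, sync and macros features and does not model the IngestionClient itself.

use std::collections::VecDeque;
use tokio::sync::oneshot;

#[tokio::main]
async fn main() {
    // Each in-flight record keeps its Kafka offset plus a completion signal
    // (standing in for RecordCommit).
    let mut inflight: VecDeque<(i64, oneshot::Receiver<()>)> = VecDeque::new();

    for offset in 0..3i64 {
        let (tx, rx) = oneshot::channel();
        inflight.push_back((offset, rx));
        // In the real task the acknowledgement arrives asynchronously from
        // the partition processor; here it is sent straight away.
        tokio::spawn(async move {
            let _ = tx.send(());
        });
    }

    // Offsets are stored strictly in order, head of the queue first.
    while let Some((offset, ack)) = inflight.pop_front() {
        ack.await.expect("record acknowledged");
        println!("store offset {offset}");
    }
}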
- -use anyhow::bail; -use bytes::Bytes; -use opentelemetry::propagation::{Extractor, TextMapPropagator}; -use opentelemetry::trace::{Span, SpanContext, TraceContextExt}; -use opentelemetry_sdk::propagation::TraceContextPropagator; -use std::borrow::Borrow; -use std::sync::Arc; -use tracing::debug; - -use restate_bifrost::Bifrost; -use restate_storage_api::deduplication_table::DedupInformation; -use restate_types::identifiers::{InvocationId, PartitionKey, WithPartitionKey, partitioner}; -use restate_types::invocation::{InvocationTarget, ServiceInvocation, SpanRelation}; -use restate_types::live; -use restate_types::message::MessageIndex; -use restate_types::partition_table::PartitionTableError; -use restate_types::schema::Schema; -use restate_types::schema::invocation_target::{DeploymentStatus, InvocationTargetResolver}; -use restate_types::schema::subscriptions::{EventInvocationTargetTemplate, Sink, Subscription}; -use restate_wal_protocol::{Command, Destination, Envelope, Header, Source}; - -use crate::consumer_task::KafkaDeduplicationId; - -#[derive(Debug)] -pub struct KafkaIngressEvent { - service_invocation: Box, - deduplication_id: KafkaDeduplicationId, - deduplication_index: MessageIndex, - proxying_partition_key: Option, -} - -impl KafkaIngressEvent { - #[allow(clippy::too_many_arguments)] - pub fn new( - subscription: &Subscription, - schema: live::Pinned, - key: Bytes, - payload: Bytes, - deduplication_id: KafkaDeduplicationId, - deduplication_index: MessageIndex, - headers: Vec, - consumer_group_id: &str, - topic: &str, - partition: i32, - offset: i64, - ) -> Result { - // Check if we need to proxy or not - let proxying_partition_key = if KafkaDeduplicationId::requires_proxying(subscription) { - Some(partitioner::HashPartitioner::compute_partition_key( - &deduplication_id, - )) - } else { - None - }; - - let invocation_target = match subscription.sink() { - Sink::Invocation { - event_invocation_target_template, - } => match event_invocation_target_template { - EventInvocationTargetTemplate::Service { name, handler } => { - InvocationTarget::service(name.clone(), handler.clone()) - } - EventInvocationTargetTemplate::VirtualObject { - name, - handler, - handler_ty, - } => InvocationTarget::virtual_object( - name.clone(), - std::str::from_utf8(&key) - .map_err(|e| { - anyhow::anyhow!("The Kafka record key must be valid UTF-8: {e}") - })? - .to_owned(), - handler.clone(), - *handler_ty, - ), - EventInvocationTargetTemplate::Workflow { - name, - handler, - handler_ty, - } => InvocationTarget::workflow( - name.clone(), - std::str::from_utf8(&key) - .map_err(|e| { - anyhow::anyhow!("The Kafka record key must be valid UTF-8: {e}") - })? 
- .to_owned(), - handler.clone(), - *handler_ty, - ), - }, - }; - - // Compute the retention values - let target = schema - .resolve_latest_invocation_target( - invocation_target.service_name(), - invocation_target.handler_name(), - ) - .ok_or_else(|| anyhow::anyhow!("Service and handler are not registered"))?; - - if let DeploymentStatus::Deprecated(dp_id) = target.deployment_status { - bail!( - "the service {} is exposed by the deprecated deployment {dp_id}, please upgrade the SDK.", - invocation_target.service_name() - ) - } - - let invocation_retention = target.compute_retention(false); - - // Time to generate invocation id - let invocation_id = InvocationId::generate(&invocation_target, None); - - // Figure out tracing span - let ingress_span_context = prepare_tracing_span( - &invocation_id, - &invocation_target, - &headers, - consumer_group_id, - topic, - partition as i64, - offset, - ); - - // Finally generate service invocation - let mut service_invocation = Box::new(ServiceInvocation::initialize( - invocation_id, - invocation_target, - restate_types::invocation::Source::Subscription(subscription.id()), - )); - service_invocation.with_related_span(SpanRelation::parent(ingress_span_context)); - service_invocation.argument = payload; - service_invocation.headers = headers; - service_invocation.with_retention(invocation_retention); - - Ok(KafkaIngressEvent { - service_invocation, - deduplication_id, - deduplication_index, - proxying_partition_key, - }) - } -} - -#[derive(Debug, thiserror::Error)] -pub enum IngressDispatchError { - #[error("bifrost error: {0}")] - WalProtocol(#[from] restate_bifrost::AppendError), - #[error("partition routing error: {0}")] - PartitionRoutingError(#[from] PartitionTableError), -} - -/// Dispatches a request from kafka ingress to bifrost -pub trait DispatchKafkaEvent { - fn dispatch_kafka_event( - &self, - event: KafkaIngressEvent, - ) -> impl std::future::Future> + Send; -} - -#[derive(Clone)] -pub(crate) struct KafkaIngressDispatcher { - bifrost: Bifrost, -} - -impl KafkaIngressDispatcher { - pub(crate) fn new(bifrost: Bifrost) -> Self { - Self { bifrost } - } -} - -impl DispatchKafkaEvent for KafkaIngressDispatcher { - async fn dispatch_kafka_event( - &self, - ingress_request: KafkaIngressEvent, - ) -> Result<(), IngressDispatchError> { - let KafkaIngressEvent { - service_invocation: inner, - deduplication_id, - deduplication_index, - proxying_partition_key, - } = ingress_request; - - let partition_key = proxying_partition_key.unwrap_or_else(|| inner.partition_key()); - - let envelope = wrap_service_invocation_in_envelope( - partition_key, - inner, - deduplication_id.to_string(), - deduplication_index, - ); - let (log_id, lsn) = - restate_bifrost::append_to_bifrost(&self.bifrost, Arc::new(envelope)).await?; - - debug!( - log_id = %log_id, - lsn = %lsn, - "Ingress request written to bifrost" - ); - Ok(()) - } -} - -fn wrap_service_invocation_in_envelope( - partition_key: PartitionKey, - service_invocation: Box, - deduplication_source: String, - deduplication_index: MessageIndex, -) -> Envelope { - let header = Header { - source: Source::Ingress {}, - dest: Destination::Processor { - partition_key, - dedup: Some(DedupInformation::ingress( - deduplication_source, - deduplication_index, - )), - }, - }; - - Envelope::new(header, Command::ProxyThrough(service_invocation)) -} - -#[allow(clippy::too_many_arguments)] -pub(crate) fn prepare_tracing_span( - invocation_id: &InvocationId, - invocation_target: &InvocationTarget, - headers: 
&[restate_types::invocation::Header], - consumer_group_name: &str, - topic: &str, - partition: i64, - offset: i64, -) -> SpanContext { - let tracing_context = TraceContextPropagator::new().extract(&HeaderExtractor(headers)); - let inbound_span = tracing_context.span(); - - let relation = if inbound_span.span_context().is_valid() { - SpanRelation::parent(inbound_span.span_context()) - } else { - SpanRelation::None - }; - - let span = restate_tracing_instrumentation::info_invocation_span!( - relation = relation, - prefix = "ingress_kafka", - id = invocation_id, - target = invocation_target, - tags = ( - messaging.system = "kafka", - messaging.consumer.group.name = consumer_group_name.to_owned(), - messaging.operation.type = "process", - messaging.kafka.offset = offset, - messaging.source.partition.id = partition, - messaging.source.name = topic.to_owned() - ) - ); - - span.span_context().clone() -} - -struct HeaderExtractor<'a>(pub &'a [restate_types::invocation::Header]); - -impl Extractor for HeaderExtractor<'_> { - fn get(&self, key: &str) -> Option<&str> { - self.0 - .iter() - .find(|h| h.name.eq_ignore_ascii_case(key)) - .map(|value| value.value.borrow()) - } - - fn keys(&self) -> Vec<&str> { - self.0.iter().map(|h| h.name.borrow()).collect::>() - } -} diff --git a/crates/ingress-kafka/src/lib.rs b/crates/ingress-kafka/src/lib.rs index 9709e51bd0..02446b20bb 100644 --- a/crates/ingress-kafka/src/lib.rs +++ b/crates/ingress-kafka/src/lib.rs @@ -8,14 +8,40 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. +mod builder; mod consumer_task; -mod dispatcher; mod metric_definitions; mod subscription_controller; +use rdkafka::error::KafkaError; +use restate_types::partitions::PartitionTableError; use tokio::sync::mpsc; -pub use subscription_controller::{Command, Error, Service}; +pub use subscription_controller::{Command, Service}; pub type SubscriptionCommandSender = mpsc::Sender; pub type SubscriptionCommandReceiver = mpsc::Receiver; + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error(transparent)] + Kafka(#[from] KafkaError), + #[error( + "error processing message topic {topic} partition {partition} offset {offset}: {cause}" + )] + Event { + topic: String, + partition: i32, + offset: i64, + #[source] + cause: anyhow::Error, + }, + #[error("ingress stream is closed")] + IngressClosed, + #[error(transparent)] + PartitionTableError(PartitionTableError), + #[error( + "received a message on the main partition queue for topic {0} partition {1} despite partitioned queues" + )] + UnexpectedMainQueueMessage(String, i32), +} diff --git a/crates/ingress-kafka/src/subscription_controller.rs b/crates/ingress-kafka/src/subscription_controller.rs index 0b6a3023b9..aac976dd05 100644 --- a/crates/ingress-kafka/src/subscription_controller.rs +++ b/crates/ingress-kafka/src/subscription_controller.rs @@ -8,25 +8,27 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
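The string-based KafkaDeduplicationId ("{group}-{topic}-{partition}") that the deleted dispatcher relied on is replaced by dedup_producer_id in consumer_task.rs: a stable 128-bit xxh3 hash over "consumer_group:topic:partition", paired with the record offset as the dedup sequence number via DedupInformation::producer. The example below recomputes it the same way; it assumes the xxhash-rust crate with the xxh3 and std features (which this change adds to ingress-kafka's dependencies), and the group/topic names are made up.

use std::io::Write;

// Same scheme as dedup_producer_id in consumer_task.rs: a stable 128-bit
// xxh3 hash over "consumer_group:topic:partition".
fn producer_id(consumer_group: &str, topic: &str, partition: i32) -> u128 {
    let mut hasher = xxhash_rust::xxh3::Xxh3::new();
    write!(hasher, "{consumer_group}:{topic}:{partition}").unwrap();
    hasher.digest128()
}

fn main() {
    let a = producer_id("billing", "orders", 0);
    let b = producer_id("billing", "orders", 0);
    let c = producer_id("billing", "orders", 1);
    // Deterministic per (group, topic, partition): recomputing it after a
    // restart or rebalance yields the same producer id, so previously seen
    // offsets keep being deduplicated; a different partition gets its own id.
    assert_eq!(a, b);
    assert_ne!(a, c);
}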
-use super::consumer_task::MessageSender; -use super::*; use std::collections::HashSet; +use std::time::Duration; -use crate::dispatcher::KafkaIngressDispatcher; -use crate::subscription_controller::task_orchestrator::TaskOrchestrator; use anyhow::Context; -use rdkafka::error::KafkaError; -use restate_bifrost::Bifrost; +use restate_wal_protocol::Envelope; +use tokio::sync::mpsc; +use tracing::warn; + use restate_core::cancellation_watcher; +use restate_core::network::TransportConnect; +use restate_ingestion_client::IngestionClient; use restate_types::config::IngressOptions; use restate_types::identifiers::SubscriptionId; use restate_types::live::{Live, LiveLoad}; use restate_types::retries::RetryPolicy; use restate_types::schema::Schema; use restate_types::schema::subscriptions::{Source, Subscription}; -use std::time::Duration; -use tokio::sync::mpsc; -use tracing::warn; + +use super::*; +use crate::builder::EnvelopeBuilder; +use crate::subscription_controller::task_orchestrator::TaskOrchestrator; #[derive(Debug)] pub enum Command { @@ -35,29 +37,26 @@ pub enum Command { UpdateSubscriptions(Vec), } -#[derive(Debug, thiserror::Error)] -pub enum Error { - #[error(transparent)] - Kafka(#[from] KafkaError), -} - // For simplicity of the current implementation, this currently lives in this module // In future versions, we should either pull this out in a separate process, or generify it and move it to the worker, or an ad-hoc module -pub struct Service { - dispatcher: KafkaIngressDispatcher, +pub struct Service { + ingestion: IngestionClient, schema: Live, commands_tx: SubscriptionCommandSender, commands_rx: SubscriptionCommandReceiver, } -impl Service { - pub fn new(bifrost: Bifrost, schema: Live) -> Service { +impl Service +where + T: TransportConnect, +{ + pub fn new(ingestion: IngestionClient, schema: Live) -> Self { metric_definitions::describe_metrics(); let (commands_tx, commands_rx) = mpsc::channel(10); Service { - dispatcher: KafkaIngressDispatcher::new(bifrost), + ingestion, schema, commands_tx, commands_rx, @@ -117,7 +116,7 @@ impl Service { &mut self, options: &IngressOptions, subscription: Subscription, - task_orchestrator: &mut TaskOrchestrator, + task_orchestrator: &mut TaskOrchestrator, ) -> anyhow::Result<()> { let mut client_config = rdkafka::ClientConfig::new(); // enabling probing for the ca certificates if the user does not specify anything else @@ -149,7 +148,8 @@ impl Service { let consumer_task = consumer_task::ConsumerTask::new( client_config, vec![topic.to_string()], - MessageSender::new(subscription, self.dispatcher.clone(), self.schema.clone()), + self.ingestion.clone(), + EnvelopeBuilder::new(subscription, self.schema.clone()), ); task_orchestrator.start(subscription_id, consumer_task); @@ -160,7 +160,7 @@ impl Service { fn handle_stop_subscription( &mut self, subscription_id: SubscriptionId, - task_orchestrator: &mut TaskOrchestrator, + task_orchestrator: &mut TaskOrchestrator, ) { task_orchestrator.stop(subscription_id); } @@ -169,7 +169,7 @@ impl Service { &mut self, options: &IngressOptions, subscriptions: Vec, - task_orchestrator: &mut TaskOrchestrator, + task_orchestrator: &mut TaskOrchestrator, ) -> anyhow::Result<()> { let mut running_subscriptions: HashSet<_> = task_orchestrator.running_subscriptions().cloned().collect(); @@ -191,6 +191,7 @@ impl Service { mod task_orchestrator { use crate::consumer_task; + use restate_core::network::TransportConnect; use restate_core::{TaskCenterFutureExt, TaskKind}; use restate_timer_queue::TimerQueue; use 
restate_types::identifiers::SubscriptionId; @@ -202,9 +203,9 @@ mod task_orchestrator { use tokio::task::{JoinError, JoinSet}; use tracing::{debug, warn}; - struct TaskState { + struct TaskState { // We use this to restart the consumer task in case of a failure - consumer_task_clone: consumer_task::ConsumerTask, + consumer_task_clone: consumer_task::ConsumerTask, task_state_inner: TaskStateInner, retry_iter: RetryIter<'static>, } @@ -217,15 +218,18 @@ mod task_orchestrator { WaitingRetryTimer, } - pub(super) struct TaskOrchestrator { + pub(super) struct TaskOrchestrator { retry_policy: RetryPolicy, running_tasks_to_subscriptions: HashMap, - subscription_id_to_task_state: HashMap, - tasks: JoinSet>, + subscription_id_to_task_state: HashMap>, + tasks: JoinSet>, timer_queue: TimerQueue, } - impl TaskOrchestrator { + impl TaskOrchestrator + where + T: TransportConnect, + { pub(super) fn new(retry_policy: RetryPolicy) -> Self { Self { retry_policy, @@ -253,7 +257,7 @@ mod task_orchestrator { fn handle_task_closed( &mut self, - result: Result<(task::Id, Result<(), consumer_task::Error>), JoinError>, + result: Result<(task::Id, Result<(), crate::Error>), JoinError>, ) { let task_id = match result { Ok((id, _)) => id, @@ -341,7 +345,7 @@ mod task_orchestrator { pub(super) fn start( &mut self, subscription_id: SubscriptionId, - consumer_task_clone: consumer_task::ConsumerTask, + consumer_task_clone: consumer_task::ConsumerTask, ) { // Shutdown old task, if any if let Some(task_state) = self.subscription_id_to_task_state.remove(&subscription_id) { diff --git a/crates/node/Cargo.toml b/crates/node/Cargo.toml index 3803db992b..ad745fefe7 100644 --- a/crates/node/Cargo.toml +++ b/crates/node/Cargo.toml @@ -31,6 +31,7 @@ restate-admin = { workspace = true, features = ["storage-query"]} restate-bifrost = { workspace = true, features = ["local-loglet", "replicated-loglet"] } restate-core = { workspace = true } restate-futures-util = { workspace = true } +restate-ingestion-client = { workspace = true } restate-ingress-http = { workspace = true } restate-log-server = { workspace = true } restate-metadata-providers = { workspace = true } @@ -44,6 +45,7 @@ restate-service-protocol = { workspace = true, features = ["discovery"] } restate-storage-query-datafusion = { workspace = true } restate-tracing-instrumentation = { workspace = true, features = ["prometheus"] } restate-types = { workspace = true, features = ["clap"] } +restate-wal-protocol = { workspace = true } restate-worker = { workspace = true } ahash = { workspace = true } diff --git a/crates/node/src/lib.rs b/crates/node/src/lib.rs index 05d8924bbb..49f81a4848 100644 --- a/crates/node/src/lib.rs +++ b/crates/node/src/lib.rs @@ -15,6 +15,7 @@ mod metric_definitions; mod network_server; mod roles; +use std::num::NonZeroUsize; use std::time::Duration; use anyhow::Context; @@ -31,6 +32,7 @@ use restate_core::partitions::PartitionRouting; use restate_core::{Metadata, MetadataKind, MetadataWriter, TaskKind}; use restate_core::{MetadataBuilder, MetadataManager, TaskCenter, spawn_metadata_manager}; use restate_futures_util::overdue::OverdueLoggingExt; +use restate_ingestion_client::IngestionClient; use restate_log_server::LogServerService; use restate_metadata_server::{ BoxedMetadataServer, MetadataServer, MetadataStoreClient, ReadModifyWriteError, @@ -133,7 +135,7 @@ pub struct Node { metadata_server_role: Option, failure_detector: FailureDetector>, admin_role: Option>, - worker_role: Option, + worker_role: Option>, ingress_role: Option>, log_server: 
Option, networking: Networking, @@ -256,6 +258,17 @@ impl Node { None }; + // initialize the ingestion client. Limit the size of unconfirmed inflight + // records by setting the memory_budget to 1M. A small value that maximizes + // throughput without exhausting the memory + let ingestion_client = IngestionClient::new( + networking.clone(), + Metadata::with_current(|m| m.updateable_partition_table()), + PartitionRouting::new(replica_set_states.clone(), TaskCenter::current()), + NonZeroUsize::new(1024 * 1024).unwrap(), //1MB + None, + ); + let worker_role = if config.has_role(Role::Worker) { Some( WorkerRole::create( @@ -265,6 +278,7 @@ impl Node { partition_store_manager.clone(), networking.clone(), bifrost_svc.handle(), + ingestion_client.clone(), metadata_manager.writer(), ) .await?, @@ -295,6 +309,7 @@ impl Node { AdminRole::create( tc.health().admin_status(), bifrost.clone(), + ingestion_client, updateable_config.clone(), PartitionRouting::new(replica_set_states.clone(), tc), metadata.updateable_partition_table(), diff --git a/crates/node/src/roles/admin.rs b/crates/node/src/roles/admin.rs index 07e47f29b9..a88171441d 100644 --- a/crates/node/src/roles/admin.rs +++ b/crates/node/src/roles/admin.rs @@ -23,6 +23,7 @@ use restate_core::network::TransportConnect; use restate_core::partitions::PartitionRouting; use restate_core::worker_api::PartitionProcessorInvocationClient; use restate_core::{Metadata, MetadataWriter, TaskCenter, TaskKind}; +use restate_ingestion_client::IngestionClient; use restate_partition_store::PartitionStoreManager; use restate_service_client::{AssumeRoleCacheMode, HttpClient, ServiceClient}; use restate_service_protocol::discovery::ServiceDiscovery; @@ -42,6 +43,7 @@ use restate_types::partition_table::PartitionTable; use restate_types::partitions::state::PartitionReplicaSetStates; use restate_types::protobuf::common::AdminStatus; use restate_types::retries::RetryPolicy; +use restate_wal_protocol::Envelope; #[derive(Debug, thiserror::Error, CodedError)] pub enum AdminRoleBuildError { @@ -67,6 +69,7 @@ pub struct AdminRole { ServiceDiscovery, TelemetryClient, PartitionProcessorInvocationClient, + T, >, storage_accounting_task: Option, } @@ -76,6 +79,7 @@ impl AdminRole { pub async fn create( health_status: HealthStatus, bifrost: Bifrost, + ingestion_client: IngestionClient, updateable_config: Live, partition_routing: PartitionRouting, partition_table: Live, @@ -127,7 +131,7 @@ impl AdminRole { let admin = AdminService::new( listeners, metadata_writer.clone(), - bifrost.clone(), + ingestion_client, PartitionProcessorInvocationClient::new( networking.clone(), partition_table, diff --git a/crates/node/src/roles/worker.rs b/crates/node/src/roles/worker.rs index 44deabb9ba..a7bdaf2712 100644 --- a/crates/node/src/roles/worker.rs +++ b/crates/node/src/roles/worker.rs @@ -19,11 +19,13 @@ use restate_core::network::Networking; use restate_core::network::TransportConnect; use restate_core::worker_api::ProcessorsManagerHandle; use restate_core::{MetadataWriter, TaskCenter}; +use restate_ingestion_client::IngestionClient; use restate_partition_store::PartitionStoreManager; use restate_storage_query_datafusion::context::QueryContext; use restate_types::health::HealthStatus; use restate_types::partitions::state::PartitionReplicaSetStates; use restate_types::protobuf::common::WorkerStatus; +use restate_wal_protocol::Envelope; use restate_worker::Worker; #[derive(Debug, thiserror::Error, CodedError)] @@ -36,18 +38,23 @@ pub enum WorkerRoleBuildError { ), } -pub struct WorkerRole { 
- worker: Worker, +pub struct WorkerRole { + worker: Worker, } -impl WorkerRole { - pub async fn create( +impl WorkerRole +where + T: TransportConnect, +{ + #[allow(clippy::too_many_arguments)] + pub async fn create( health_status: HealthStatus, replica_set_states: PartitionReplicaSetStates, router_builder: &mut MessageRouterBuilder, partition_store_manager: Arc, networking: Networking, bifrost: Bifrost, + ingestion_client: IngestionClient, metadata_writer: MetadataWriter, ) -> Result { let worker = Worker::create( @@ -56,6 +63,7 @@ impl WorkerRole { partition_store_manager, networking, bifrost, + ingestion_client, router_builder, metadata_writer, ) diff --git a/crates/partition-store/src/keys.rs b/crates/partition-store/src/keys.rs index b8dc587fdd..7012df9022 100644 --- a/crates/partition-store/src/keys.rs +++ b/crates/partition-store/src/keys.rs @@ -483,6 +483,21 @@ impl KeyCodec for PaddedPartitionId { } } +impl KeyCodec for u128 { + fn encode(&self, target: &mut B) { + // store u64 in big-endian order to support byte-wise increment operation. See `crate::scan::try_increment`. + target.put_u128(*self); + } + + fn decode(source: &mut B) -> crate::Result { + Ok(source.get_u128()) + } + + fn serialized_length(&self) -> usize { + 16 + } +} + impl KeyCodec for UniqueTimestamp { fn encode(&self, target: &mut B) { // store u64 in big-endian order to support byte-wise increment operation. See `crate::scan::try_increment`. @@ -636,6 +651,10 @@ impl KeyCodec for ProducerId { target.put_u8(1); KeyCodec::encode(i, target) } + ProducerId::Producer(i) => { + target.put_u8(2); + KeyCodec::encode(&u128::from(*i), target) + } } } @@ -659,6 +678,7 @@ impl KeyCodec for ProducerId { 1 + match self { ProducerId::Partition(p) => KeyCodec::serialized_length(&PaddedPartitionId::from(*p)), ProducerId::Other(i) => KeyCodec::serialized_length(i), + ProducerId::Producer(i) => KeyCodec::serialized_length(&u128::from(*i)), } } } diff --git a/crates/storage-api/src/deduplication_table/mod.rs b/crates/storage-api/src/deduplication_table/mod.rs index a8bc3479a7..81839318aa 100644 --- a/crates/storage-api/src/deduplication_table/mod.rs +++ b/crates/storage-api/src/deduplication_table/mod.rs @@ -14,6 +14,7 @@ use bytestring::ByteString; use restate_types::identifiers::{LeaderEpoch, PartitionId}; use restate_types::message::MessageIndex; +use serde::{Deserialize, Serialize}; use crate::Result; use crate::protobuf_types::PartitionStoreProtobufValue; @@ -45,6 +46,13 @@ impl DedupInformation { sequence_number: DedupSequenceNumber::Sn(sequence_number), } } + + pub fn producer(producer_id: u128, message_index: MessageIndex) -> Self { + DedupInformation { + producer_id: ProducerId::Producer(producer_id.into()), + sequence_number: DedupSequenceNumber::Sn(message_index), + } + } } static SELF_PRODUCER: ByteString = ByteString::from_static("SELF"); @@ -52,6 +60,7 @@ static SELF_PRODUCER: ByteString = ByteString::from_static("SELF"); #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum ProducerId { Partition(PartitionId), + Producer(U128), Other(ByteString), } @@ -140,3 +149,40 @@ pub trait WriteDeduplicationTable { dedup_sequence_number: &DedupSequenceNumber, ) -> Result<()>; } + +// Flexbuffers does not support u128 so we need to +// make this representation for it. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct U128 { + h: u64, + l: u64, +} + +impl From for U128 { + fn from(value: u128) -> Self { + Self { + h: (value >> 64) as u64, + l: value as u64, + } + } +} + +impl From for u128 { + fn from(value: U128) -> Self { + let v = (value.h as u128) << 64; + v | (value.l as u128) + } +} + +#[cfg(test)] +mod test { + use crate::deduplication_table::U128; + + #[test] + fn test_u128() { + let x = u128::MAX; + let y = U128::from(x); + let z = u128::from(y); + assert_eq!(x, z); + } +} diff --git a/crates/types/src/identifiers.rs b/crates/types/src/identifiers.rs index e4e850679f..93af3c2a82 100644 --- a/crates/types/src/identifiers.rs +++ b/crates/types/src/identifiers.rs @@ -438,7 +438,24 @@ pub trait WithInvocationId { pub type EncodedInvocationId = [u8; InvocationId::RAW_BYTES_LEN]; impl InvocationId { + /// Creates a new [`InvocationId`], using a deterministic partition key from the invocation + /// target/idempotency key when available; otherwise a random partition key is used. pub fn generate(invocation_target: &InvocationTarget, idempotency_key: Option<&str>) -> Self { + Self::generate_or_else(invocation_target, idempotency_key, || { + rand::rng().next_u64() + }) + } + + /// Creates a new [`InvocationId`]; prefers a deterministic partition key from the invocation + /// target/idempotency key, otherwise uses the provided fallback function to produce one. + pub fn generate_or_else( + invocation_target: &InvocationTarget, + idempotency_key: Option<&str>, + f: F, + ) -> Self + where + F: FnOnce() -> PartitionKey, + { // --- Partition key generation let partition_key = // Either try to generate the deterministic partition key, if possible @@ -447,7 +464,7 @@ impl InvocationId { idempotency_key, ) // If no deterministic partition key can be generated, just pick a random number - .unwrap_or_else(|| rand::rng().next_u64()); + .unwrap_or_else(f); // --- Invocation UUID generation InvocationId::from_parts( diff --git a/crates/types/src/net/ingest.rs b/crates/types/src/net/ingest.rs new file mode 100644 index 0000000000..994a265f4f --- /dev/null +++ b/crates/types/src/net/ingest.rs @@ -0,0 +1,112 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
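The new impl KeyCodec for u128 in keys.rs stores the value big-endian for the same reason the surrounding key codecs do: big-endian bytes sort the same way as the numbers they encode, so byte-wise comparison and the byte-wise increment used for prefix scans (see crate::scan::try_increment) stay correct. A quick self-contained check of that ordering property:

fn main() {
    let a: u128 = 0x00ff;
    let b: u128 = 0x0100;
    assert!(a < b);
    // Big-endian bytes compare (lexicographically) in the same order as the
    // numbers themselves...
    assert!(a.to_be_bytes() < b.to_be_bytes());
    // ...while little-endian bytes do not, which would break byte-ordered
    // key scans.
    assert!(a.to_le_bytes() > b.to_le_bytes());
}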
+ +use std::sync::Arc; + +use bytes::{Bytes, BytesMut}; +use metrics::Key; + +use crate::identifiers::PartitionId; +use crate::logs::{HasRecordKeys, Keys}; +use crate::net::partition_processor::PartitionLeaderService; +use crate::net::{RpcRequest, bilrost_wire_codec, default_wire_codec, define_rpc}; +use crate::storage::{StorageCodec, StorageEncode}; + +#[derive(Debug, Eq, PartialEq, Clone, serde::Serialize, serde::Deserialize)] +pub struct IngestRecord { + pub keys: Keys, + pub record: Bytes, +} + +impl IngestRecord { + pub fn estimate_size(&self) -> usize { + size_of::() + self.record.len() + } + + pub fn from_parts(keys: Keys, record: T) -> Self + where + T: StorageEncode, + { + let mut buf = BytesMut::new(); + StorageCodec::encode(&record, &mut buf).expect("encode to pass"); + + Self { + keys, + record: buf.freeze(), + } + } +} + +impl HasRecordKeys for IngestRecord { + fn record_keys(&self) -> Keys { + self.keys.clone() + } +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct IngestRequest { + pub records: Arc<[IngestRecord]>, +} + +impl IngestRequest { + pub fn estimate_size(&self) -> usize { + self.records + .iter() + .fold(0, |size, item| size + item.estimate_size()) + } +} + +impl From> for IngestRequest { + fn from(records: Arc<[IngestRecord]>) -> Self { + Self { records } + } +} + +// todo(azmy): Use bilrost (depends on the payload) +default_wire_codec!(IngestRequest); + +#[derive(Debug, bilrost::Oneof, bilrost::Message)] +pub enum IngestResponse { + Unknown, + #[bilrost(tag = 1, message)] + Ack, + #[bilrost(tag = 2, message)] + NotLeader { + of: PartitionId, + }, + #[bilrost(tag = 3, message)] + Internal { + msg: String, + }, +} + +bilrost_wire_codec!(IngestResponse); + +define_rpc! { + @request=IngestRequest, + @response=IngestResponse, + @service=PartitionLeaderService, +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct ReceivedIngestRequest { + pub records: Vec, +} + +default_wire_codec!(ReceivedIngestRequest); + +/// The [`ReceivedIngestRequest`] uses the same TYPE +/// as [`IngestRequest`] to be able to directly decode +/// received RPC message to this type. 
+impl RpcRequest for ReceivedIngestRequest { + const TYPE: &str = stringify!(IngestRequest); + type Response = IngestResponse; + type Service = PartitionLeaderService; +} diff --git a/crates/types/src/net/mod.rs b/crates/types/src/net/mod.rs index 463a0e5646..e861254fa0 100644 --- a/crates/types/src/net/mod.rs +++ b/crates/types/src/net/mod.rs @@ -11,6 +11,7 @@ pub mod address; pub mod codec; pub mod connect_opts; +pub mod ingest; pub mod listener; pub mod log_server; pub mod metadata; diff --git a/crates/types/src/net/partition_processor.rs b/crates/types/src/net/partition_processor.rs index 8ebf4569c1..e555ea9c36 100644 --- a/crates/types/src/net/partition_processor.rs +++ b/crates/types/src/net/partition_processor.rs @@ -142,10 +142,6 @@ pub enum PartitionProcessorRpcError { //Busy, #[error("internal error: {0}")] Internal(String), - #[error("partition processor starting")] - Starting, - #[error("partition processor stopping")] - Stopping, } impl PartitionProcessorRpcError { @@ -153,9 +149,7 @@ impl PartitionProcessorRpcError { match self { PartitionProcessorRpcError::NotLeader(_) => true, PartitionProcessorRpcError::LostLeadership(_) => true, - PartitionProcessorRpcError::Stopping => true, PartitionProcessorRpcError::Internal(_) => false, - PartitionProcessorRpcError::Starting => false, } } } diff --git a/crates/worker/Cargo.toml b/crates/worker/Cargo.toml index b5853cc799..11c8c1515d 100644 --- a/crates/worker/Cargo.toml +++ b/crates/worker/Cargo.toml @@ -27,6 +27,7 @@ restate-core = { workspace = true } restate-errors = { workspace = true } restate-ingress-http = { workspace = true } restate-ingress-kafka = { workspace = true } +restate-ingestion-client = { workspace = true } restate-invoker-api = { workspace = true } restate-invoker-impl = { workspace = true } restate-metadata-server = { workspace = true } diff --git a/crates/worker/src/lib.rs b/crates/worker/src/lib.rs index 60b75c0ece..60fd096342 100644 --- a/crates/worker/src/lib.rs +++ b/crates/worker/src/lib.rs @@ -19,9 +19,13 @@ mod partition_processor_manager; mod subscription_controller; mod subscription_integration; +use std::num::NonZeroUsize; use std::sync::Arc; use codederror::CodedError; +use restate_core::network::Swimlane; +use restate_ingestion_client::SessionOptions; +use restate_wal_protocol::Envelope; use tracing::info; use restate_bifrost::Bifrost; @@ -34,6 +38,7 @@ use restate_core::partitions::PartitionRouting; use restate_core::worker_api::ProcessorsManagerHandle; use restate_core::{Metadata, TaskKind}; use restate_core::{MetadataWriter, TaskCenter}; +use restate_ingestion_client::IngestionClient; use restate_ingress_kafka::Service as IngressKafkaService; use restate_invoker_impl::InvokerHandle as InvokerChannelServiceHandle; use restate_partition_store::snapshots::SnapshotRepository; @@ -91,21 +96,26 @@ pub enum BuildError { SnapshotRepository(#[from] anyhow::Error), } -pub struct Worker { +pub struct Worker { storage_query_context: QueryContext, datafusion_remote_scanner: RemoteQueryScannerServer, - ingress_kafka: IngressKafkaService, + ingress_kafka: IngressKafkaService, subscription_controller_handle: SubscriptionControllerHandle, - partition_processor_manager: PartitionProcessorManager, + partition_processor_manager: PartitionProcessorManager, } -impl Worker { - pub async fn create( +impl Worker +where + T: TransportConnect, +{ + #[allow(clippy::too_many_arguments)] + pub async fn create( health_status: HealthStatus, replica_set_states: PartitionReplicaSetStates, partition_store_manager: Arc, networking: 
Networking, bifrost: Bifrost, + ingestion_client: IngestionClient, router_builder: &mut MessageRouterBuilder, metadata_writer: MetadataWriter, ) -> Result { @@ -121,7 +131,7 @@ impl Worker { let schema = metadata.updateable_schema(); // ingress_kafka - let ingress_kafka = IngressKafkaService::new(bifrost.clone(), schema.clone()); + let ingress_kafka = IngressKafkaService::new(ingestion_client.clone(), schema.clone()); let subscription_controller_handle = SubscriptionControllerHandle::new(ingress_kafka.create_command_sender()); @@ -135,6 +145,18 @@ impl Worker { ))); } + // A dedicated ingestion client for PPM that uses + // BifrostData swimlane + let ppm_ingestion_client = IngestionClient::new( + networking.clone(), + Metadata::with_current(|m| m.updateable_partition_table()), + partition_routing.clone(), + NonZeroUsize::new(1024 * 1024).unwrap(), // 1MB + Some(SessionOptions { + swimlane: Swimlane::BifrostData, + ..Default::default() + }), + ); let partition_processor_manager = PartitionProcessorManager::new( health_status, Configuration::live(), @@ -149,6 +171,7 @@ impl Worker { ) .await .map_err(BuildError::SnapshotRepository)?, + ppm_ingestion_client, ); let remote_scanner_manager = RemoteScannerManager::new( diff --git a/crates/worker/src/metric_definitions.rs b/crates/worker/src/metric_definitions.rs index 4330554f91..5351e1b8e7 100644 --- a/crates/worker/src/metric_definitions.rs +++ b/crates/worker/src/metric_definitions.rs @@ -33,6 +33,9 @@ pub const PARTITION_IS_EFFECTIVE_LEADER: &str = "restate.partition.is_effective_ pub const PARTITION_RECORD_COMMITTED_TO_READ_LATENCY_SECONDS: &str = "restate.partition.record_committed_to_read_latency.seconds"; +pub const PARTITION_INGESTION_REQUEST_LEN: &str = "restate.partition.ingest.request.len"; +pub const PARTITION_INGESTION_REQUEST_SIZE: &str = "restate.partition.ingest.request.size.bytes"; + pub(crate) fn describe_metrics() { describe_gauge!( PARTITION_BLOCKED_FLARE, @@ -97,4 +100,16 @@ pub(crate) fn describe_metrics() { Unit::Count, "Number of records between last applied lsn and the log tail" ); + + describe_histogram!( + PARTITION_INGESTION_REQUEST_LEN, + Unit::Count, + "Number of records in a single ingestion request" + ); + + describe_histogram!( + PARTITION_INGESTION_REQUEST_SIZE, + Unit::Bytes, + "Total size of records in a single ingestion request" + ); } diff --git a/crates/worker/src/partition/cleaner.rs b/crates/worker/src/partition/cleaner.rs index 200ee93345..5c19486717 100644 --- a/crates/worker/src/partition/cleaner.rs +++ b/crates/worker/src/partition/cleaner.rs @@ -9,28 +9,48 @@ // by the Apache License, Version 2.0. 
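Both ingestion clients created in this change (the node-wide one and the PPM-specific one above) cap unconfirmed in-flight records with a 1 MiB memory budget. The internals of that budget are not part of this diff; the sketch below only illustrates the general backpressure idea with a byte-counting semaphore, where a record must acquire permits for its encoded size before being sent and releases them once confirmed. It assumes tokio's sync and macros features; names and sizes are illustrative.

use std::sync::Arc;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    // Permits represent bytes of unconfirmed in-flight records.
    let budget_bytes: usize = 1024 * 1024; // 1 MiB, as configured above
    let budget = Arc::new(Semaphore::new(budget_bytes));

    let encoded_size: u32 = 64 * 1024; // pretend a 64 KiB encoded envelope
    let permit = budget
        .clone()
        .acquire_many_owned(encoded_size)
        .await
        .expect("budget semaphore is never closed here");

    // ... send the record; once it is confirmed, dropping the permit hands
    // the bytes back to the budget.
    drop(permit);
    assert_eq!(budget.available_permits(), budget_bytes);
}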
use std::ops::RangeInclusive; -use std::sync::Arc; use std::time::{Duration, SystemTime}; use anyhow::Context; -use futures::StreamExt; +use futures::{Stream, StreamExt}; +use tokio::sync::mpsc::{self, Sender}; use tokio::time::{Instant, MissedTickBehavior}; +use tokio_stream::wrappers::ReceiverStream; use tracing::{debug, instrument, warn}; -use restate_bifrost::Bifrost; -use restate_core::cancellation_watcher; +use restate_core::{ShutdownError, TaskCenter, TaskHandle, TaskId, TaskKind, cancellation_watcher}; use restate_storage_api::invocation_status_table::{InvocationStatus, ScanInvocationStatusTable}; -use restate_types::identifiers::WithPartitionKey; -use restate_types::identifiers::{LeaderEpoch, PartitionKey}; -use restate_types::invocation::PurgeInvocationRequest; +use restate_types::identifiers::PartitionKey; +use restate_types::identifiers::{InvocationId, PartitionId}; use restate_types::retries::with_jitter; -use restate_wal_protocol::{Command, Destination, Envelope, Header, Source}; + +const CLEANER_EFFECT_QUEUE_SIZE: usize = 10; + +#[derive(Debug, Clone)] +pub enum CleanerEffect { + PurgeInvocation(InvocationId), + PurgeJournal(InvocationId), +} + +pub(super) struct CleanerHandle { + task_id: TaskId, + rx: ReceiverStream, +} + +impl CleanerHandle { + pub fn stop(self) -> Option> { + TaskCenter::cancel_task(self.task_id) + } + + pub fn effects(&mut self) -> impl Stream { + &mut self.rx + } +} pub(super) struct Cleaner { - leader_epoch: LeaderEpoch, + partition_id: PartitionId, partition_key_range: RangeInclusive, storage: Storage, - bifrost: Bifrost, cleanup_interval: Duration, } @@ -39,53 +59,54 @@ where Storage: ScanInvocationStatusTable + Send + Sync + 'static, { pub(super) fn new( - leader_epoch: LeaderEpoch, storage: Storage, - bifrost: Bifrost, + partition_id: PartitionId, partition_key_range: RangeInclusive, cleanup_interval: Duration, ) -> Self { Self { - leader_epoch, + partition_id, partition_key_range, storage, - bifrost, cleanup_interval, } } - #[instrument(skip_all)] - pub(super) async fn run(self) -> anyhow::Result<()> { - let Self { - leader_epoch, - partition_key_range, - storage, - bifrost, - cleanup_interval, - } = self; + pub(super) fn start(self) -> Result { + let (tx, rx) = mpsc::channel(CLEANER_EFFECT_QUEUE_SIZE); + let task_id = TaskCenter::spawn_child(TaskKind::Cleaner, "cleaner", self.run(tx))?; - debug!(?cleanup_interval, "Running cleaner"); + Ok(CleanerHandle { + task_id, + rx: ReceiverStream::new(rx), + }) + } - let bifrost_envelope_source = Source::Processor { - partition_id: None, - partition_key: None, - leader_epoch, - }; + #[instrument(skip_all)] + async fn run(self, tx: Sender) -> anyhow::Result<()> { + debug!( + partition_id=%self.partition_id, + cleanup_interval=?self.cleanup_interval, + "Running cleaner" + ); // the cleaner is currently quite an expensive scan and we don't strictly need to do it on startup, so we will wait // for 20-40% of the interval (so, 12-24 minutes by default) before doing the first one - let initial_wait = with_jitter(cleanup_interval.mul_f32(0.2), 1.0); + let initial_wait = with_jitter(self.cleanup_interval.mul_f32(0.2), 1.0); // the first tick will fire after initial_wait let mut interval = - tokio::time::interval_at(Instant::now() + initial_wait, cleanup_interval); + tokio::time::interval_at(Instant::now() + initial_wait, self.cleanup_interval); interval.set_missed_tick_behavior(MissedTickBehavior::Delay); loop { tokio::select! 
{ _ = interval.tick() => { - if let Err(e) = Self::do_cleanup(&storage, &bifrost, partition_key_range.clone(), &bifrost_envelope_source).await { - warn!("Error when trying to cleanup completed invocations: {e:?}"); + if let Err(e) = self.do_cleanup(&tx).await { + warn!( + partition_id=%self.partition_id, + "Error when trying to cleanup completed invocations: {e:?}" + ); } }, _ = cancellation_watcher() => { @@ -99,15 +120,12 @@ where Ok(()) } - pub(super) async fn do_cleanup( - storage: &Storage, - bifrost: &Bifrost, - partition_key_range: RangeInclusive, - bifrost_envelope_source: &Source, - ) -> anyhow::Result<()> { - debug!("Executing completed invocations cleanup"); + pub(super) async fn do_cleanup(&self, tx: &Sender) -> anyhow::Result<()> { + debug!(partition_id=%self.partition_id, "Starting invocation cleanup"); - let invocations_stream = storage.scan_invocation_statuses(partition_key_range)?; + let invocations_stream = self + .storage + .scan_invocation_statuses(self.partition_key_range.clone())?; tokio::pin!(invocations_stream); while let Some((invocation_id, invocation_status)) = invocations_stream @@ -132,24 +150,9 @@ where .checked_add(completed_invocation.completion_retention_duration) && now >= status_expiration_time { - restate_bifrost::append_to_bifrost( - bifrost, - Arc::new(Envelope { - header: Header { - source: bifrost_envelope_source.clone(), - dest: Destination::Processor { - partition_key: invocation_id.partition_key(), - dedup: None, - }, - }, - command: Command::PurgeInvocation(PurgeInvocationRequest { - invocation_id, - response_sink: None, - }), - }), - ) - .await - .context("Cannot append to bifrost purge invocation")?; + tx.send(CleanerEffect::PurgeInvocation(invocation_id)) + .await + .context("Cannot append to bifrost purge invocation")?; continue; } @@ -165,24 +168,9 @@ where }; if now >= journal_expiration_time { - restate_bifrost::append_to_bifrost( - bifrost, - Arc::new(Envelope { - header: Header { - source: bifrost_envelope_source.clone(), - dest: Destination::Processor { - partition_key: invocation_id.partition_key(), - dedup: None, - }, - }, - command: Command::PurgeJournal(PurgeInvocationRequest { - invocation_id, - response_sink: None, - }), - }), - ) - .await - .context("Cannot append to bifrost purge journal")?; + tx.send(CleanerEffect::PurgeJournal(invocation_id)) + .await + .context("Cannot append to bifrost purge journal")?; continue; } } @@ -198,16 +186,13 @@ mod tests { use futures::{Stream, stream}; use googletest::prelude::*; - use restate_core::{Metadata, TaskCenter, TaskKind, TestCoreEnvBuilder}; use restate_storage_api::StorageError; use restate_storage_api::invocation_status_table::{ CompletedInvocation, InFlightInvocationMetadata, InvocationStatus, InvokedInvocationStatusLite, JournalMetadata, ScanInvocationStatusTable, }; use restate_storage_api::protobuf_types::v1::lazy::InvocationStatusV2Lazy; - use restate_types::Version; use restate_types::identifiers::{InvocationId, InvocationUuid}; - use restate_types::partition_table::{FindPartition, PartitionTable}; use test_log::test; #[allow(dead_code)] @@ -256,15 +241,6 @@ mod tests { // Start paused makes sure the timer is immediately fired #[test(restate_core::test(start_paused = true))] pub async fn cleanup_works() { - let env = TestCoreEnvBuilder::with_incoming_only_connector() - .set_partition_table(PartitionTable::with_equally_sized_partitions( - Version::MIN, - 1, - )) - .build() - .await; - let bifrost = Bifrost::init_in_memory(env.metadata_writer).await; - let expired_invocation = 
InvocationId::from_parts(PartitionKey::MIN, InvocationUuid::mock_random()); let expired_journal = @@ -315,50 +291,26 @@ mod tests { ), ]); - TaskCenter::spawn( - TaskKind::Cleaner, - "cleaner", - Cleaner::new( - LeaderEpoch::INITIAL, - mock_storage, - bifrost.clone(), - RangeInclusive::new(PartitionKey::MIN, PartitionKey::MAX), - Duration::from_secs(1), - ) - .run(), + let mut handle = Cleaner::new( + mock_storage, + 0.into(), + RangeInclusive::new(PartitionKey::MIN, PartitionKey::MAX), + Duration::from_secs(1), ) + .start() .unwrap(); // cleanup will run after around 200ms - tokio::time::sleep(Duration::from_secs(1)).await; + tokio::time::advance(Duration::from_secs(1)).await; - // All the invocation ids were created with same partition keys, hence same partition id. - let partition_id = Metadata::with_current(|m| { - m.partition_table_snapshot() - .find_partition_id(expired_invocation.partition_key()) - }) - .unwrap(); - - let log_entries: Vec<_> = bifrost - .read_all(partition_id.into()) - .await - .unwrap() - .into_iter() - .map(|e| e.try_decode::().unwrap().unwrap().command) - .collect(); + let received: Vec<_> = handle.effects().ready_chunks(10).next().await.unwrap(); assert_that!( - log_entries, + received, all!( len(eq(2)), - contains(pat!(Command::PurgeInvocation(pat!( - PurgeInvocationRequest { - invocation_id: eq(expired_invocation), - } - )))), - contains(pat!(Command::PurgeJournal(pat!(PurgeInvocationRequest { - invocation_id: eq(expired_journal), - })))), + contains(pat!(CleanerEffect::PurgeInvocation(eq(expired_invocation)))), + contains(pat!(CleanerEffect::PurgeJournal(eq(expired_journal)))) ) ); } diff --git a/crates/worker/src/partition/leadership/leader_state.rs b/crates/worker/src/partition/leadership/leader_state.rs index b6ed98a828..0c884ec6c2 100644 --- a/crates/worker/src/partition/leadership/leader_state.rs +++ b/crates/worker/src/partition/leadership/leader_state.rs @@ -8,19 +8,18 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
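// A sketch for illustration (not part of the patch itself): instead of appending purge
// commands to Bifrost, the cleaner now emits CleanerEffect values over a bounded channel and
// the leader drains them via CleanerHandle::effects(). A minimal consumer of that stream,
// assuming it runs next to the cleaner module; the logging below stands in for the leader's
// self-proposals.
async fn drain_cleaner_effects(mut handle: CleanerHandle) {
    use futures::StreamExt;

    let mut effects = std::pin::pin!(handle.effects());
    while let Some(effect) = effects.next().await {
        match effect {
            // The leader maps these onto Command::PurgeInvocation / Command::PurgeJournal
            // and self-proposes them to its own partition.
            CleanerEffect::PurgeInvocation(id) => tracing::debug!(?id, "purge invocation"),
            CleanerEffect::PurgeJournal(id) => tracing::debug!(?id, "purge journal"),
        }
    }
}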
+use std::collections::HashMap; use std::collections::hash_map::Entry; -use std::collections::{HashMap, VecDeque}; -use std::future; use std::future::Future; use std::ops::RangeInclusive; use std::pin::Pin; use std::task::{Context, Poll, ready}; -use std::time::{Duration, SystemTime}; use futures::future::OptionFuture; use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt, stream}; use metrics::counter; +use restate_types::invocation::PurgeInvocationRequest; use tokio_stream::wrappers::{ReceiverStream, WatchStream}; use tracing::{debug, trace}; @@ -32,11 +31,11 @@ use restate_partition_store::{PartitionDb, PartitionStore}; use restate_storage_api::vqueue_table::EntryCard; use restate_types::clock::UniqueTimestamp; use restate_types::identifiers::{ - InvocationId, LeaderEpoch, PartitionId, PartitionKey, PartitionProcessorRpcRequestId, - WithPartitionKey, + LeaderEpoch, PartitionId, PartitionKey, PartitionProcessorRpcRequestId, WithPartitionKey, }; use restate_types::invocation::client::{InvocationOutput, SubmittedInvocationNotification}; use restate_types::logs::Keys; +use restate_types::net::ingest::IngestRecord; use restate_types::net::partition_processor::{ PartitionProcessorRpcError, PartitionProcessorRpcResponse, }; @@ -46,10 +45,10 @@ use restate_vqueues::VQueueEvent; use restate_vqueues::{SchedulerService, VQueuesMeta, scheduler}; use restate_wal_protocol::Command; use restate_wal_protocol::control::UpsertSchema; -use restate_wal_protocol::timer::TimerKeyValue; use restate_wal_protocol::vqueues::Cards; use crate::metric_definitions::{PARTITION_HANDLE_LEADER_ACTIONS, USAGE_LEADER_ACTION_COUNT}; +use crate::partition::cleaner::{CleanerEffect, CleanerHandle}; use crate::partition::invoker_storage_reader::InvokerStorageReader; use crate::partition::leadership::self_proposer::SelfProposer; use crate::partition::leadership::{ActionEffect, Error, InvokerStream, TimerService}; @@ -85,8 +84,7 @@ pub struct LeaderState { invoker_stream: InvokerStream, shuffle_stream: ReceiverStream, schema_stream: WatchStream, - pub pending_cleanup_timers_to_schedule: VecDeque<(InvocationId, Duration)>, - cleaner_task_id: TaskId, + cleaner_handle: CleanerHandle, trimmer_task_id: TaskId, durability_tracker: DurabilityTracker, } @@ -98,7 +96,7 @@ impl LeaderState { leader_epoch: LeaderEpoch, partition_key_range: RangeInclusive, shuffle_task_handle: TaskHandle>, - cleaner_task_id: TaskId, + cleaner_handle: CleanerHandle, trimmer_task_id: TaskId, shuffle_hint_tx: HintSender, timer_service: TimerService, @@ -113,7 +111,7 @@ impl LeaderState { leader_epoch, partition_key_range, shuffle_task_handle: Some(shuffle_task_handle), - cleaner_task_id, + cleaner_handle, trimmer_task_id, shuffle_hint_tx, schema_stream: Metadata::with_current(|m| { @@ -126,7 +124,6 @@ impl LeaderState { awaiting_rpc_self_propose: Default::default(), invoker_stream: invoker_rx, shuffle_stream: ReceiverStream::new(shuffle_rx), - pending_cleanup_timers_to_schedule: Default::default(), durability_tracker, } } @@ -174,22 +171,11 @@ impl LeaderState { let invoker_stream = (&mut self.invoker_stream).map(ActionEffect::Invoker); let shuffle_stream = (&mut self.shuffle_stream).map(ActionEffect::Shuffle); + let cleaner_stream = self.cleaner_handle.effects().map(ActionEffect::Cleaner); + let dur_tracker_stream = (&mut self.durability_tracker).map(ActionEffect::PartitionMaintenance); - let action_effects_stream = stream::unfold( - &mut self.pending_cleanup_timers_to_schedule, - |pending_cleanup_timers_to_schedule| { - let result = 
pending_cleanup_timers_to_schedule.pop_front(); - future::ready(result.map(|(invocation_id, duration)| { - ( - ActionEffect::ScheduleCleanupTimer(invocation_id, duration), - pending_cleanup_timers_to_schedule, - ) - })) - }, - ) - .fuse(); let awaiting_rpc_self_propose_stream = (&mut self.awaiting_rpc_self_propose).map(|_| ActionEffect::AwaitingRpcSelfProposeDone); @@ -198,7 +184,7 @@ impl LeaderState { invoker_stream, shuffle_stream, timer_stream, - action_effects_stream, + cleaner_stream, awaiting_rpc_self_propose_stream, dur_tracker_stream, schema_stream @@ -247,7 +233,7 @@ impl LeaderState { // re-use of the self proposer self.self_proposer.mark_as_non_leader().await; - let cleaner_handle = OptionFuture::from(TaskCenter::cancel_task(self.cleaner_task_id)); + let cleaner_handle = OptionFuture::from(self.cleaner_handle.stop()); // We don't really care about waiting for the trimmer to finish cancelling TaskCenter::cancel_task(self.trimmer_task_id); @@ -353,15 +339,26 @@ impl LeaderState { .propose(timer.invocation_id().partition_key(), Command::Timer(timer)) .await?; } - ActionEffect::ScheduleCleanupTimer(invocation_id, duration) => { - self.self_proposer - .propose( - invocation_id.partition_key(), - Command::ScheduleTimer(TimerKeyValue::clean_invocation_status( - MillisSinceEpoch::from(SystemTime::now() + duration), + ActionEffect::Cleaner(effect) => { + let (invocation_id, cmd) = match effect { + CleanerEffect::PurgeJournal(invocation_id) => ( + invocation_id, + Command::PurgeJournal(PurgeInvocationRequest { invocation_id, - )), - ) + response_sink: None, + }), + ), + CleanerEffect::PurgeInvocation(invocation_id) => ( + invocation_id, + Command::PurgeInvocation(PurgeInvocationRequest { + invocation_id, + response_sink: None, + }), + ), + }; + + self.self_proposer + .propose(invocation_id.partition_key(), cmd) .await?; } ActionEffect::UpsertSchema(schema) => { @@ -436,14 +433,35 @@ impl LeaderState { Ok(commit_token) => { self.awaiting_rpc_self_propose.push(SelfAppendFuture::new( commit_token, - success_response, - reciprocal, + |result: Result<(), PartitionProcessorRpcError>| { + reciprocal.send(result.map(|_| success_response)); + }, )); } Err(e) => reciprocal.send(Err(PartitionProcessorRpcError::Internal(e.to_string()))), } } + pub async fn propose_many_with_callback( + &mut self, + records: impl ExactSizeIterator, + callback: F, + ) where + F: FnOnce(Result<(), PartitionProcessorRpcError>) + Send + Sync + 'static, + { + match self + .self_proposer + .propose_many_with_notification(records) + .await + { + Ok(commit_token) => { + self.awaiting_rpc_self_propose + .push(SelfAppendFuture::new(commit_token, callback)); + } + Err(e) => callback(Err(PartitionProcessorRpcError::Internal(e.to_string()))), + } + } + pub fn handle_actions( &mut self, invoker_tx: &mut impl restate_invoker_api::InvokerHandle>, @@ -568,13 +586,6 @@ impl LeaderState { ))); } } - Action::ScheduleInvocationStatusCleanup { - invocation_id, - retention, - } => { - self.pending_cleanup_timers_to_schedule - .push_back((invocation_id, retention)); - } Action::ForwardNotification { invocation_id, invocation_epoch, @@ -691,42 +702,72 @@ impl LeaderState { } } +trait CallbackInner: Send + Sync + 'static { + fn call(self: Box, result: Result<(), PartitionProcessorRpcError>); +} + +impl CallbackInner for F +where + F: FnOnce(Result<(), PartitionProcessorRpcError>) + Send + Sync + 'static, +{ + fn call(self: Box, result: Result<(), PartitionProcessorRpcError>) { + self(result) + } +} + +struct Callback { + inner: Box, +} + 
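// A sketch for illustration (not part of the patch itself): the boxed-callback plumbing above,
// written out with the angle-bracketed parameters that this rendering of the diff drops. An
// object-safe trait over FnOnce lets SelfAppendFuture store arbitrary completion callbacks
// without naming their concrete closure types; imports are assumed to match the surrounding
// module.
trait CallbackInner: Send + Sync + 'static {
    fn call(self: Box<Self>, result: Result<(), PartitionProcessorRpcError>);
}

impl<F> CallbackInner for F
where
    F: FnOnce(Result<(), PartitionProcessorRpcError>) + Send + Sync + 'static,
{
    fn call(self: Box<Self>, result: Result<(), PartitionProcessorRpcError>) {
        // Box<F> is itself FnOnce when F is, so the boxed closure can be invoked directly.
        self(result)
    }
}

struct Callback {
    inner: Box<dyn CallbackInner>,
}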
+impl Callback { + fn call(self, result: Result<(), PartitionProcessorRpcError>) { + self.inner.call(result); + } +} + +impl From for Callback +where + I: CallbackInner, +{ + fn from(value: I) -> Self { + Self { + inner: Box::new(value), + } + } +} + struct SelfAppendFuture { commit_token: CommitToken, - response: Option<(PartitionProcessorRpcResponse, RpcReciprocal)>, + callback: Option, } impl SelfAppendFuture { - fn new( - commit_token: CommitToken, - success_response: PartitionProcessorRpcResponse, - response_reciprocal: RpcReciprocal, - ) -> Self { + fn new(commit_token: CommitToken, callback: impl Into) -> Self { Self { commit_token, - response: Some((success_response, response_reciprocal)), + callback: Some(callback.into()), } } fn fail_with_internal(&mut self) { - if let Some((_, reciprocal)) = self.response.take() { - reciprocal.send(Err(PartitionProcessorRpcError::Internal( + if let Some(callback) = self.callback.take() { + callback.call(Err(PartitionProcessorRpcError::Internal( "error when proposing to bifrost".to_string(), ))); } } fn fail_with_lost_leadership(&mut self, this_partition_id: PartitionId) { - if let Some((_, reciprocal)) = self.response.take() { - reciprocal.send(Err(PartitionProcessorRpcError::LostLeadership( + if let Some(callback) = self.callback.take() { + callback.call(Err(PartitionProcessorRpcError::LostLeadership( this_partition_id, ))); } } fn succeed_with_appended(&mut self) { - if let Some((success_response, reciprocal)) = self.response.take() { - reciprocal.send(Ok(success_response)); + if let Some(callback) = self.callback.take() { + callback.call(Ok(())) } } } diff --git a/crates/worker/src/partition/leadership/mod.rs b/crates/worker/src/partition/leadership/mod.rs index 621117304b..91e079d48e 100644 --- a/crates/worker/src/partition/leadership/mod.rs +++ b/crates/worker/src/partition/leadership/mod.rs @@ -25,9 +25,10 @@ use tokio_stream::wrappers::ReceiverStream; use tracing::{debug, instrument, warn}; use restate_bifrost::Bifrost; -use restate_core::network::{Oneshot, Reciprocal}; +use restate_core::network::{Oneshot, Reciprocal, TransportConnect}; use restate_core::{ShutdownError, TaskCenter, TaskKind, my_node_id}; use restate_errors::NotRunningError; +use restate_ingestion_client::IngestionClient; use restate_invoker_api::InvokeInputJournal; use restate_invoker_api::capacity::InvokerCapacity; use restate_partition_store::PartitionStore; @@ -46,9 +47,10 @@ use restate_types::GenerationalNodeId; use restate_types::cluster::cluster_state::RunMode; use restate_types::config::Configuration; use restate_types::errors::GenericError; -use restate_types::identifiers::{InvocationId, PartitionKey, PartitionProcessorRpcRequestId}; use restate_types::identifiers::{LeaderEpoch, PartitionLeaderEpoch}; +use restate_types::identifiers::{PartitionKey, PartitionProcessorRpcRequestId}; use restate_types::message::MessageIndex; +use restate_types::net::ingest::IngestRecord; use restate_types::net::partition_processor::{ PartitionProcessorRpcError, PartitionProcessorRpcResponse, }; @@ -56,13 +58,13 @@ use restate_types::partitions::Partition; use restate_types::partitions::state::PartitionReplicaSetStates; use restate_types::retries::with_jitter; use restate_types::schema::Schema; -use restate_types::storage::StorageEncodeError; +use restate_types::storage::{StorageDecodeError, StorageEncodeError}; use restate_vqueues::{SchedulerService, VQueuesMeta, VQueuesMetaMut}; -use restate_wal_protocol::Command; use restate_wal_protocol::control::{AnnounceLeader, 
PartitionDurability}; use restate_wal_protocol::timer::TimerKeyValue; +use restate_wal_protocol::{Command, Envelope}; -use crate::partition::cleaner::Cleaner; +use crate::partition::cleaner::{self, Cleaner}; use crate::partition::invoker_storage_reader::InvokerStorageReader; use crate::partition::leadership::leader_state::LeaderState; use crate::partition::leadership::self_proposer::SelfProposer; @@ -86,7 +88,9 @@ pub(crate) enum Error { #[error("failed writing to bifrost: {0}")] Bifrost(#[from] restate_bifrost::Error), #[error("failed serializing payload: {0}")] - Codec(#[from] StorageEncodeError), + Encode(#[from] StorageEncodeError), + #[error("failed deserializing payload: {0}")] + Decode(#[from] StorageDecodeError), #[error(transparent)] Shutdown(#[from] ShutdownError), #[error("error when self proposing")] @@ -128,7 +132,7 @@ pub(crate) enum ActionEffect { Invoker(Box), Shuffle(shuffle::OutboxTruncation), Timer(TimerKeyValue), - ScheduleCleanupTimer(InvocationId, Duration), + Cleaner(cleaner::CleanerEffect), PartitionMaintenance(PartitionDurability), UpsertSchema(Schema), AwaitingRpcSelfProposeDone, @@ -153,26 +157,29 @@ impl State { } } -pub(crate) struct LeadershipState { +pub(crate) struct LeadershipState { state: State, last_seen_leader_epoch: Option, partition: Arc, invoker_tx: I, + ingestion_client: IngestionClient, invoker_capacity: InvokerCapacity, bifrost: Bifrost, trim_queue: TrimQueue, } -impl LeadershipState +impl LeadershipState where I: restate_invoker_api::InvokerHandle>, + T: TransportConnect, { #[allow(clippy::too_many_arguments)] pub(crate) fn new( partition: Arc, invoker_tx: I, invoker_capacity: InvokerCapacity, + ingestion_client: IngestionClient, bifrost: Bifrost, last_seen_leader_epoch: Option, trim_queue: TrimQueue, @@ -181,6 +188,7 @@ where state: State::Follower, partition, invoker_tx, + ingestion_client, invoker_capacity, bifrost, last_seen_leader_epoch, @@ -408,7 +416,7 @@ where OutboxReader::from(partition_store.clone()), shuffle_tx, config.worker.internal_queue_length(), - self.bifrost.clone(), + self.ingestion_client.clone(), ); let shuffle_hint_tx = shuffle.create_hint_sender(); @@ -417,15 +425,13 @@ where TaskCenter::spawn_unmanaged(TaskKind::Shuffle, "shuffle", shuffle.run())?; let cleaner = Cleaner::new( - *leader_epoch, partition_store.clone(), - self.bifrost.clone(), + self.partition.partition_id, self.partition.key_range.clone(), config.worker.cleanup_interval(), ); - let cleaner_task_id = - TaskCenter::spawn_child(TaskKind::Cleaner, "cleaner", cleaner.run())?; + let cleaner_handle = cleaner.start()?; let trimmer_task_id = LogTrimmer::spawn( self.bifrost.clone(), @@ -454,7 +460,7 @@ where *leader_epoch, self.partition.key_range.clone(), shuffle_task_handle, - cleaner_task_id, + cleaner_handle, trimmer_task_id, shuffle_hint_tx, timer_service, @@ -597,7 +603,7 @@ where } } -impl LeadershipState { +impl LeadershipState { pub async fn handle_rpc_proposal_command( &mut self, request_id: PartitionProcessorRpcRequestId, @@ -648,6 +654,26 @@ impl LeadershipState { } } } + + /// propose to this partition + pub async fn propose_many_with_callback( + &mut self, + records: impl ExactSizeIterator, + callback: F, + ) where + F: FnOnce(Result<(), PartitionProcessorRpcError>) + Send + Sync + 'static, + { + match &mut self.state { + State::Follower | State::Candidate { .. 
} => callback(Err( + PartitionProcessorRpcError::NotLeader(self.partition.partition_id), + )), + State::Leader(leader_state) => { + leader_state + .propose_many_with_callback(records, callback) + .await; + } + } + } } #[derive(Debug, derive_more::From)] struct TimerReader(PartitionStore); @@ -697,7 +723,9 @@ mod tests { use crate::partition::leadership::{LeadershipState, State}; use assert2::let_assert; use restate_bifrost::Bifrost; + use restate_core::partitions::PartitionRouting; use restate_core::{TaskCenter, TestCoreEnv}; + use restate_ingestion_client::IngestionClient; use restate_invoker_api::capacity::InvokerCapacity; use restate_invoker_api::test_util::MockInvokerHandle; use restate_partition_store::PartitionStoreManager; @@ -711,6 +739,7 @@ mod tests { use restate_vqueues::VQueuesMetaMut; use restate_wal_protocol::control::AnnounceLeader; use restate_wal_protocol::{Command, Envelope}; + use std::num::NonZeroUsize; use std::ops::RangeInclusive; use std::sync::Arc; use test_log::test; @@ -731,11 +760,20 @@ mod tests { let partition_store_manager = PartitionStoreManager::create().await?; + let ingress = IngestionClient::new( + env.networking.clone(), + env.metadata.updateable_partition_table(), + PartitionRouting::new(replica_set_states.clone(), TaskCenter::current()), + NonZeroUsize::new(10 * 1024 * 1024).unwrap(), + None, + ); + let invoker_tx = MockInvokerHandle::default(); let mut state = LeadershipState::new( Arc::new(PARTITION), invoker_tx, InvokerCapacity::new_unlimited(), + ingress, bifrost.clone(), None, TrimQueue::default(), diff --git a/crates/worker/src/partition/leadership/self_proposer.rs b/crates/worker/src/partition/leadership/self_proposer.rs index eea7bc2ba9..cb1e445e3a 100644 --- a/crates/worker/src/partition/leadership/self_proposer.rs +++ b/crates/worker/src/partition/leadership/self_proposer.rs @@ -12,9 +12,11 @@ use std::sync::Arc; use futures::never::Never; -use restate_bifrost::{Bifrost, CommitToken, ErrorRecoveryStrategy}; +use restate_bifrost::{Bifrost, CommitToken, ErrorRecoveryStrategy, InputRecord}; use restate_storage_api::deduplication_table::{DedupInformation, EpochSequenceNumber}; -use restate_types::{identifiers::PartitionKey, logs::LogId}; +use restate_types::{ + identifiers::PartitionKey, logs::LogId, net::ingest::IngestRecord, time::NanosSinceEpoch, +}; use restate_wal_protocol::{Command, Destination, Envelope, Header, Source}; use crate::partition::leadership::Error; @@ -149,6 +151,48 @@ impl SelfProposer { Ok(commit_token) } + pub async fn propose_many_with_notification( + &mut self, + records: impl ExactSizeIterator, + ) -> Result where { + let sender = self.bifrost_appender.sender(); + + // This should ideally be implemented + // by using `sender.enqueue_many` + // but since we have no guarantee over the + // underlying channel size a `reserve_many()` might + // return a misleading Closed error + // + // sender + // .enqueue_many(records) + // .await + // .map_err(|_| Error::SelfProposer)?; + // + // so instead we do this. + + for record in records { + // Skip decoding the envelope; build the InputRecord directly from the raw bytes. + // The ingestion client should only handle payloads of type Envelope. 
+ let input = unsafe { + InputRecord::from_bytes_unchecked( + NanosSinceEpoch::now(), + record.keys, + record.record, + ) + }; + + sender + .enqueue(input) + .await + .map_err(|_| Error::SelfProposer)?; + } + + sender + .notify_committed() + .await + .map_err(|_| Error::SelfProposer) + } + fn create_header(&mut self, partition_key: PartitionKey) -> Header { let esn = self.epoch_sequence_number; self.epoch_sequence_number = self.epoch_sequence_number.next(); diff --git a/crates/worker/src/partition/mod.rs b/crates/worker/src/partition/mod.rs index 1f229f5fca..42f959d1e5 100644 --- a/crates/worker/src/partition/mod.rs +++ b/crates/worker/src/partition/mod.rs @@ -31,8 +31,11 @@ use tracing::{Span, debug, error, info, instrument, trace, warn}; use restate_bifrost::loglet::FindTailOptions; use restate_bifrost::{Bifrost, LogEntry, MaybeRecord}; -use restate_core::network::{Oneshot, Reciprocal, ServiceMessage, Verdict}; +use restate_core::network::{ + Incoming, Oneshot, Reciprocal, Rpc, ServiceMessage, TransportConnect, Verdict, +}; use restate_core::{Metadata, ShutdownError, cancellation_watcher, my_node_id}; +use restate_ingestion_client::IngestionClient; use restate_invoker_api::capacity::InvokerCapacity; use restate_partition_store::{PartitionStore, PartitionStoreTransaction}; use restate_storage_api::deduplication_table::{ @@ -48,6 +51,7 @@ use restate_types::config::Configuration; use restate_types::identifiers::LeaderEpoch; use restate_types::logs::{KeyFilter, Lsn, Record, SequenceNumber}; use restate_types::net::RpcRequest; +use restate_types::net::ingest::{IngestResponse, ReceivedIngestRequest}; use restate_types::net::partition_processor::{ PartitionLeaderService, PartitionProcessorRpcError, PartitionProcessorRpcRequest, PartitionProcessorRpcResponse, @@ -64,7 +68,8 @@ use restate_wal_protocol::{Command, Destination, Envelope, Header}; use self::leadership::trim_queue::TrimQueue; use crate::metric_definitions::{ - PARTITION_BLOCKED_FLARE, PARTITION_LABEL, PARTITION_RECORD_COMMITTED_TO_READ_LATENCY_SECONDS, + PARTITION_BLOCKED_FLARE, PARTITION_INGESTION_REQUEST_LEN, PARTITION_INGESTION_REQUEST_SIZE, + PARTITION_LABEL, PARTITION_RECORD_COMMITTED_TO_READ_LATENCY_SECONDS, }; use crate::partition::invoker_storage_reader::InvokerStorageReader; use crate::partition::leadership::LeadershipState; @@ -110,12 +115,16 @@ where } } - pub async fn build( + pub async fn build( self, bifrost: Bifrost, + ingestion_client: IngestionClient, mut partition_store: PartitionStore, replica_set_states: PartitionReplicaSetStates, - ) -> Result, state_machine::Error> { + ) -> Result, state_machine::Error> + where + T: TransportConnect, + { let PartitionProcessorBuilder { invoker_tx, target_leader_state_rx, @@ -160,6 +169,7 @@ where Arc::clone(partition_store.partition()), invoker_tx, invoker_capacity, + ingestion_client, bifrost.clone(), last_seen_leader_epoch, trim_queue.clone(), @@ -213,9 +223,9 @@ where } } -pub struct PartitionProcessor { +pub struct PartitionProcessor { partition_id_str: SharedString, - leadership_state: LeadershipState, + leadership_state: LeadershipState, state_machine: StateMachine, bifrost: Bifrost, target_leader_state_rx: watch::Receiver, @@ -279,8 +289,9 @@ struct LsnEnvelope { pub envelope: Arc, } -impl PartitionProcessor +impl PartitionProcessor where + T: TransportConnect, InvokerSender: restate_invoker_api::InvokerHandle> + Clone, { #[instrument( @@ -474,15 +485,7 @@ where self.status.effective_mode = self.leadership_state.effective_mode(); } Some(msg) = 
self.network_leader_svc_rx.recv() => { - match msg { - ServiceMessage::Rpc(msg) if msg.msg_type() == PartitionProcessorRpcRequest::TYPE => { - let msg = msg.into_typed::(); - // note: split() decodes the payload - let (response_tx, body) = msg.split(); - self.on_rpc(response_tx, body, &mut partition_store, live_schemas.live_load()).await; - } - msg => { msg.fail(Verdict::MessageUnrecognized); } - } + self.on_rpc(msg, &mut partition_store, live_schemas.live_load()).await; } _ = status_update_timer.tick() => { if durable_lsn_watch.has_changed().map_err(|e| ProcessorError::Other(e.into()))? { @@ -611,7 +614,7 @@ where Ok(()) } - async fn on_rpc( + async fn on_pp_rpc_request( &mut self, response_tx: Reciprocal< Oneshot>, @@ -627,6 +630,61 @@ where ) .await; } + + async fn on_rpc( + &mut self, + msg: ServiceMessage, + partition_store: &mut PartitionStore, + schemas: &Schema, + ) { + match msg { + ServiceMessage::Rpc(msg) if msg.msg_type() == PartitionProcessorRpcRequest::TYPE => { + let msg = msg.into_typed::(); + // note: split() decodes the payload + let (response_tx, body) = msg.split(); + self.on_pp_rpc_request(response_tx, body, partition_store, schemas) + .await; + } + ServiceMessage::Rpc(msg) if msg.msg_type() == ReceivedIngestRequest::TYPE => { + self.on_pp_ingest_request(msg.into_typed()).await; + } + msg => { + msg.fail(Verdict::MessageUnrecognized); + } + } + } + + async fn on_pp_ingest_request(&mut self, msg: Incoming>) { + let (reciprocal, request) = msg.split(); + histogram!( + PARTITION_INGESTION_REQUEST_LEN, PARTITION_LABEL => self.partition_id_str.clone() + ) + .record(request.records.len() as f64); + + histogram!( + PARTITION_INGESTION_REQUEST_SIZE, PARTITION_LABEL => self.partition_id_str.clone() + ) + .record(request.records.iter().fold(0, |s, r| s + r.estimate_size()) as f64); + + self.leadership_state + .propose_many_with_callback( + request.records.into_iter(), + |result: Result<(), PartitionProcessorRpcError>| match result { + Ok(_) => reciprocal.send(IngestResponse::Ack), + Err(err) => match err { + PartitionProcessorRpcError::NotLeader(id) + | PartitionProcessorRpcError::LostLeadership(id) => { + reciprocal.send(IngestResponse::NotLeader { of: id }) + } + PartitionProcessorRpcError::Internal(msg) => { + reciprocal.send(IngestResponse::Internal { msg }) + } + }, + }, + ) + .await; + } + async fn maybe_advance<'a>( &mut self, maybe_record: LogEntry, diff --git a/crates/worker/src/partition/rpc/mod.rs b/crates/worker/src/partition/rpc/mod.rs index 878507727d..f15c56cedb 100644 --- a/crates/worker/src/partition/rpc/mod.rs +++ b/crates/worker/src/partition/rpc/mod.rs @@ -22,7 +22,7 @@ mod resume_invocation; use crate::partition; use crate::partition::leadership::LeadershipState; -use restate_core::network::{Oneshot, Reciprocal}; +use restate_core::network::{Oneshot, Reciprocal, TransportConnect}; use restate_invoker_api::InvokerHandle; use restate_storage_api::idempotency_table::ReadOnlyIdempotencyTable; use restate_storage_api::invocation_status_table::ReadInvocationStatusTable; @@ -71,13 +71,14 @@ pub(super) trait Actuator { ); } -impl< +impl Actuator for LeadershipState +where + T: TransportConnect, I: InvokerHandle< partition::invoker_storage_reader::InvokerStorageReader< restate_partition_store::PartitionStore, >, >, -> Actuator for LeadershipState { async fn self_propose_and_respond_asynchronously>( &mut self, diff --git a/crates/worker/src/partition/shuffle.rs b/crates/worker/src/partition/shuffle.rs index 7e384b386d..7387771f06 100644 --- 
a/crates/worker/src/partition/shuffle.rs +++ b/crates/worker/src/partition/shuffle.rs @@ -9,14 +9,14 @@ // by the Apache License, Version 2.0. use std::future::Future; -use std::sync::Arc; use async_channel::{TryRecvError, TrySendError}; use tokio::sync::mpsc; use tracing::debug; -use restate_bifrost::Bifrost; use restate_core::cancellation_watcher; +use restate_core::network::TransportConnect; +use restate_ingestion_client::IngestionClient; use restate_storage_api::deduplication_table::DedupInformation; use restate_storage_api::outbox_table::OutboxMessage; use restate_types::identifiers::{LeaderEpoch, PartitionId, PartitionKey, WithPartitionKey}; @@ -161,12 +161,12 @@ impl ShuffleMetadata { } } -pub(super) struct Shuffle { +pub(super) struct Shuffle { metadata: ShuffleMetadata, outbox_reader: OR, - bifrost: Bifrost, + ingestion_client: IngestionClient, // used to tell partition processor about outbox truncations truncation_tx: mpsc::Sender, @@ -177,8 +177,9 @@ pub(super) struct Shuffle { hint_tx: async_channel::Sender, } -impl Shuffle +impl Shuffle where + T: TransportConnect, OR: OutboxReader + Send + Sync + 'static, { pub(super) fn new( @@ -186,7 +187,7 @@ where outbox_reader: OR, truncation_tx: mpsc::Sender, channel_size: usize, - bifrost: Bifrost, + ingestion_client: IngestionClient, ) -> Self { let (hint_tx, hint_rx) = async_channel::bounded(channel_size); @@ -196,7 +197,7 @@ where truncation_tx, hint_rx, hint_tx, - bifrost, + ingestion_client, } } @@ -210,7 +211,7 @@ where mut hint_rx, outbox_reader, truncation_tx, - bifrost, + ingestion_client, .. } = self; @@ -220,9 +221,9 @@ where metadata, outbox_reader, move |msg| { - let bifrost = bifrost.clone(); + let client = ingestion_client.clone(); async move { - restate_bifrost::append_to_bifrost(&bifrost, Arc::new(msg)).await?; + client.ingest(msg.partition_key(), msg).await?; Ok(()) } }, @@ -252,10 +253,11 @@ where } mod state_machine { - use pin_project::pin_project; use std::cmp::Ordering; use std::future::Future; use std::pin::Pin; + + use pin_project::pin_project; use tokio_util::sync::ReusableBoxFuture; use tracing::trace; @@ -426,24 +428,32 @@ mod state_machine { #[cfg(test)] mod tests { use std::iter; + use std::num::NonZeroUsize; use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; - use anyhow::anyhow; + use anyhow::{Context, anyhow}; use assert2::let_assert; - use futures::{Stream, StreamExt}; + use futures::StreamExt; + use restate_core::partitions::PartitionRouting; + use restate_ingestion_client::IngestionClient; + use restate_types::net::RpcRequest; + use restate_types::net::ingest::{IngestResponse, ReceivedIngestRequest}; + use restate_types::net::partition_processor::PartitionLeaderService; + use restate_types::partitions::state::{LeadershipState, PartitionReplicaSetStates}; + use restate_types::storage::StorageCodec; use test_log::test; use tokio::sync::mpsc; - use restate_bifrost::{Bifrost, LogEntry}; - use restate_core::network::FailingConnector; + use restate_core::network::{ + BackPressureMode, FailingConnector, ServiceMessage, ServiceStream, + }; use restate_core::{TaskCenter, TaskKind, TestCoreEnv, TestCoreEnvBuilder}; use restate_storage_api::StorageError; use restate_storage_api::outbox_table::OutboxMessage; use restate_types::Version; use restate_types::identifiers::{InvocationId, LeaderEpoch, PartitionId}; use restate_types::invocation::ServiceInvocation; - use restate_types::logs::{KeyFilter, LogId, Lsn, SequenceNumber}; use restate_types::message::MessageIndex; use 
restate_types::partition_table::PartitionTable; use restate_wal_protocol::{Command, Envelope}; @@ -556,22 +566,27 @@ mod tests { } async fn collect_invoke_commands_until( - stream: impl Stream>, + stream: &mut ServiceStream, last_invocation_id: InvocationId, ) -> anyhow::Result> { let mut messages = Vec::new(); let mut stream = std::pin::pin!(stream); - while let Some(record) = stream.next().await { - let record = record?; + 'out: while let Some(ServiceMessage::Rpc(incoming)) = stream.next().await { + assert_eq!(incoming.msg_type(), ReceivedIngestRequest::TYPE); + let incoming = incoming.into_typed::(); + let (r, body) = incoming.split(); + r.send(IngestResponse::Ack); + for mut record in body.records { + let envelope = StorageCodec::decode::(&mut record.record) + .context("Failed to decode envelope")?; - if let Some(envelope) = record.try_decode::().transpose()? { let_assert!(Command::Invoke(service_invocation) = envelope.command); let invocation_id = service_invocation.invocation_id; messages.push(*service_invocation); if last_invocation_id == invocation_id { - break; + break 'out; } } } @@ -608,31 +623,55 @@ mod tests { struct ShuffleEnv { #[allow(dead_code)] env: TestCoreEnv, - bifrost: Bifrost, - shuffle: Shuffle, + stream: ServiceStream, + ingress: IngestionClient, + shuffle: Shuffle, } async fn create_shuffle_env( outbox_reader: OR, ) -> ShuffleEnv { // set numbers of partitions to 1 to easily find all sent messages by the shuffle - let env = TestCoreEnvBuilder::with_incoming_only_connector() - .set_partition_table(PartitionTable::with_equally_sized_partitions( - Version::MIN, - 1, - )) - .build() - .await; + let mut builder = TestCoreEnvBuilder::with_incoming_only_connector().set_partition_table( + PartitionTable::with_equally_sized_partitions(Version::MIN, 1), + ); + let metadata = ShuffleMetadata::new(PartitionId::from(0), LeaderEpoch::from(0)); + let partition_replica_set_states = PartitionReplicaSetStates::default(); + + partition_replica_set_states.note_observed_leader( + 0.into(), + LeadershipState { + current_leader: builder.my_node_id, + current_leader_epoch: LeaderEpoch::INITIAL, + }, + ); + + let svc = builder + .router_builder + .register_service::(10, BackPressureMode::PushBack); + + let env = builder.build().await; + + let stream = svc.start(); + + let ingress = IngestionClient::new( + env.networking.clone(), + env.metadata.updateable_partition_table(), + PartitionRouting::new(partition_replica_set_states, TaskCenter::current()), + NonZeroUsize::new(10 * 1024 * 1024).unwrap(), + None, + ); + let (truncation_tx, _truncation_rx) = mpsc::channel(1); - let bifrost = Bifrost::init_in_memory(env.metadata_writer.clone()).await; - let shuffle = Shuffle::new(metadata, outbox_reader, truncation_tx, 1, bifrost.clone()); + let shuffle = Shuffle::new(metadata, outbox_reader, truncation_tx, 1, ingress.clone()); ShuffleEnv { env, - bifrost, + stream, + ingress, shuffle, } } @@ -652,18 +691,12 @@ mod tests { .expect("service invocation should be present"); let outbox_reader = MockOutboxReader::new(42, expected_messages.clone()); - let shuffle_env = create_shuffle_env(outbox_reader).await; + let mut shuffle_env = create_shuffle_env(outbox_reader).await; - let partition_id = shuffle_env.shuffle.metadata.partition_id; TaskCenter::spawn_child(TaskKind::Shuffle, "shuffle", shuffle_env.shuffle.run())?; - let reader = shuffle_env.bifrost.create_reader( - LogId::from(partition_id), - KeyFilter::Any, - Lsn::OLDEST, - Lsn::MAX, - )?; - let messages = collect_invoke_commands_until(reader, 
last_invocation_id).await?; + let messages = + collect_invoke_commands_until(&mut shuffle_env.stream, last_invocation_id).await?; assert_received_invoke_commands(messages, expected_messages); @@ -689,18 +722,12 @@ mod tests { .expect("service invocation should be present"); let outbox_reader = MockOutboxReader::new(42, expected_messages.clone()); - let shuffle_env = create_shuffle_env(outbox_reader).await; + let mut shuffle_env = create_shuffle_env(outbox_reader).await; - let partition_id = shuffle_env.shuffle.metadata.partition_id; TaskCenter::spawn_child(TaskKind::Shuffle, "shuffle", shuffle_env.shuffle.run())?; - let reader = shuffle_env.bifrost.create_reader( - LogId::from(partition_id), - KeyFilter::Any, - Lsn::OLDEST, - Lsn::MAX, - )?; - let messages = collect_invoke_commands_until(reader, last_invocation_id).await?; + let messages = + collect_invoke_commands_until(&mut shuffle_env.stream, last_invocation_id).await?; assert_received_invoke_commands(messages, expected_messages); @@ -722,17 +749,9 @@ mod tests { .expect("service invocation should be present"); let mut outbox_reader = Arc::new(FailingOutboxReader::new(expected_messages.clone(), 10)); - let shuffle_env = create_shuffle_env(Arc::clone(&outbox_reader)).await; + let mut shuffle_env = create_shuffle_env(Arc::clone(&outbox_reader)).await; let total_restarts = Arc::new(AtomicUsize::new(0)); - let partition_id = shuffle_env.shuffle.metadata.partition_id; - let reader = shuffle_env.bifrost.create_reader( - LogId::from(partition_id), - KeyFilter::Any, - Lsn::INVALID, - Lsn::MAX, - )?; - let shuffle_task = TaskCenter::spawn_child(TaskKind::Shuffle, "shuffle", { let total_restarts = Arc::clone(&total_restarts); async move { @@ -765,7 +784,7 @@ mod tests { Arc::clone(&outbox_reader), truncation_tx.clone(), 1, - shuffle_env.bifrost.clone(), + shuffle_env.ingress.clone(), ); } @@ -775,7 +794,8 @@ mod tests { } })?; - let messages = collect_invoke_commands_until(reader, last_invocation_id).await?; + let messages = + collect_invoke_commands_until(&mut shuffle_env.stream, last_invocation_id).await?; assert_received_invoke_commands(messages, expected_messages); diff --git a/crates/worker/src/partition/state_machine/actions.rs b/crates/worker/src/partition/state_machine/actions.rs index 4d27496b79..60bab0187b 100644 --- a/crates/worker/src/partition/state_machine/actions.rs +++ b/crates/worker/src/partition/state_machine/actions.rs @@ -8,8 +8,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use std::time::Duration; - use restate_invoker_api::InvokeInputJournal; use restate_storage_api::outbox_table::OutboxMessage; use restate_storage_api::timer_table::TimerKey; @@ -90,10 +88,6 @@ pub enum Action { /// otherwise the invocation was previously submitted. 
is_new_invocation: bool, }, - ScheduleInvocationStatusCleanup { - invocation_id: InvocationId, - retention: Duration, - }, ForwardKillResponse { request_id: PartitionProcessorRpcRequestId, response: KillInvocationResponse, diff --git a/crates/worker/src/partition_processor_manager.rs b/crates/worker/src/partition_processor_manager.rs index ed7006e03e..9b2c849d6c 100644 --- a/crates/worker/src/partition_processor_manager.rs +++ b/crates/worker/src/partition_processor_manager.rs @@ -34,7 +34,8 @@ use tracing::{debug, error, info, info_span, instrument, trace, warn}; use restate_bifrost::Bifrost; use restate_bifrost::loglet::FindTailOptions; use restate_core::network::{ - BackPressureMode, Incoming, MessageRouterBuilder, Rpc, ServiceMessage, ServiceReceiver, Verdict, + BackPressureMode, Incoming, MessageRouterBuilder, Rpc, ServiceMessage, ServiceReceiver, + TransportConnect, Verdict, }; use restate_core::worker_api::{ProcessorsManagerCommand, ProcessorsManagerHandle}; use restate_core::{ @@ -42,6 +43,7 @@ use restate_core::{ my_node_id, }; use restate_core::{RuntimeTaskHandle, TaskCenter}; +use restate_ingestion_client::IngestionClient; use restate_invoker_api::StatusHandle; use restate_invoker_api::capacity::InvokerCapacity; use restate_invoker_impl::ChannelStatusReader; @@ -77,6 +79,7 @@ use restate_types::partitions::state::PartitionReplicaSetStates; use restate_types::protobuf::common::WorkerStatus; use restate_types::retries::with_jitter; use restate_types::{GenerationalNodeId, SharedString}; +use restate_wal_protocol::Envelope; use crate::metric_definitions::NUM_PARTITIONS; use crate::metric_definitions::PARTITION_IS_EFFECTIVE_LEADER; @@ -107,7 +110,7 @@ impl From<&PartitionSnapshotMetadata> for SnapshotCreated { } } -pub struct PartitionProcessorManager { +pub struct PartitionProcessorManager { health_status: HealthStatus, updateable_config: Live, processor_states: BTreeMap, @@ -138,6 +141,8 @@ pub struct PartitionProcessorManager { wait_for_partition_table_update: bool, invoker_capacity: InvokerCapacity, + + ingestion_client: IngestionClient, } type SnapshotResult = Result; @@ -243,7 +248,10 @@ impl StatusHandle for MultiplexedInvokerStatusReader { } } -impl PartitionProcessorManager { +impl PartitionProcessorManager +where + T: TransportConnect, +{ #[allow(clippy::too_many_arguments)] pub fn new( health_status: HealthStatus, @@ -254,6 +262,7 @@ impl PartitionProcessorManager { router_builder: &mut MessageRouterBuilder, bifrost: Bifrost, snapshot_repository: Option, + ingestion_client: IngestionClient, ) -> Self { let ppm_svc_rx = router_builder.register_service(24, BackPressureMode::PushBack); let pp_rpc_rx = router_builder.register_service(24, BackPressureMode::PushBack); @@ -292,6 +301,7 @@ impl PartitionProcessorManager { partition_table: Metadata::with_current(|m| m.updateable_partition_table()), wait_for_partition_table_update: false, invoker_capacity, + ingestion_client, } } @@ -1300,6 +1310,7 @@ impl PartitionProcessorManager { self.partition_store_manager.clone(), self.fast_forward_on_startup.remove(&partition_id), self.invoker_capacity.clone(), + self.ingestion_client.clone(), ); self.asynchronous_operations @@ -1457,7 +1468,9 @@ mod tests { use googletest::IntoTestResult; use restate_bifrost::BifrostService; use restate_bifrost::providers::memory_loglet; + use restate_core::partitions::PartitionRouting; use restate_core::{TaskCenter, TaskKind, TestCoreEnvBuilder}; + use restate_ingestion_client::IngestionClient; use restate_partition_store::PartitionStoreManager; use 
restate_rocksdb::RocksDbManager; use restate_types::config::Configuration; @@ -1471,6 +1484,7 @@ mod tests { MemberState, PartitionReplicaSetStates, ReplicaSetState, }; use restate_types::{GenerationalNodeId, Version}; + use std::num::NonZeroUsize; use std::time::Duration; use test_log::test; use tracing::info; @@ -1505,6 +1519,14 @@ mod tests { let partition_store_manager = PartitionStoreManager::create().await?; + let ingress = IngestionClient::new( + env_builder.networking.clone(), + env_builder.metadata.updateable_partition_table(), + PartitionRouting::new(replica_set_states.clone(), TaskCenter::current()), + NonZeroUsize::new(10 * 1024 * 1024).unwrap(), + None, + ); + let partition_processor_manager = PartitionProcessorManager::new( health_status, Live::from_value(Configuration::default()), @@ -1514,6 +1536,7 @@ mod tests { &mut env_builder.router_builder, bifrost, None, + ingress, ); // only needed for setting up the metadata diff --git a/crates/worker/src/partition_processor_manager/spawn_processor_task.rs b/crates/worker/src/partition_processor_manager/spawn_processor_task.rs index 5be8c278bb..9492b106f3 100644 --- a/crates/worker/src/partition_processor_manager/spawn_processor_task.rs +++ b/crates/worker/src/partition_processor_manager/spawn_processor_task.rs @@ -16,7 +16,9 @@ use tracing::info; use tracing::{instrument, warn}; use restate_bifrost::Bifrost; +use restate_core::network::TransportConnect; use restate_core::{Metadata, RuntimeTaskHandle, TaskCenter, TaskKind, cancellation_token}; +use restate_ingestion_client::IngestionClient; use restate_invoker_api::capacity::InvokerCapacity; use restate_invoker_impl::Service as InvokerService; use restate_partition_store::{PartitionStore, PartitionStoreManager}; @@ -30,6 +32,7 @@ use restate_types::logs::Lsn; use restate_types::partitions::Partition; use restate_types::partitions::state::PartitionReplicaSetStates; use restate_types::schema::Schema; +use restate_wal_protocol::Envelope; use crate::PartitionProcessorBuilder; use crate::invoker_integration::EntryEnricher; @@ -37,7 +40,7 @@ use crate::partition::invoker_storage_reader::InvokerStorageReader; use crate::partition::{ProcessorError, TargetLeaderState}; use crate::partition_processor_manager::processor_state::StartedProcessor; -pub struct SpawnPartitionProcessorTask { +pub struct SpawnPartitionProcessorTask { task_name: SharedString, partition: Partition, configuration: Live, @@ -46,9 +49,13 @@ pub struct SpawnPartitionProcessorTask { partition_store_manager: Arc, fast_forward_lsn: Option, invoker_capacity: InvokerCapacity, + ingestion_client: IngestionClient, } -impl SpawnPartitionProcessorTask { +impl SpawnPartitionProcessorTask +where + T: TransportConnect, +{ #[allow(clippy::too_many_arguments)] pub fn new( task_name: SharedString, @@ -59,6 +66,7 @@ impl SpawnPartitionProcessorTask { partition_store_manager: Arc, fast_forward_lsn: Option, invoker_capacity: InvokerCapacity, + ingestion_client: IngestionClient, ) -> Self { Self { task_name, @@ -69,6 +77,7 @@ impl SpawnPartitionProcessorTask { partition_store_manager, fast_forward_lsn, invoker_capacity, + ingestion_client, } } @@ -96,6 +105,7 @@ impl SpawnPartitionProcessorTask { partition_store_manager, fast_forward_lsn, invoker_capacity, + ingestion_client, } = self; let config = configuration.pinned(); @@ -166,7 +176,7 @@ impl SpawnPartitionProcessorTask { let partition_store = partition_store?; let pp = pp_builder - .build(bifrost, partition_store, replica_set_states) + .build(bifrost, ingestion_client, partition_store, 
replica_set_states) .await .map_err(ProcessorError::from)?; diff --git a/tools/xtask/Cargo.toml b/tools/xtask/Cargo.toml index 6779167cea..9b68120987 100644 --- a/tools/xtask/Cargo.toml +++ b/tools/xtask/Cargo.toml @@ -9,8 +9,8 @@ publish = false [dependencies] restate-admin = { workspace = true, features = ["options_schema"] } -restate-bifrost = { workspace = true, features = ["test-util"] } restate-core = { workspace = true, features = ["test-util"] } +restate-ingestion-client = { workspace = true } restate-metadata-server = { workspace = true } restate-service-client = { workspace = true } restate-service-protocol = { workspace = true, features = ["discovery"]} diff --git a/tools/xtask/src/main.rs b/tools/xtask/src/main.rs index e0439c2144..5565ff7e39 100644 --- a/tools/xtask/src/main.rs +++ b/tools/xtask/src/main.rs @@ -10,15 +10,18 @@ use std::future::pending; use std::io::Write; +use std::num::NonZeroUsize; use std::sync::Arc; use std::{env, io}; use anyhow::bail; use reqwest::header::ACCEPT; +use restate_core::partitions::PartitionRouting; +use restate_ingestion_client::IngestionClient; +use restate_types::partitions::state::PartitionReplicaSetStates; use schemars::r#gen::SchemaSettings; use restate_admin::service::AdminService; -use restate_bifrost::Bifrost; use restate_core::{TaskCenter, TaskCenterBuilder, TestCoreEnv}; use restate_core::{TaskCenterFutureExt, TaskKind}; use restate_service_client::{AssumeRoleCacheMode, ServiceClient}; @@ -214,14 +217,21 @@ async fn generate_rest_api_doc() -> anyhow::Result<()> { // We start the Meta service, then download the openapi schema generated let node_env = TestCoreEnv::create_with_single_node(1, 1).await; - let bifrost = Bifrost::init_in_memory(node_env.metadata_writer.clone()).await; + + let ingress_client = IngestionClient::new( + node_env.networking, + node_env.metadata.updateable_partition_table(), + PartitionRouting::new(PartitionReplicaSetStates::default(), TaskCenter::current()), + NonZeroUsize::new(1000).unwrap(), + None, + ); let socket_dir = tempfile::tempdir()?; let socket_path = socket_dir.path().join("admin.sock"); let admin_service = AdminService::new( Listeners::new_unix_listener(socket_path.clone())?, node_env.metadata_writer.clone(), - bifrost, + ingress_client, Mock, ServiceDiscovery::new( RetryPolicy::default(), diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index c7cc031421..d636e9274f 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -142,6 +142,7 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "parki typenum = { version = "1", default-features = false, features = ["const-generics"] } ulid = { version = "1", features = ["serde"] } uuid = { version = "1", features = ["js", "serde", "v4", "v7"] } +xxhash-rust = { version = "0.8", default-features = false, features = ["std", "xxh3"] } zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["zeroize_derive"] } zstd = { version = "0.13" } @@ -279,6 +280,7 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "parki typenum = { version = "1", default-features = false, features = ["const-generics"] } ulid = { version = "1", features = ["serde"] } uuid = { version = "1", features = ["js", "serde", "v4", "v7"] } +xxhash-rust = { version = "0.8", default-features = false, features = ["std", "xxh3"] } zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["zeroize_derive"] } zstd = { version = 
"0.13" }