diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs index 64074b5f9..0b001f453 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs @@ -805,21 +805,31 @@ impl Hypervisor for HypervLinuxDriver { HyperlightExit::Cancelled() } else { #[cfg(gdb)] - if debug_interrupt { - self.interrupt_handle - .debug_interrupt - .store(false, Ordering::Relaxed); - - // If the vCPU was stopped because of an interrupt, we need to - // return a special exit reason so that the gdb thread can handle it - // and resume execution - HyperlightExit::Debug(VcpuStopReason::Interrupt) - } else { - HyperlightExit::Retry() + { + if debug_interrupt { + self.interrupt_handle + .debug_interrupt + .store(false, Ordering::Relaxed); + + // If the vCPU was stopped because of an interrupt, we need to + // return a special exit reason so that the gdb thread can handle it + // and resume execution + HyperlightExit::Debug(VcpuStopReason::Interrupt) + } else { + // Track erroneous vCPU kick - stale signal from previous call + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) + .increment(1); + HyperlightExit::Retry() + } } #[cfg(not(gdb))] - HyperlightExit::Retry() + { + // Track erroneous vCPU kick - stale signal from previous call + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) + .increment(1); + HyperlightExit::Retry() + } } } libc::EAGAIN => HyperlightExit::Retry(), diff --git a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs index ca1f75997..1baa1e98a 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs @@ -662,25 +662,29 @@ impl Hypervisor for HypervWindowsDriver { WHV_RUN_VP_EXIT_REASON(8193i32) => { debug!("HyperV Cancelled Details :\n {:#?}", &self); #[cfg(gdb)] - if debug_interrupt { - self.interrupt_handle - 
.debug_interrupt - .store(false, Ordering::Relaxed); - - // If the vCPU was stopped because of an interrupt, we need to - // return a special exit reason so that the gdb thread can handle it - // and resume execution - HyperlightExit::Debug(VcpuStopReason::Interrupt) - } else if !cancel_was_requested_manually { - // This was an internal cancellation - // The virtualization stack can use this function to return the control - // of a virtual processor back to the virtualization stack in case it - // needs to change the state of a VM or to inject an event into the processor - // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks - debug!("Internal cancellation detected, returning Retry error"); - HyperlightExit::Retry() - } else { - HyperlightExit::Cancelled() + { + if debug_interrupt { + self.interrupt_handle + .debug_interrupt + .store(false, Ordering::Relaxed); + + // If the vCPU was stopped because of an interrupt, we need to + // return a special exit reason so that the gdb thread can handle it + // and resume execution + HyperlightExit::Debug(VcpuStopReason::Interrupt) + } else if !cancel_was_requested_manually { + // This was an internal cancellation + // The virtualization stack can use this function to return the control + // of a virtual processor back to the virtualization stack in case it + // needs to change the state of a VM or to inject an event into the processor + // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks + debug!("Internal cancellation detected, returning Retry error"); + // Track erroneous vCPU kick - internal cancellation not requested by user + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); + HyperlightExit::Retry() + } else { + HyperlightExit::Cancelled() + } } #[cfg(not(gdb))] @@ -692,6 +696,8 @@ impl Hypervisor for HypervWindowsDriver { // needs to change the state of 
a VM or to inject an event into the processor // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks debug!("Internal cancellation detected, returning Retry error"); + // Track erroneous vCPU kick - internal cancellation not requested by user + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); HyperlightExit::Retry() } else { HyperlightExit::Cancelled() diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs index 04b8ed60f..e62e624b5 100644 --- a/src/hyperlight_host/src/hypervisor/kvm.rs +++ b/src/hyperlight_host/src/hypervisor/kvm.rs @@ -750,21 +750,31 @@ impl Hypervisor for KVMDriver { HyperlightExit::Cancelled() } else { #[cfg(gdb)] - if debug_interrupt { - self.interrupt_handle - .debug_interrupt - .store(false, Ordering::Relaxed); - - // If the vCPU was stopped because of an interrupt, we need to - // return a special exit reason so that the gdb thread can handle it - // and resume execution - HyperlightExit::Debug(VcpuStopReason::Interrupt) - } else { - HyperlightExit::Retry() + { + if debug_interrupt { + self.interrupt_handle + .debug_interrupt + .store(false, Ordering::Relaxed); + + // If the vCPU was stopped because of an interrupt, we need to + // return a special exit reason so that the gdb thread can handle it + // and resume execution + HyperlightExit::Debug(VcpuStopReason::Interrupt) + } else { + // Track erroneous vCPU kick - stale signal from previous call + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) + .increment(1); + HyperlightExit::Retry() + } } #[cfg(not(gdb))] - HyperlightExit::Retry() + { + // Track erroneous vCPU kick - stale signal from previous call + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) + .increment(1); + HyperlightExit::Retry() + } } } libc::EAGAIN => HyperlightExit::Retry(), diff --git a/src/hyperlight_host/src/metrics/mod.rs 
b/src/hyperlight_host/src/metrics/mod.rs index 3a630fa44..033936e8f 100644 --- a/src/hyperlight_host/src/metrics/mod.rs +++ b/src/hyperlight_host/src/metrics/mod.rs @@ -21,6 +21,9 @@ pub(crate) static METRIC_GUEST_ERROR_LABEL_CODE: &str = "code"; // Counter metric that counts the number of times a guest function was called due to timing out pub(crate) static METRIC_GUEST_CANCELLATION: &str = "guest_cancellations_total"; +// Counter metric that counts the number of times a vCPU was kicked erroneously (a stale signal on Linux, or an internal cancellation not requested by the user on Windows) +pub(crate) static METRIC_ERRONEOUS_VCPU_KICK: &str = "erroneous_vcpu_kicks_total"; + // Histogram metric that measures the duration of guest function calls #[cfg(feature = "function_call_metrics")] pub(crate) static METRIC_GUEST_FUNC_DURATION: &str = "guest_call_duration_seconds"; @@ -196,4 +199,33 @@ mod tests { } } } + + #[test] + fn test_erroneous_vcpu_kick_metric_exists() { + // This test verifies that the metric can be incremented without errors. + // The actual scenario where this metric is incremented (stale signal delivery) + // is a race condition that's difficult to reliably reproduce in a test. + let recorder = metrics_util::debugging::DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + let snapshot = with_local_recorder(&recorder, || { + // Manually increment the metric to verify it works + metrics::counter!(METRIC_ERRONEOUS_VCPU_KICK).increment(1); + snapshotter.snapshot() + }); + + // Verify the metric was recorded + #[expect(clippy::mutable_key_type)] + let snapshot = snapshot.into_hashmap(); + + let counter_key = CompositeKey::new( + metrics_util::MetricKind::Counter, + Key::from_name(METRIC_ERRONEOUS_VCPU_KICK), + ); + assert_eq!( + snapshot.get(&counter_key).unwrap().2, + metrics_util::debugging::DebugValue::Counter(1), + "Erroneous vCPU kick metric should be recorded" + ); + } }