-
Notifications
You must be signed in to change notification settings - Fork 126
feat[array]: pushdown struct validity on write #5923
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
1d028f8
003256a
12b6c68
c558293
648ab8b
390f592
a43054c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
|
|
||
| use std::fmt::Debug; | ||
| use std::iter::once; | ||
| use std::ops::Not; | ||
| use std::sync::Arc; | ||
|
|
||
| use vortex_dtype::DType; | ||
|
|
@@ -18,10 +19,23 @@ use vortex_error::vortex_err; | |
| use crate::Array; | ||
| use crate::ArrayRef; | ||
| use crate::IntoArray; | ||
| use crate::builtins::ArrayBuiltins; | ||
| use crate::compute::mask; | ||
| use crate::stats::ArrayStats; | ||
| use crate::validity::Validity; | ||
| use crate::vtable::ValidityHelper; | ||
|
|
||
| /// Metadata for StructArray serialization. | ||
| #[derive(Clone, prost::Message)] | ||
| pub struct StructMetadata { | ||
| /// If true, child validity is a superset of struct validity (validity was pushed down). | ||
| /// For nullable children, their validity already includes struct nulls. For non-nullable | ||
| /// children, we apply struct validity on field read. If false (default), no guarantee | ||
| /// about relationship - must intersect validities on read. | ||
| #[prost(bool, tag = "1", default = false)] | ||
| pub(super) validity_pushed_down: bool, | ||
| } | ||
|
|
||
| /// A struct array that stores multiple named fields as columns, similar to a database row. | ||
| /// | ||
| /// This mirrors the Apache Arrow Struct array encoding and provides a columnar representation | ||
|
|
@@ -147,6 +161,9 @@ pub struct StructArray { | |
| pub(super) fields: Arc<[ArrayRef]>, | ||
| pub(super) validity: Validity, | ||
| pub(super) stats_set: ArrayStats, | ||
| /// true = child validity is a superset of struct validity (validity was pushed down) | ||
| /// false = default, no guarantee about relationship | ||
| pub(super) validity_pushed_down: bool, | ||
| } | ||
|
|
||
| pub struct StructArrayParts { | ||
|
|
@@ -157,12 +174,78 @@ pub struct StructArrayParts { | |
| } | ||
|
|
||
| impl StructArray { | ||
| /// Return the struct fields without the validity of the struct applied | ||
| /// Note this field may not have the validity of the parent struct applied. | ||
| /// Should use `masked_fields` instead, unless you know what you are doing. | ||
| pub fn unmasked_fields(&self) -> &Arc<[ArrayRef]> { | ||
| &self.fields | ||
| } | ||
|
|
||
| /// Return the struct field without the validity of the struct applied | ||
| pub fn masked_fields(&self) -> VortexResult<Vec<ArrayRef>> { | ||
| if !self.dtype.is_nullable() { | ||
| // fields need not be masked | ||
| return Ok(self.fields.to_vec()); | ||
| } | ||
|
|
||
| if self.has_validity_pushed_down() { | ||
| self.fields | ||
| .iter() | ||
| .cloned() | ||
| .map(|f| { | ||
| if f.dtype().is_nullable() { | ||
| Ok(f.into_array()) | ||
| } else { | ||
| let validity = self.validity().to_array(self.len); | ||
| f.mask(validity) | ||
| } | ||
| }) | ||
| .collect::<VortexResult<Vec<_>>>() | ||
| } else { | ||
| // Apply struct validity to all fields | ||
| let struct_validity = self.validity().to_array(self.len); | ||
| self.fields | ||
| .iter() | ||
| .map(move |f| f.clone().mask(struct_validity.clone())) | ||
| .collect::<VortexResult<Vec<_>>>() | ||
| } | ||
| } | ||
|
|
||
| /// Return the struct field with name `name` with the struct validity already applied. | ||
| /// If the struct has no field with that `name` an error is returned. | ||
| pub fn field_by_name(&self, name: impl AsRef<str>) -> VortexResult<ArrayRef> { | ||
| let name = name.as_ref(); | ||
| self.field_by_name_opt(name)?.ok_or_else(|| { | ||
| vortex_err!( | ||
| "Field {name} not found in struct array with names {:?}", | ||
| self.names() | ||
| ) | ||
| }) | ||
| } | ||
|
|
||
| /// Return the struct field with name `name` with the struct validity already applied. | ||
| /// If the struct has no field with that `name` Ok(None) is returned. | ||
| pub fn field_by_name_opt(&self, name: impl AsRef<str>) -> VortexResult<Option<ArrayRef>> { | ||
joseph-isaacs marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| let name = name.as_ref(); | ||
| self.struct_fields() | ||
| .find(name) | ||
| .map(|idx| { | ||
| let field = self.fields[idx].clone(); | ||
| // Non-nullable struct: return field as-is (no struct validity to apply) | ||
| if !self.dtype.is_nullable() { | ||
| return Ok(field); | ||
| } | ||
| // Non-nullable field: always apply struct validity (even with validity_pushed_down, | ||
| // since we can't push validity to non-nullable fields) | ||
| if !field.dtype().is_nullable() { | ||
| return field.mask(self.validity().to_array(self.len)); | ||
| } | ||
| // Nullable field: return as-is (validity is either in the field or was pushed down) | ||
| Ok(field) | ||
|
Comment on lines
+231
to
+242
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel like this is actually a larger semantic change than it seems, because we would then not have this logic for the other nested types (list, list view, and fixed size list). For list and fsl maybe this makes sense. But the scariest example would be list view, what does pushing down validity even mean for list view given there can be overlaps in the views? I won't block this since it seems like we need it for other things(?), but it's something we should at least think about.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At the very least, I don't think it should be pushed down by default, only when you request a field with validity pushed down
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would suggest we do this everywhere, since you can never read back the value. |
||
| }) | ||
| .transpose() | ||
| } | ||
|
|
||
| /// Note this field may not have the validity of the parent struct applied. | ||
| /// Should use `field_by_name` instead. | ||
| pub fn unmasked_field_by_name(&self, name: impl AsRef<str>) -> VortexResult<&ArrayRef> { | ||
| let name = name.as_ref(); | ||
| self.unmasked_field_by_name_opt(name).ok_or_else(|| { | ||
|
|
@@ -173,7 +256,8 @@ impl StructArray { | |
| }) | ||
| } | ||
|
|
||
| /// Return the struct field without the validity of the struct applied | ||
| /// Note this field may not have the validity of the parent struct applied. | ||
| /// Should use `field_by_name_opt` instead. | ||
| pub fn unmasked_field_by_name_opt(&self, name: impl AsRef<str>) -> Option<&ArrayRef> { | ||
| let name = name.as_ref(); | ||
| self.struct_fields().find(name).map(|idx| &self.fields[idx]) | ||
|
|
@@ -286,6 +370,7 @@ impl StructArray { | |
| fields, | ||
| validity, | ||
| stats_set: Default::default(), | ||
| validity_pushed_down: false, | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -473,4 +558,138 @@ impl StructArray { | |
|
|
||
| Self::try_new_with_dtype(children, new_fields, self.len, self.validity.clone()) | ||
| } | ||
|
|
||
| /// Returns whether validity has been pushed down into children. | ||
| /// | ||
| /// When true, child validity is a superset of struct validity (children include | ||
| /// the struct's nulls baked in). This is an optimization that allows readers to | ||
| /// skip combining struct+child validity when extracting fields. | ||
| pub fn has_validity_pushed_down(&self) -> bool { | ||
| #[cfg(debug_assertions)] | ||
| if self.validity_pushed_down { | ||
| self.validate_validity_pushed_down() | ||
| .vortex_expect("validity_pushed_down invariant violated"); | ||
| } | ||
| self.validity_pushed_down | ||
| } | ||
|
|
||
| /// Checks that the validity_pushed_down invariant holds. | ||
| /// | ||
| /// When `validity_pushed_down` is true, for every nullable child field, | ||
| /// the child's validity must be a superset of the struct's validity. | ||
| /// That is, wherever the struct is invalid (null), the child must also be invalid. | ||
| #[cfg(debug_assertions)] | ||
| fn validate_validity_pushed_down(&self) -> VortexResult<()> { | ||
| if !self.validity_pushed_down { | ||
| return Ok(()); | ||
| } | ||
|
|
||
| let struct_validity = self.validity_mask()?; | ||
|
|
||
| for (idx, field) in self.fields.iter().enumerate() { | ||
| // Only check nullable children - non-nullable children cannot have validity pushed down | ||
| if !field.dtype().is_nullable() { | ||
| continue; | ||
| } | ||
|
|
||
| let child_validity = field.validity_mask()?; | ||
|
|
||
| // Check invariant: struct_invalid => child_invalid | ||
| // Equivalently: (!struct_validity) & child_validity should be all-false | ||
| // If struct is invalid (false) but child is valid (true), that's a violation | ||
| let violation = &(!&struct_validity) & &child_validity; | ||
| if !violation.all_false() { | ||
| vortex_bail!( | ||
| "validity_pushed_down invariant violated for field {}: \ | ||
| struct has nulls at positions where child is valid", | ||
| idx | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| /// Set the validity_pushed_down flag. | ||
| /// | ||
| /// For non-nullable structs, this is a no-op (flag stays false) since there's no validity to | ||
| /// push down | ||
| /// | ||
| /// For nullable structs, setting this to true indicates that child validity | ||
| /// is a superset of struct validity (children include struct's nulls). | ||
| /// | ||
| /// # Safety | ||
| /// | ||
| /// If set all non-nullable field must have their nullability be a superset of the struct | ||
| /// validity | ||
| pub unsafe fn with_validity_pushed_down(mut self, validity_pushed_down: bool) -> Self { | ||
| // For non-nullable structs, the flag is meaningless - keep it false | ||
| if !self.dtype.is_nullable() { | ||
| return self; | ||
| } | ||
| self.validity_pushed_down = validity_pushed_down; | ||
|
|
||
| #[cfg(debug_assertions)] | ||
| if validity_pushed_down { | ||
| self.validate_validity_pushed_down() | ||
| .vortex_expect("validity_pushed_down invariant violated"); | ||
| } | ||
|
|
||
| self | ||
| } | ||
|
|
||
| /// Push struct validity down into each child field. | ||
| /// | ||
| /// For nullable structs with non-trivial validity, this applies the validity | ||
| /// mask to each **nullable** child field, making child validity a superset | ||
| /// of parent validity. Non-nullable children are left unchanged to preserve | ||
| /// their dtype. | ||
| /// | ||
| /// The struct validity is **preserved** (DType never changes). The | ||
| /// `validity_pushed_down` flag indicates that nullable children already include | ||
| /// the parent's nulls, so readers can skip combining validities for those fields. | ||
| /// | ||
| /// For non-nullable structs or trivial validity, this is essentially a no-op. | ||
| pub fn compact(&self) -> VortexResult<Self> { | ||
| // For non-nullable structs, nothing to push down | ||
| if !self.dtype.is_nullable() { | ||
| return Ok(self.clone()); | ||
| } | ||
|
|
||
| // If validity is trivial (AllValid), nothing to push down | ||
| // but mark as pushed down since children trivially include parent validity | ||
| if self.validity.all_valid(self.len)? { | ||
| // # Safety no validity to push down | ||
| return Ok(unsafe { self.clone().with_validity_pushed_down(true) }); | ||
| } | ||
|
|
||
| // Get the validity mask - mask() expects true = set to null, so we invert the validity | ||
| let validity_mask = self.validity_mask()?.not(); | ||
|
|
||
| // Apply mask only to nullable children - non-nullable children cannot have their | ||
| // dtype changed, so we leave them alone | ||
| let new_fields: Vec<ArrayRef> = self | ||
| .unmasked_fields() | ||
| .iter() | ||
| .map(|field| { | ||
| if field.dtype().is_nullable() { | ||
| mask(field.as_ref(), &validity_mask) | ||
| } else { | ||
| Ok(field.clone()) | ||
| } | ||
| }) | ||
| .collect::<VortexResult<_>>()?; | ||
|
|
||
| // Create new struct with same validity but updated children | ||
| // # Safety mask of struct validity applied to each child. | ||
| Ok(unsafe { | ||
| StructArray::try_new( | ||
| self.names().clone(), | ||
| new_fields, | ||
| self.len(), | ||
| self.validity.clone(), // Keep original validity | ||
| )? | ||
| .with_validity_pushed_down(true) | ||
| }) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why only push this into null children why not all ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is unneeded (free to add on read) and changes the dtype of the child.