136 changes: 136 additions & 0 deletions vortex-array/src/expr/exprs/like.rs
@@ -9,6 +9,7 @@ use vortex_error::VortexResult;
use vortex_error::vortex_bail;
use vortex_error::vortex_err;
use vortex_proto::expr as pb;
use vortex_scalar::StringLike;

use crate::ArrayRef;
use crate::compute::LikeOptions;
@@ -19,9 +20,16 @@ use crate::expr::ExecutionArgs;
use crate::expr::ExecutionResult;
use crate::expr::ExprId;
use crate::expr::Expression;
use crate::expr::Literal;
use crate::expr::StatsCatalog;
use crate::expr::VTable;
use crate::expr::VTableExt;
use crate::expr::and;
use crate::expr::gt;
use crate::expr::gt_eq;
use crate::expr::lit;
use crate::expr::lt;
use crate::expr::or;

/// Expression that performs SQL LIKE pattern matching.
pub struct Like;
@@ -127,6 +135,67 @@ impl VTable for Like {
fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
false
}

fn stat_falsification(
&self,
like_opts: &LikeOptions,
expr: &Expression,
catalog: &dyn StatsCatalog,
) -> Option<Expression> {
// Attempt to do min/max pruning for LIKE 'exact' or LIKE 'prefix%'

// Don't attempt to handle ilike or negated like
if like_opts.negated || like_opts.case_insensitive {
Contributor Author: In theory we could handle negated like if we know that the block ONLY contains things which match the prefix, but we'd need to add a starts_with expression and I didn't want to do that here

Contributor Author: actually, we can do this without a starts_with, if we see that min >= prefix AND min < succ AND max >= prefix AND max < succ AND null_count = 0 (restated as a standalone sketch after this impl block)
return None;
}

// Extract the pattern out
let pat = expr.child(1).as_::<Literal>();

// LIKE NULL is nonsensical, don't try to handle it
let pat_str = pat.as_utf8().value()?;

let src = expr.child(0).clone();
let src_min = src.stat_min(catalog)?;
let src_max = src.stat_max(catalog)?;

match LikeVariant::from_str(&pat_str)? {
LikeVariant::Exact(text) => {
// col LIKE 'exact' ==> col.min > 'exact' || col.max < 'exact'
Some(or(gt(src_min, lit(text)), lt(src_max, lit(text))))
}
LikeVariant::Prefix(prefix) => {
// col LIKE 'prefix%' ==> col.min >= succ('prefix') (i.e. 'prefiy') || col.max < 'prefix'
let succ = prefix.to_string().increment().ok()?;

Some(or(gt_eq(src_min, lit(succ)), lt(src_max, lit(prefix))))
}
}
}
}
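
A standalone restatement of the negated-LIKE idea from the review comments above (illustrative only; plain Rust over stat values rather than vortex expressions, and not part of this diff): col NOT LIKE 'prefix%' can be pruned only when every value in the block, and no null, starts with the prefix.

fn can_prune_not_like_prefix(min: &str, max: &str, null_count: u64, prefix: &str, succ: &str) -> bool {
    // `succ` is the incremented prefix, e.g. "prefiy" for "prefix".
    min >= prefix && min < succ && max >= prefix && max < succ && null_count == 0
}

// e.g. can_prune_not_like_prefix("prefix_a", "prefixzz", 0, "prefix", "prefiy") == true
//      can_prune_not_like_prefix("prefab",   "prefixzz", 0, "prefix", "prefiy") == false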

/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate.
#[derive(Debug, PartialEq)]
enum LikeVariant<'a> {
Exact(&'a str),
Prefix(&'a str),
}

impl<'a> LikeVariant<'a> {
/// Parse a LIKE pattern string into its relevant variant
fn from_str(string: &str) -> Option<LikeVariant<'_>> {
let Some(wildcard_pos) = string.find(['%', '_']) else {
return Some(LikeVariant::Exact(string));
};

// Can't handle wildcard in the front.
if wildcard_pos == 0 {
return None;
}

let prefix = &string[..wildcard_pos];
Some(LikeVariant::Prefix(prefix))
}
}

pub fn like(child: Expression, pattern: Expression) -> Expression {
@@ -176,12 +245,17 @@ mod tests {

use crate::ToCanonical;
use crate::arrays::BoolArray;
use crate::expr::col;
use crate::expr::exprs::get_item::get_item;
use crate::expr::exprs::like::LikeVariant;
use crate::expr::exprs::like::like;
use crate::expr::exprs::like::not_ilike;
use crate::expr::exprs::literal::lit;
use crate::expr::exprs::not::not;
use crate::expr::exprs::root::root;
use crate::expr::ilike;
use crate::expr::not_like;
use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;

#[test]
fn invert_booleans() {
@@ -217,4 +291,66 @@
let expr2 = not_ilike(root(), lit("test*"));
assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
}

#[test]
fn test_like_variant() {
// Supported patterns
assert_eq!(
LikeVariant::from_str("simple"),
Some(LikeVariant::Exact("simple"))
);
assert_eq!(
LikeVariant::from_str("prefix%"),
Some(LikeVariant::Prefix("prefix"))
);
assert_eq!(
LikeVariant::from_str("first%rest_stuff"),
Some(LikeVariant::Prefix("first"))
);

// Unsupported patterns
assert_eq!(LikeVariant::from_str("%suffix"), None);
assert_eq!(LikeVariant::from_str("_pattern"), None);
}

#[test]
fn test_like_pushdown() {
// Test that LIKE prefix and exact-match filters can be pushed down into stats filtering
// at scan time.
let catalog = TrackingStatsCatalog::default();

let pruning_expr = like(col("a"), lit("prefix%"))
.stat_falsification(&catalog)
.expect("LIKE stat falsification");

insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);

// Multiple wildcards
let pruning_expr = like(col("a"), lit("pref%ix%"))
.stat_falsification(&catalog)
.expect("LIKE stat falsification");
insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);

let pruning_expr = like(col("a"), lit("pref_ix_"))
.stat_falsification(&catalog)
.expect("LIKE stat falsification");
insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);

// Exact match
let pruning_expr = like(col("a"), lit("exactly"))
.stat_falsification(&catalog)
.expect("LIKE stat falsification");
insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);

// Suffix search skips pushdown
let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
assert_eq!(pruning_expr, None);

// NOT LIKE, ILIKE not supported currently
assert_eq!(
None,
not_like(col("a"), lit("a")).stat_falsification(&catalog)
);
assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
}
}
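
For intuition on why these falsifications prune correctly, the snapshot expressions above reduce to plain string comparisons; a small worked check follows (invented stat values, illustrative only, not part of the diff):

#[test]
fn prefix_falsification_intuition() {
    // col LIKE 'prefix%' is falsified when min >= "prefiy" (the successor of
    // "prefix") or max < "prefix".
    // Stats min = "prefab", max = "prefix9": neither side holds, keep the block.
    assert!(!("prefab" >= "prefiy" || "prefix9" < "prefix"));
    // Stats min = "qr", max = "zz": no value can start with "prefix", prune it.
    assert!("qr" >= "prefiy" || "zz" < "prefix");
}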
1 change: 0 additions & 1 deletion vortex-array/src/expr/exprs/mod.rs
@@ -17,7 +17,6 @@ pub(crate) mod operators;
pub(crate) mod pack;
pub(crate) mod root;
pub(crate) mod select;

pub use between::*;
pub use binary::*;
pub use cast::*;
2 changes: 1 addition & 1 deletion vortex-array/src/expr/pruning/mod.rs
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

mod pruning_expr;
pub(crate) mod pruning_expr;
mod relation;

pub use pruning_expr::RequiredStats;
10 changes: 5 additions & 5 deletions vortex-array/src/expr/pruning/pruning_expr.rs
@@ -23,7 +23,7 @@ pub type RequiredStats = Relation<FieldPath, Stat>;
// A catalog that returns a stat column whenever it is required, tracking all accessed
// stats and returning them later.
#[derive(Default)]
struct TrackingStatsCatalog {
pub(crate) struct TrackingStatsCatalog {
usage: RefCell<HashMap<(FieldPath, Stat), Expression>>,
}

@@ -37,7 +37,7 @@ impl TrackingStatsCatalog {

// A catalog that returns a stat column if it exists in the given scope.
struct ScopeStatsCatalog<'a> {
any_catalog: TrackingStatsCatalog,
inner: TrackingStatsCatalog,
available_stats: &'a FieldPathSet,
}

@@ -46,7 +46,7 @@ impl StatsCatalog for ScopeStatsCatalog<'_> {
let stat_path = field_path.clone().push(stat.name());

if self.available_stats.contains(&stat_path) {
self.any_catalog.stats_ref(field_path, stat)
self.inner.stats_ref(field_path, stat)
} else {
None
}
@@ -93,15 +93,15 @@ pub fn checked_pruning_expr(
available_stats: &FieldPathSet,
) -> Option<(Expression, RequiredStats)> {
let catalog = ScopeStatsCatalog {
any_catalog: Default::default(),
inner: Default::default(),
available_stats,
};

let expr = expr.stat_falsification(&catalog)?;

// TODO(joe): filter access by used exprs
let mut relation: Relation<FieldPath, Stat> = Relation::new();
for ((field_path, stat), _) in catalog.any_catalog.into_usages() {
for ((field_path, stat), _) in catalog.inner.into_usages() {
relation.insert(field_path, stat)
}

3 changes: 2 additions & 1 deletion vortex-scalar/src/binary.rs
@@ -95,7 +95,8 @@ impl<'a> BinaryScalar<'a> {
self.value.as_ref().map(|v| v.as_ref())
}

/// Constructs a value at most `max_length` in size that's greater than this value.
/// Constructs the next scalar of at most `max_length` bytes that's lexicographically greater
/// than this.
///
/// Returns None if constructing a greater value would overflow.
pub fn upper_bound(self, max_length: usize) -> Option<Self> {
103 changes: 79 additions & 24 deletions vortex-scalar/src/utf8.rs
@@ -22,6 +22,72 @@ use crate::InnerScalarValue;
use crate::Scalar;
use crate::ScalarValue;

/// Types that can hold a valid UTF-8 string.
pub trait StringLike: private::Sealed + Sized {
/// Replace the last codepoint in the string with the next codepoint.
///
/// This operation will attempt to reuse the original memory.
///
/// If incrementing the last char fails, or if the string is empty,
/// we return an Err with the original unmodified string.
fn increment(self) -> Result<Self, Self>;
}

mod private {
use vortex_buffer::BufferString;

use crate::StringLike;

pub trait Sealed {}

impl Sealed for String {}

impl StringLike for String {
fn increment(mut self) -> Result<String, String> {
let Some(last_char) = self.pop() else {
return Err(self);
};

if let Some(next_char) = char::from_u32(last_char as u32 + 1) {
self.push(next_char);
Ok(self)
} else {
// Return the original string
self.push(last_char);
Err(self)
}
}
}

impl Sealed for BufferString {}

impl StringLike for BufferString {
#[allow(clippy::unwrap_in_result, clippy::expect_used)]
fn increment(self) -> Result<BufferString, BufferString> {
if self.is_empty() {
return Err(self);
}

// Locate the last char and its byte offset.
let (last_idx, last_char) = self.char_indices().last().expect("non-empty");
if let Some(next_char) = char::from_u32(last_char as u32 + 1)
&& next_char.len_utf8() == last_char.len_utf8()
{
// Because the next char has the same byte width as the last char, we can overwrite
// the memory directly.
let mut bytes = self.into_inner().into_mut();
next_char.encode_utf8(&mut bytes.as_mut()[last_idx..]);

// SAFETY: we overwrite the last valid char with new valid char, so
// the buffer continues to hold valid UTF-8 data.
unsafe { Ok(BufferString::new_unchecked(bytes.freeze())) }
} else {
Err(self)
}
}
}
}
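
For reference, a minimal standalone sketch of the same idea in plain std (not the vortex API, and not part of this diff): incrementing the last codepoint yields a successor string, and every string sharing the original prefix sorts strictly below that successor, which is what makes the prefix pruning above sound.

fn increment_last_char(mut s: String) -> Option<String> {
    // Pop the last char and push its successor codepoint; None if the string is
    // empty or the successor is not a valid char (e.g. the surrogate gap).
    let last = s.pop()?;
    let next = char::from_u32(last as u32 + 1)?;
    s.push(next);
    Some(s)
}

#[test]
fn successor_bounds_the_prefix() {
    assert_eq!(increment_last_char("prefix".to_string()), Some("prefiy".to_string()));
    // Any string starting with "prefix" compares strictly less than "prefiy".
    assert!("prefixzzz" < "prefiy");
}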

/// A scalar value representing a UTF-8 encoded string.
///
/// This type provides a view into a UTF-8 string scalar value, which can be either
@@ -92,7 +158,8 @@ impl<'a> Utf8Scalar<'a> {
self.value.as_ref().map(|v| v.as_ref())
}

/// Constructs a value at most `max_length` in size that's greater than this value.
/// Constructs the next scalar of at most `max_length` bytes that's lexicographically greater
/// than this.
///
/// Returns None if constructing a greater value would overflow.
pub fn upper_bound(self, max_length: usize) -> Option<Self> {
@@ -102,29 +169,16 @@
.rfind(|p| value.is_char_boundary(*p))
.vortex_expect("Failed to find utf8 character boundary");

let utf8_mut = value
.get(..utf8_split_pos)
.vortex_expect("Slicing with existing index");

for (idx, original_char) in utf8_mut.char_indices().rev() {
let original_len = original_char.len_utf8();
if let Some(next_char) = char::from_u32(original_char as u32 + 1) {
// do not allow increasing byte width of incremented char
if next_char.len_utf8() == original_len {
let sliced = value.inner().slice(0..idx + original_len);
drop(value);
let mut result = sliced.into_mut();
next_char.encode_utf8(&mut result[idx..]);
return Some(Self {
dtype: self.dtype,
value: Some(Arc::new(unsafe {
BufferString::new_unchecked(result.freeze())
})),
});
}
}
}
None
let sliced = value.inner().slice(..utf8_split_pos);
drop(value);

// SAFETY: we slice to a char boundary so the sliced range contains valid UTF-8.
let sliced_buf = unsafe { BufferString::new_unchecked(sliced) };
let incremented = sliced_buf.increment().ok()?;
Some(Self {
dtype: self.dtype,
value: Some(Arc::new(incremented)),
})
} else {
Some(Self {
dtype: self.dtype,
@@ -382,6 +436,7 @@ mod tests {
#[test]
fn upper_bound_overflow() {
let utf8 = Scalar::utf8("🂑🂒🂓", Nullability::NonNullable);

assert!(
Utf8Scalar::try_from(&utf8)
.vortex_expect("utf8 scalar conversion should succeed")