Skip to content

Commit 36086d2

Browse files
committed
[ENH] Put both token id and token str in the statistics
1 parent 2eca285 commit 36086d2

File tree

1 file changed

+58
-20
lines changed

1 file changed

+58
-20
lines changed

rust/worker/src/execution/functions/statistics.rs

Lines changed: 58 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ enum StatisticsValue {
6767
/// String metadata value associated with a record.
6868
Str(String),
6969
/// Sparse vector index observed in metadata, paired with its token string when one is available.
70-
SparseVector(u32),
70+
SparseVector(u32, Option<String>),
7171
}
7272

7373
impl StatisticsValue {
@@ -78,7 +78,7 @@ impl StatisticsValue {
7878
Self::Int(_) => "int",
7979
Self::Float(_) => "float",
8080
Self::Str(_) => "str",
81-
Self::SparseVector(_) => "sparse",
81+
Self::SparseVector(_, _) => "sparse",
8282
}
8383
}
8484

@@ -89,12 +89,12 @@ impl StatisticsValue {
8989
Self::Int(_) => "i",
9090
Self::Float(_) => "f",
9191
Self::Str(_) => "s",
92-
Self::SparseVector(_) => "sv",
92+
Self::SparseVector(_, _) => "sv",
9393
}
9494
}
9595

9696
/// A stable representation of the statistics's value.
97-
fn stable_value(&self) -> String {
97+
fn stable_value_index(&self) -> String {
9898
match self {
9999
Self::Bool(b) => {
100100
format!("{b}")
@@ -104,16 +104,27 @@ impl StatisticsValue {
104104
}
105105
Self::Str(s) => s.clone(),
106106
Self::Float(f) => format!("{f:.16e}"),
107-
Self::SparseVector(index) => {
107+
Self::SparseVector(index, _) => {
108108
format!("{index}")
109109
}
110110
}
111111
}
112112

113+
/// The token string attached to a sparse vector index, if one was recorded; `None` for every other variant.
114+
fn stable_value_token(&self) -> Option<String> {
115+
match self {
116+
Self::Bool(_) => None,
117+
Self::Int(_) => None,
118+
Self::Str(_) => None,
119+
Self::Float(_) => None,
120+
Self::SparseVector(_, token) => token.clone(),
121+
}
122+
}
123+
113124
/// A stable string representation of a statistics value with type tag.
114125
/// Separate so display repr can change.
115-
fn stable_string(&self) -> String {
116-
format!("{}:{}", self.type_prefix(), self.stable_value())
126+
fn stable_value_string(&self) -> String {
127+
format!("{}:{}", self.type_prefix(), self.stable_value_index())
117128
}
118129

119130
/// Convert MetadataValue to a vector of StatisticsValue.
@@ -124,18 +135,31 @@ impl StatisticsValue {
124135
MetadataValue::Int(i) => vec![StatisticsValue::Int(*i)],
125136
MetadataValue::Float(f) => vec![StatisticsValue::Float(*f)],
126137
MetadataValue::Str(s) => vec![StatisticsValue::Str(s.clone())],
127-
MetadataValue::SparseVector(sparse) => sparse
128-
.indices
129-
.iter()
130-
.map(|index| StatisticsValue::SparseVector(*index))
131-
.collect(),
138+
MetadataValue::SparseVector(sparse) => {
139+
if let Some(tokens) = sparse.tokens.as_ref() {
140+
sparse
141+
.indices
142+
.iter()
143+
.zip(tokens.iter())
144+
.map(|(index, token)| {
145+
StatisticsValue::SparseVector(*index, Some(token.clone()))
146+
})
147+
.collect()
148+
} else {
149+
sparse
150+
.indices
151+
.iter()
152+
.map(|index| StatisticsValue::SparseVector(*index, None))
153+
.collect()
154+
}
155+
}
132156
}
133157
}
134158
}
135159

136160
impl std::fmt::Display for StatisticsValue {
137161
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
138-
write!(f, "{}", self.stable_string())
162+
write!(f, "{}", self.stable_value_string())
139163
}
140164
}
141165

@@ -146,7 +170,9 @@ impl PartialEq for StatisticsValue {
146170
(Self::Int(lhs), Self::Int(rhs)) => lhs == rhs,
147171
(Self::Float(lhs), Self::Float(rhs)) => lhs.to_bits() == rhs.to_bits(),
148172
(Self::Str(lhs), Self::Str(rhs)) => lhs == rhs,
149-
(Self::SparseVector(lhs), Self::SparseVector(rhs)) => lhs == rhs,
173+
(Self::SparseVector(lhs1, lhs2), Self::SparseVector(rhs1, rhs2)) => {
174+
lhs1 == rhs1 && lhs2 == rhs2
175+
}
150176
_ => false,
151177
}
152178
}
@@ -162,7 +188,10 @@ impl Hash for StatisticsValue {
162188
StatisticsValue::Int(value) => value.hash(state),
163189
StatisticsValue::Float(value) => value.to_bits().hash(state),
164190
StatisticsValue::Str(value) => value.hash(state),
165-
StatisticsValue::SparseVector(value) => value.hash(state),
191+
StatisticsValue::SparseVector(value, token) => {
192+
value.hash(state);
193+
token.hash(state);
194+
}
166195
}
167196
}
168197
}
@@ -203,10 +232,10 @@ impl AttachedFunctionExecutor for StatisticsFunctionExecutor {
203232
let mut records = Vec::with_capacity(counts.len());
204233
for (key, inner_map) in counts.into_iter() {
205234
for (stats_value, count) in inner_map.into_iter() {
206-
let stable_value = stats_value.stable_value();
207-
let stable_string = stats_value.stable_string();
208-
let record_id = format!("{key}::{stable_string}");
209-
let document = format!("statistics about {key} for {stable_string}");
235+
let stable_value_index = stats_value.stable_value_index();
236+
let stable_value_string = stats_value.stable_value_string();
237+
let record_id = format!("{key}::{stable_value_string}");
238+
let document = format!("statistics about {key} for {stable_value_string}");
210239

211240
let mut metadata = HashMap::with_capacity(4);
212241
metadata.insert("count".to_string(), count.output());
@@ -215,7 +244,16 @@ impl AttachedFunctionExecutor for StatisticsFunctionExecutor {
215244
"type".to_string(),
216245
UpdateMetadataValue::Str(stats_value.stable_type().to_string()),
217246
);
218-
metadata.insert("value".to_string(), UpdateMetadataValue::Str(stable_value));
247+
metadata.insert(
248+
"value".to_string(),
249+
UpdateMetadataValue::Str(stable_value_index),
250+
);
251+
if let Some(stable_value_token) = stats_value.stable_value_token() {
252+
metadata.insert(
253+
"value_token".to_string(),
254+
UpdateMetadataValue::Str(stable_value_token),
255+
);
256+
}
219257

220258
keys.insert(record_id.clone());
221259
records.push(LogRecord {

0 commit comments

Comments
 (0)