Skip to content

Commit a0b964a

Browse files
committed
[ENH] Put both token id and token str in the statistics
1 parent 7f5094a commit a0b964a

File tree

1 file changed

+55
-20
lines changed

1 file changed

+55
-20
lines changed

rust/worker/src/execution/functions/statistics.rs

Lines changed: 55 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ enum StatisticsValue {
6565
/// String metadata value associated with a record.
6666
Str(String),
6767
/// Sparse vector index observed in metadata, paired with its token string when one is available.
68-
SparseVector(u32),
68+
SparseVector(u32, Option<String>),
6969
}
7070

7171
impl StatisticsValue {
@@ -76,7 +76,7 @@ impl StatisticsValue {
7676
Self::Int(_) => "int",
7777
Self::Float(_) => "float",
7878
Self::Str(_) => "str",
79-
Self::SparseVector(_) => "sparse",
79+
Self::SparseVector(_, _) => "sparse",
8080
}
8181
}
8282

@@ -87,12 +87,12 @@ impl StatisticsValue {
8787
Self::Int(_) => "i",
8888
Self::Float(_) => "f",
8989
Self::Str(_) => "s",
90-
Self::SparseVector(_) => "sv",
90+
Self::SparseVector(_, _) => "sv",
9191
}
9292
}
9393

9494
/// A stable representation of the statistic's value.
95-
fn stable_value(&self) -> String {
95+
fn stable_value1(&self) -> String {
9696
match self {
9797
Self::Bool(b) => {
9898
format!("{b}")
@@ -102,16 +102,27 @@ impl StatisticsValue {
102102
}
103103
Self::Str(s) => s.clone(),
104104
Self::Float(f) => format!("{f:.16e}"),
105-
Self::SparseVector(index) => {
105+
Self::SparseVector(index, _) => {
106106
format!("{index}")
107107
}
108108
}
109109
}
110110

111+
/// The token string recorded alongside a sparse vector index, if any; `None` for all other value types.
112+
fn stable_value2(&self) -> Option<String> {
113+
match self {
114+
Self::Bool(_) => None,
115+
Self::Int(_) => None,
116+
Self::Str(_) => None,
117+
Self::Float(_) => None,
118+
Self::SparseVector(_, token) => token.clone(),
119+
}
120+
}
121+
111122
/// A stable string representation of a statistics value with type tag.
112123
/// Kept separate from `Display` so the display representation can change independently.
113-
fn stable_string(&self) -> String {
114-
format!("{}:{}", self.type_prefix(), self.stable_value())
124+
fn stable_string1(&self) -> String {
125+
format!("{}:{}", self.type_prefix(), self.stable_value1())
115126
}
116127

117128
/// Convert MetadataValue to a vector of StatisticsValue.
@@ -122,18 +133,31 @@ impl StatisticsValue {
122133
MetadataValue::Int(i) => vec![StatisticsValue::Int(*i)],
123134
MetadataValue::Float(f) => vec![StatisticsValue::Float(*f)],
124135
MetadataValue::Str(s) => vec![StatisticsValue::Str(s.clone())],
125-
MetadataValue::SparseVector(sparse) => sparse
126-
.indices
127-
.iter()
128-
.map(|index| StatisticsValue::SparseVector(*index))
129-
.collect(),
136+
MetadataValue::SparseVector(sparse) => {
137+
if let Some(tokens) = sparse.tokens.as_ref() {
138+
sparse
139+
.indices
140+
.iter()
141+
.zip(tokens.iter())
142+
.map(|(index, token)| {
143+
StatisticsValue::SparseVector(*index, Some(token.clone()))
144+
})
145+
.collect()
146+
} else {
147+
sparse
148+
.indices
149+
.iter()
150+
.map(|index| StatisticsValue::SparseVector(*index, None))
151+
.collect()
152+
}
153+
}
130154
}
131155
}
132156
}
133157

134158
impl std::fmt::Display for StatisticsValue {
135159
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
136-
write!(f, "{}", self.stable_string())
160+
write!(f, "{}", self.stable_string1())
137161
}
138162
}
139163

@@ -144,7 +168,9 @@ impl PartialEq for StatisticsValue {
144168
(Self::Int(lhs), Self::Int(rhs)) => lhs == rhs,
145169
(Self::Float(lhs), Self::Float(rhs)) => lhs.to_bits() == rhs.to_bits(),
146170
(Self::Str(lhs), Self::Str(rhs)) => lhs == rhs,
147-
(Self::SparseVector(lhs), Self::SparseVector(rhs)) => lhs == rhs,
171+
(Self::SparseVector(lhs1, lhs2), Self::SparseVector(rhs1, rhs2)) => {
172+
lhs1 == rhs1 && lhs2 == rhs2
173+
}
148174
_ => false,
149175
}
150176
}
@@ -160,7 +186,10 @@ impl Hash for StatisticsValue {
160186
StatisticsValue::Int(value) => value.hash(state),
161187
StatisticsValue::Float(value) => value.to_bits().hash(state),
162188
StatisticsValue::Str(value) => value.hash(state),
163-
StatisticsValue::SparseVector(value) => value.hash(state),
189+
StatisticsValue::SparseVector(value, token) => {
190+
value.hash(state);
191+
token.hash(state);
192+
}
164193
}
165194
}
166195
}
@@ -202,10 +231,10 @@ impl AttachedFunctionExecutor for StatisticsFunctionExecutor {
202231
let mut records = Vec::with_capacity(counts.len());
203232
for (key, inner_map) in counts.into_iter() {
204233
for (stats_value, count) in inner_map.into_iter() {
205-
let stable_value = stats_value.stable_value();
206-
let stable_string = stats_value.stable_string();
207-
let record_id = format!("{key}::{stable_string}");
208-
let document = format!("statistics about {key} for {stable_string}");
234+
let stable_value1 = stats_value.stable_value1();
235+
let stable_string1 = stats_value.stable_string1();
236+
let record_id = format!("{key}::{stable_string1}");
237+
let document = format!("statistics about {key} for {stable_string1}");
209238

210239
let mut metadata = HashMap::with_capacity(4);
211240
metadata.insert("count".to_string(), count.output());
@@ -214,7 +243,13 @@ impl AttachedFunctionExecutor for StatisticsFunctionExecutor {
214243
"type".to_string(),
215244
UpdateMetadataValue::Str(stats_value.stable_type().to_string()),
216245
);
217-
metadata.insert("value".to_string(), UpdateMetadataValue::Str(stable_value));
246+
metadata.insert("value".to_string(), UpdateMetadataValue::Str(stable_value1));
247+
if let Some(stable_value2) = stats_value.stable_value2() {
248+
metadata.insert(
249+
"value2".to_string(),
250+
UpdateMetadataValue::Str(stable_value2),
251+
);
252+
}
218253

219254
keys.insert(record_id.clone());
220255
records.push(LogRecord {

0 commit comments

Comments
 (0)