Skip to content

Commit 0177c42

Browse files
committed
icu4x-datagen: add attribute filtering to cli
1 parent e721b81 commit 0177c42

File tree

4 files changed

+70
-0
lines changed

4 files changed

+70
-0
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -299,6 +299,7 @@ rand = "0.9"
299299
rand_distr = "0.5"
300300
rand_pcg = "0.9"
301301
rayon = "1.3.0"
302+
regex = "1.12.2"
302303
rkyv = "0.7"
303304
rmp-serde = "1.2.0"
304305
serde_json = "1.0.45"

provider/icu4x-datagen/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,10 @@ icu_provider_registry = { workspace = true }
2626
icu_provider_blob = { workspace = true, features = ["alloc"], optional = true }
2727

2828
clap = { workspace = true, features = ["derive"] }
29+
displaydoc = { workspace = true }
2930
eyre = { workspace = true }
3031
log = { workspace = true }
32+
regex = { workspace = true }
3133
simple_logger = { workspace = true }
3234

3335
[features]

provider/icu4x-datagen/src/main.rs

Lines changed: 65 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
)]
2828

2929
use clap::{Parser, ValueEnum};
30+
use displaydoc::Display;
3031
use eyre::WrapErr;
3132
use icu_provider::export::ExportableProvider;
3233
use icu_provider::hello_world::HelloWorldV1;
@@ -35,9 +36,54 @@ use icu_provider_export::prelude::*;
3536
use icu_provider_export::ExportMetadata;
3637
#[cfg(feature = "provider")]
3738
use icu_provider_source::SourceDataProvider;
39+
use regex::Regex;
3840
use simple_logger::SimpleLogger;
3941
use std::collections::HashMap;
4042
use std::path::PathBuf;
43+
use std::str::FromStr;
44+
45+
/// One parsed `--attribute-filter` value of the form `domain=/regex/`.
///
/// Built by the `FromStr` impl for this type; the CLI collects these into
/// `Cli::attribute_filter`.
#[derive(Clone)]
46+
struct Filter {
47+
/// Text before the first `=` in the argument. `main` groups filters by
/// this value and passes it to `with_marker_attributes_filter`.
domain: String,
48+
/// Pattern compiled from the text between the outer `/` delimiters.
regex: Regex,
49+
}
50+
51+
// Reasons a `domain=/regex/` argument can fail to parse into a `Filter`.
// The user-facing messages come from the `#[displaydoc(...)]` attributes.
#[derive(Debug, Display)]
52+
enum FilterError {
53+
// The argument contains no `=`, so there is nothing to split into a
// domain and a pattern.
#[displaydoc("no filter found. specify one after an =")]
54+
NoFilter,
55+
// The text after the first `=` does not start with `/`.
#[displaydoc("opening / delimiter for regex not found")]
56+
NoOpeningSlash,
57+
// After removing the opening `/`, the remainder does not end with `/`.
// A lone `/` after the `=` also lands here, because it is consumed as
// the opening delimiter.
#[displaydoc("closing / delimiter for regex not found")]
58+
NoClosingSlash,
59+
// The delimited text was rejected by `Regex::new`; the message is the
// wrapped error's own `Display` output.
#[displaydoc("{0}")]
60+
Regex(regex::Error),
61+
}
62+
63+
/// Wraps a regex compilation error in `FilterError::Regex`, which lets
/// `Filter::from_str` apply `?` directly to `Regex::new`.
impl From<regex::Error> for FilterError {
64+
fn from(value: regex::Error) -> Self {
65+
FilterError::Regex(value)
66+
}
67+
}
68+
69+
// Empty impl: the trait's default methods are used, with `Debug` and
// `Display` supplied by the derives on `FilterError`.
// NOTE(review): presumably needed so clap can surface parse failures for
// the `Vec<Filter>` argument; confirm against clap's value-parser bounds.
impl std::error::Error for FilterError {}
70+
71+
/// Parses a filter argument written as `domain=/regex/`.
impl FromStr for Filter {
72+
type Err = FilterError;
73+
/// Splits `s` into a domain and a slash-delimited pattern, then compiles
/// the pattern.
///
/// # Errors
///
/// - `FilterError::NoFilter` if `s` contains no `=`.
/// - `FilterError::NoOpeningSlash` if the text after the first `=` does
///   not start with `/`.
/// - `FilterError::NoClosingSlash` if what remains after the opening `/`
///   does not end with `/`.
/// - `FilterError::Regex` if `Regex::new` rejects the pattern.
fn from_str(s: &str) -> Result<Self, Self::Err> {
74+
// Split on the FIRST `=` only, so the pattern itself may contain `=`.
let (domain, regex) = s.split_once('=').ok_or(FilterError::NoFilter)?;
75+
76+
// Only the outermost delimiters are removed; any `/` inside the
// pattern is passed to the regex compiler unchanged.
let regex = regex.strip_prefix('/').ok_or(FilterError::NoOpeningSlash)?;
77+
let regex = regex.strip_suffix('/').ok_or(FilterError::NoClosingSlash)?;
78+
79+
// `?` converts `regex::Error` through `From<regex::Error> for FilterError`.
let regex = Regex::new(regex)?;
80+
81+
Ok(Filter {
82+
domain: domain.to_owned(),
83+
regex,
84+
})
85+
}
86+
}
4187

4288
#[derive(Parser)]
4389
#[command(name = "icu4x-datagen")]
@@ -169,6 +215,10 @@ struct Cli {
169215
#[arg(help = "Analyzes the binary and only includes markers that are used by the binary.")]
170216
markers_for_bin: Option<PathBuf>,
171217

218+
#[arg(long, value_name = "FILTER")]
219+
#[arg(help = "Filter attributes on markers for a domain. Accepts form `domain=/regex/`.")]
220+
attribute_filter: Vec<Filter>,
221+
172222
#[arg(long, short, num_args = 0..)]
173223
#[cfg_attr(feature = "provider", arg(default_value = "recommended"))]
174224
#[arg(
@@ -528,6 +578,21 @@ fn main() -> eyre::Result<()> {
528578
driver.with_segmenter_models(cli.segmenter_models.clone())
529579
};
530580

581+
let attribute_filters = cli
582+
.attribute_filter
583+
.iter()
584+
.fold(HashMap::new(), |mut map, filter| {
585+
map.entry(&filter.domain)
586+
.and_modify(|v: &mut Vec<_>| v.push(filter.regex.clone()))
587+
.or_insert_with(|| vec![filter.regex.clone()]);
588+
map
589+
});
590+
for (domain, filters) in attribute_filters {
591+
driver = driver.with_marker_attributes_filter(domain, move |attr| {
592+
filters.iter().all(|regex| regex.is_match(attr))
593+
})
594+
}
595+
531596
let metadata: Result<ExportMetadata, DataError> = match cli.format {
532597
#[cfg(not(feature = "fs_exporter"))]
533598
Format::Fs => {

0 commit comments

Comments
 (0)