Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ rand = "0.9"
rand_distr = "0.5"
rand_pcg = "0.9"
rayon = "1.3.0"
regex = "1.12.2"
rkyv = "0.7"
rmp-serde = "1.2.0"
serde_json = "1.0.45"
Expand Down
2 changes: 2 additions & 0 deletions provider/icu4x-datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ icu_provider_registry = { workspace = true }
icu_provider_blob = { workspace = true, features = ["alloc"], optional = true }

clap = { workspace = true, features = ["derive"] }
displaydoc = { workspace = true }
eyre = { workspace = true }
log = { workspace = true }
regex = { workspace = true }
simple_logger = { workspace = true }

[features]
Expand Down
115 changes: 115 additions & 0 deletions provider/icu4x-datagen/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
)]

use clap::{Parser, ValueEnum};
use displaydoc::Display;
use eyre::WrapErr;
use icu_provider::export::ExportableProvider;
use icu_provider::hello_world::HelloWorldV1;
Expand All @@ -35,9 +36,63 @@ use icu_provider_export::prelude::*;
use icu_provider_export::ExportMetadata;
#[cfg(feature = "provider")]
use icu_provider_source::SourceDataProvider;
use regex::Regex;
use simple_logger::SimpleLogger;
use std::collections::HashMap;
use std::path::PathBuf;
use std::str::FromStr;

/// One parsed `--attribute-filter` argument, written on the command line as
/// `domain=/regex/` or, to invert it, `domain=-/regex/` (see the `FromStr` impl).
#[derive(Clone)]
struct Filter {
/// The text before the first `=`; filters are later grouped by this key.
domain: String,
/// The pattern between the slashes, compiled inside an implicit `^(?:…)$` wrapper.
regex: Regex,
/// `true` when the pattern was prefixed with `-`; XOR-ed with the match result when filtering.
inverted: bool,
}

// Reasons a `--attribute-filter` value can fail to parse in `Filter::from_str`.
// Plain `//` comments are used in this enum on purpose, so that the only text
// associated with the derived `Display` impl is the explicit `#[displaydoc(...)]` strings.
#[derive(Debug, Display)]
enum FilterError {
// The argument contains no `=`, so no filter follows the domain.
#[displaydoc("no filter found. specify one after an =")]
NoFilter,
// After the optional leading `-`, the filter does not start with `/`.
#[displaydoc("opening / delimiter for regex not found")]
NoOpeningSlash,
// The text following the opening `/` does not end with `/`.
#[displaydoc("closing / delimiter for regex not found")]
NoClosingSlash,
// The pattern was rejected by `Regex::new`; displayed as the wrapped error's own message.
#[displaydoc("{0}")]
Regex(regex::Error),
}

impl From<regex::Error> for FilterError {
fn from(value: regex::Error) -> Self {
FilterError::Regex(value)
}
}

// Relies entirely on the default `Error` methods; the required `Debug` and
// `Display` impls come from the derives on `FilterError`.
// NOTE(review): presumably needed so the argument parser can report
// `Filter::from_str` failures — confirm against clap's value-parser bounds.
impl std::error::Error for FilterError {}

impl FromStr for Filter {
    type Err = FilterError;

    /// Parses a filter of the form `domain=/regex/` or `domain=-/regex/`.
    ///
    /// The input is split at the first `=`. A `-` directly after the `=`
    /// marks the filter as inverted. The pattern must be enclosed in `/`
    /// delimiters and is compiled so that it has to match the whole input.
    ///
    /// # Errors
    ///
    /// - [`FilterError::NoFilter`] if `s` contains no `=`.
    /// - [`FilterError::NoOpeningSlash`] if the filter does not start with `/`
    ///   (after the optional `-`).
    /// - [`FilterError::NoClosingSlash`] if the rest does not end with `/`.
    /// - [`FilterError::Regex`] if the pattern fails to compile.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let Some((domain, spec)) = s.split_once('=') else {
            return Err(FilterError::NoFilter);
        };

        // A leading `-` inverts the filter and is not part of the pattern.
        let (delimited, inverted) = match spec.strip_prefix('-') {
            Some(rest) => (rest, true),
            None => (spec, false),
        };

        // The closing slash is looked for only in what follows the opening
        // one, so a lone `/` is reported as a missing closing delimiter.
        let pattern = delimited
            .strip_prefix('/')
            .ok_or(FilterError::NoOpeningSlash)?
            .strip_suffix('/')
            .ok_or(FilterError::NoClosingSlash)?;

        // Anchor the user's pattern: wrap it in an implicit `^(?:)$`.
        let anchored = format!("^(?:{pattern})$");

        Ok(Filter {
            domain: domain.to_owned(),
            regex: Regex::new(&anchored)?,
            inverted,
        })
    }
}

#[derive(Parser)]
#[command(name = "icu4x-datagen")]
Expand Down Expand Up @@ -169,6 +224,10 @@ struct Cli {
#[arg(help = "Analyzes the binary and only includes markers that are used by the binary.")]
markers_for_bin: Option<PathBuf>,

#[arg(long, value_name = "FILTER")]
#[arg(help = "Filter attributes on markers for a domain. Accepts form `domain=/regex/`.")]
attribute_filter: Vec<Filter>,

#[arg(long, short, num_args = 0..)]
#[cfg_attr(feature = "provider", arg(default_value = "recommended"))]
#[arg(
Expand Down Expand Up @@ -289,6 +348,10 @@ fn main() -> eyre::Result<()> {
.unwrap()
}

run(cli)
}

fn run(cli: Cli) -> eyre::Result<()> {
let markers = if !cli.markers.is_empty() {
match cli.markers.as_slice() {
[x] if x == "none" => Default::default(),
Expand Down Expand Up @@ -528,6 +591,23 @@ fn main() -> eyre::Result<()> {
driver.with_segmenter_models(cli.segmenter_models.clone())
};

let attribute_filters = cli.attribute_filter.into_iter().fold(
HashMap::<_, Vec<(Regex, bool)>>::new(),
|mut map, filter| {
map.entry(filter.domain)
.or_default()
.push((filter.regex, filter.inverted));
map
},
);
for (domain, filters) in attribute_filters {
driver = driver.with_marker_attributes_filter(&domain, move |attr| {
filters
.iter()
.all(|(regex, inverted)| regex.is_match(attr) ^ inverted)
})
}

let metadata: Result<ExportMetadata, DataError> = match cli.format {
#[cfg(not(feature = "fs_exporter"))]
Format::Fs => {
Expand Down Expand Up @@ -701,3 +781,38 @@ where
self.0.iter_ids_for_marker(M::INFO)
}
}

/// Runs the CLI end to end with three `--attribute-filter` arguments for the
/// `hello` domain and checks which attribute directories the fs exporter
/// produced: one positive filter plus two inverted ones.
#[test]
fn test_attributes_regex() {
    let out_dir = std::env::temp_dir().join("icu4x-datagen_test_attributes_regex_out");
    // Best-effort removal of output left behind by an earlier run.
    let _ = std::fs::remove_dir_all(&out_dir);

    let mut cli = Cli::parse_from([
        "bin",
        "--markers",
        "HelloWorldV1",
        "--locales",
        "full",
        "--format",
        "fs",
        "--attribute-filter",
        "hello=/r.*?|.*?case/",
        "--attribute-filter",
        "hello=-/lowercase/",
        "--attribute-filter",
        "hello=-/.*3/",
    ]);
    cli.output = Some(out_dir.clone());

    run(cli).unwrap();

    // Whether the given path, relative to the output directory, was written.
    let written = |relative: &str| std::fs::exists(out_dir.join(relative)).unwrap();

    assert!(written("hello/world/v1/reverse"));

    assert!(written("hello/world/v1/rotate1"));
    assert!(written("hello/world/v1/rotate2"));
    // Excluded by the inverted `.*3` filter.
    assert!(!written("hello/world/v1/rotate3"));

    assert!(written("hello/world/v1/uppercase"));
    // Excluded by the inverted `lowercase` filter.
    assert!(!written("hello/world/v1/lowercase"));
}