From 9b44b51ee3096c475b6e02512a4656c5d7bff520 Mon Sep 17 00:00:00 2001 From: HSPU_nngithub Date: Wed, 20 May 2026 15:32:52 +0200 Subject: [PATCH 1/7] Add unit detection to to_sodacl_threshold and update check functions --- datacontract/engines/data_contract_checks.py | 41 +++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py index 43009436..e6b2e76d 100644 --- a/datacontract/engines/data_contract_checks.py +++ b/datacontract/engines/data_contract_checks.py @@ -556,8 +556,10 @@ def check_property_regex( ) -def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()): +def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()): check_type = "row_count" + if "%" in threshold: + logger.warning("Row count threshold cannot be specified as a percentage.") check_key = f"{model_name}__{check_type}" sodacl_check_dict = { checks_for(model_name, quoting_config, check_type): [ @@ -586,10 +588,11 @@ def check_model_duplicate_values( check_type = "model_duplicate_values" check_key = f"{model_name}__{check_type}" col_joined = ", ".join(_quote_field_name(col, quoting_config) for col in cols) + metric = "duplicate_count" if not threshold.endswith("%") else "duplicate_percent" sodacl_check_dict = { checks_for(model_name, quoting_config, check_type): [ { - f"duplicate_count({col_joined}) {threshold}": {"name": check_key}, + f"{metric}({col_joined}) {threshold}": {"name": check_key}, } ], } @@ -614,10 +617,11 @@ def check_property_duplicate_values( check_type = "field_duplicate_values" check_key = f"{model_name}__{field_name}__{check_type}" + metric = "duplicate_count" if not threshold.endswith("%") else "duplicate_percent" sodacl_check_dict = { checks_for(model_name, quoting_config, check_type): [ { - f"duplicate_count({field_name_for_soda}) {threshold}": { + 
f"{metric}({field_name_for_soda}) {threshold}": { "name": check_key, }, } @@ -644,10 +648,11 @@ def check_property_null_values( check_type = "field_null_values" check_key = f"{model_name}__{field_name}__{check_type}" + metric = "missing_count" if not threshold.endswith("%") else "missing_percent" sodacl_check_dict = { checks_for(model_name, quoting_config, check_type): [ { - f"missing_count({field_name_for_soda}) {threshold}": { + f"{metric}({field_name_for_soda}) {threshold}": { "name": check_key, }, } @@ -685,11 +690,11 @@ def check_property_invalid_values( if valid_values is not None: sodacl_check_config["valid values"] = _escape_sql_string_values(valid_values) - + metric = "invalid_count" if not threshold.endswith("%") else "invalid_percent" sodacl_check_dict = { checks_for(model_name, quoting_config, check_type): [ { - f"invalid_count({field_name_for_soda}) {threshold}": sodacl_check_config, + f"{metric}({field_name_for_soda}) {threshold}": sodacl_check_config, } ], } @@ -728,10 +733,11 @@ def check_property_missing_values( if filtered_missing_values: sodacl_check_config["missing values"] = _escape_sql_string_values(filtered_missing_values) + metric = "missing_count" if not threshold.endswith("%") else "missing_percent" sodacl_check_dict = { checks_for(model_name, quoting_config, check_type): [ { - f"missing_count({field_name_for_soda}) {threshold}": sodacl_check_config, + f"{metric}({field_name_for_soda}) {threshold}": sodacl_check_config, } ], } @@ -922,32 +928,37 @@ def prepare_query( def to_sodacl_threshold(quality: DataQuality) -> str | None: + if quality.unit is not None and quality.unit not in ("rows", "percent"): + logger.warning(f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' or 'percent' or None") + return None + threshold_suffix = "%" if quality.unit == "percent" else "" + if quality.mustBe is not None: - return f"= {quality.mustBe}" + return f"= {quality.mustBe}{threshold_suffix}" if quality.mustNotBe is not None: - 
return f"!= {quality.mustNotBe}" + return f"!= {quality.mustNotBe}{threshold_suffix}" if quality.mustBeGreaterThan is not None: - return f"> {quality.mustBeGreaterThan}" + return f"> {quality.mustBeGreaterThan}{threshold_suffix}" if quality.mustBeGreaterOrEqualTo is not None: - return f">= {quality.mustBeGreaterOrEqualTo}" + return f">= {quality.mustBeGreaterOrEqualTo}{threshold_suffix}" if quality.mustBeLessThan is not None: - return f"< {quality.mustBeLessThan}" + return f"< {quality.mustBeLessThan}{threshold_suffix}" if quality.mustBeLessOrEqualTo is not None: - return f"<= {quality.mustBeLessOrEqualTo}" + return f"<= {quality.mustBeLessOrEqualTo}{threshold_suffix}" if quality.mustBeBetween is not None: if len(quality.mustBeBetween) != 2: logger.warning( f"Quality check has invalid mustBeBetween, must have exactly 2 integers in an array: {quality.mustBeBetween}" ) return None - return f"between {quality.mustBeBetween[0]} and {quality.mustBeBetween[1]}" + return f"between {quality.mustBeBetween[0]}{threshold_suffix} and {quality.mustBeBetween[1]}{threshold_suffix}" if quality.mustNotBeBetween is not None: if len(quality.mustNotBeBetween) != 2: logger.warning( f"Quality check has invalid mustNotBeBetween, must have exactly 2 integers in an array: {quality.mustNotBeBetween}" ) return None - return f"not between {quality.mustNotBeBetween[0]} and {quality.mustNotBeBetween[1]}" + return f"not between {quality.mustNotBeBetween[0]}{threshold_suffix} and {quality.mustNotBeBetween[1]}{threshold_suffix}" return None From 4f718d8299389ea235a7050e2634c38173d7de0a Mon Sep 17 00:00:00 2001 From: HSPU_nngithub Date: Wed, 20 May 2026 15:44:05 +0200 Subject: [PATCH 2/7] ruff formatting --- datacontract/engines/data_contract_checks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py index e6b2e76d..90bc6425 100644 --- a/datacontract/engines/data_contract_checks.py 
+++ b/datacontract/engines/data_contract_checks.py @@ -556,7 +556,7 @@ def check_property_regex( ) -def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()): +def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()): check_type = "row_count" if "%" in threshold: logger.warning("Row count threshold cannot be specified as a percentage.") @@ -929,10 +929,12 @@ def prepare_query( def to_sodacl_threshold(quality: DataQuality) -> str | None: if quality.unit is not None and quality.unit not in ("rows", "percent"): - logger.warning(f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' or 'percent' or None") + logger.warning( + f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' or 'percent' or None" + ) return None threshold_suffix = "%" if quality.unit == "percent" else "" - + if quality.mustBe is not None: return f"= {quality.mustBe}{threshold_suffix}" if quality.mustNotBe is not None: From 101896550d556d70e5d09df81494d4654ef4cfc3 Mon Sep 17 00:00:00 2001 From: HSPU_nngithub Date: Mon, 1 Jun 2026 07:59:54 +0200 Subject: [PATCH 3/7] improved unit handling in to_sodacl_threshold --- datacontract/engines/data_contract_checks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py index 5708f0b9..160fedbf 100644 --- a/datacontract/engines/data_contract_checks.py +++ b/datacontract/engines/data_contract_checks.py @@ -967,12 +967,12 @@ def prepare_query( def to_sodacl_threshold(quality: DataQuality) -> str | None: - if quality.unit is not None and quality.unit not in ("rows", "percent"): + if quality.unit is not None and quality.unit.lower() not in ("rows", "percent"): logger.warning( - f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' or 'percent' or None" + f"Unsupported quality.unit ={quality.unit} in 
quality check, must be 'rows' (default) or 'percent'" ) return None - threshold_suffix = "%" if quality.unit == "percent" else "" + threshold_suffix = "%" if quality.unit.lower() == "percent" else "" if quality.mustBe is not None: return f"= {quality.mustBe}{threshold_suffix}" From a915de7d25914261363d74ef500f28e3bce4f6f6 Mon Sep 17 00:00:00 2001 From: HSPU_nngithub Date: Mon, 1 Jun 2026 08:01:44 +0200 Subject: [PATCH 4/7] check_row_count returns none on percentage unit --- datacontract/engines/data_contract_checks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py index 160fedbf..1432807c 100644 --- a/datacontract/engines/data_contract_checks.py +++ b/datacontract/engines/data_contract_checks.py @@ -599,6 +599,7 @@ def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConf check_type = "row_count" if "%" in threshold: logger.warning("Row count threshold cannot be specified as a percentage.") + return None check_key = f"{model_name}__{check_type}" sodacl_check_dict = { checks_for(model_name, quoting_config, check_type): [ @@ -640,7 +641,7 @@ def check_model_duplicate_values( key=check_key, category="quality", type=check_type, - name=f"Check that model {model_name} has duplicate_count {threshold} for columns {col_joined}", + name=f"Check that model {model_name} has {metric} {threshold} for columns {col_joined}", model=model_name, field=None, engine="soda", From 484640b3f3da1ee4398abe5d3e7bd53f81f2aa78 Mon Sep 17 00:00:00 2001 From: HSPU_nngithub Date: Mon, 1 Jun 2026 08:56:35 +0200 Subject: [PATCH 5/7] update check name to reflect percentage/rows metrics --- datacontract/engines/data_contract_checks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py index 1432807c..07c5bf4f 100644 --- 
a/datacontract/engines/data_contract_checks.py +++ b/datacontract/engines/data_contract_checks.py @@ -672,7 +672,7 @@ def check_property_duplicate_values( key=check_key, category="quality", type=check_type, - name=f"Check that field {field_name} has duplicate_count {threshold}", + name=f"Check that field {field_name} has {metric} {threshold}", model=model_name, field=field_name, engine="soda", @@ -703,7 +703,7 @@ def check_property_null_values( key=check_key, category="quality", type=check_type, - name=f"Check that field {field_name} has missing_count {threshold}", + name=f"Check that field {field_name} has {metric} {threshold}", model=model_name, field=field_name, engine="soda", @@ -743,7 +743,7 @@ def check_property_invalid_values( key=check_key, category="quality", type=check_type, - name=f"Check that field {field_name} has invalid_count {threshold}", + name=f"Check that field {field_name} has {metric} {threshold}", model=model_name, field=field_name, engine="soda", @@ -786,7 +786,7 @@ def check_property_missing_values( key=check_key, category="quality", type=check_type, - name=f"Check that field {field_name} has missing_count {threshold}", + name=f"Check that field {field_name} has {metric} {threshold}", model=model_name, field=field_name, engine="soda", From 7ed47b7797e73d871322ecf9e9e19042e64e0e55 Mon Sep 17 00:00:00 2001 From: HSPU_nngithub Date: Mon, 1 Jun 2026 10:32:53 +0200 Subject: [PATCH 6/7] Fixed lowering of unit which can be none --- datacontract/engines/data_contract_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py index 07c5bf4f..f133268d 100644 --- a/datacontract/engines/data_contract_checks.py +++ b/datacontract/engines/data_contract_checks.py @@ -973,7 +973,7 @@ def to_sodacl_threshold(quality: DataQuality) -> str | None: f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' (default) or 'percent'" ) return 
None - threshold_suffix = "%" if quality.unit.lower() == "percent" else "" + threshold_suffix = "%" if (quality.unit or "").lower() == "percent" else "" if quality.mustBe is not None: return f"= {quality.mustBe}{threshold_suffix}" From 5e949a582d5ad646f8d0d5f7c8f572274806f38e Mon Sep 17 00:00:00 2001 From: HSPU_nngithub Date: Mon, 1 Jun 2026 10:52:35 +0200 Subject: [PATCH 7/7] add checks for to_sodacl_threshold and checks functions --- tests/test_data_contract_checks.py | 148 +++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/tests/test_data_contract_checks.py b/tests/test_data_contract_checks.py index 8fc8a77d..01c989b9 100644 --- a/tests/test_data_contract_checks.py +++ b/tests/test_data_contract_checks.py @@ -1,3 +1,6 @@ +import logging + +import pytest import yaml from open_data_contract_standard.model import ( DataQuality, @@ -17,13 +20,16 @@ check_property_is_present, check_property_missing_values, check_property_not_equal, + check_property_null_values, check_property_required, check_property_type, check_property_unique, + check_row_count, create_checks, prepare_query, to_schema_checks, to_sla_freshness_check, + to_sodacl_threshold, ) @@ -616,3 +622,145 @@ def test_to_schema_checks_databricks_struct_with_nested_varchar_skips_type_check types = [c.type for c in checks] assert "field_is_present" in types assert "field_type" not in types + + +# --- Tests for to_sodacl_threshold function --- + + +@pytest.mark.parametrize( + "unit,quality_kwargs,expected", + [ + # Test percent unit appends % + ("percent", {"mustBe": 5}, "= 5%"), + ("percent", {"mustBeGreaterThan": 10}, "> 10%"), + ("percent", {"mustBeGreaterOrEqualTo": 10}, ">= 10%"), + ("percent", {"mustBeLessThan": 20}, "< 20%"), + ("percent", {"mustBeLessOrEqualTo": 30}, "<= 30%"), + ("percent", {"mustNotBe": 0}, "!= 0%"), + ("percent", {"mustBeBetween": [5, 15]}, "between 5% and 15%"), + ("percent", {"mustNotBeBetween": [80, 100]}, "not between 80% and 100%"), + # Test case 
insensitivity + ("PERCENT", {"mustBeLessThan": 10}, "< 10%"), + ("Percent", {"mustBe": 15}, "= 15%"), + # Test rows unit does not append % + ("rows", {"mustBe": 100}, "= 100"), + ("rows", {"mustBeGreaterThan": 50}, "> 50"), + # Test default (None) does not append % + (None, {"mustBe": 50}, "= 50"), + (None, {"mustBeLessThan": 75}, "< 75"), + ], +) +def test_to_sodacl_threshold_with_units(unit, quality_kwargs, expected): + """Test that to_sodacl_threshold correctly handles different units and operators.""" + quality = DataQuality(unit=unit, **quality_kwargs) + result = to_sodacl_threshold(quality) + assert result == expected + + +def test_to_sodacl_threshold_invalid_unit(caplog): + """Test that invalid unit returns None and raises a warning.""" + quality = DataQuality(unit="invalid", mustBe=5) + with caplog.at_level(logging.WARNING): + result = to_sodacl_threshold(quality) + + assert result is None + assert "Unsupported quality.unit" in caplog.text + + +# --- Tests for unit handling in check_* functions --- + + +@pytest.mark.parametrize( + "threshold,expected_metric,expected_check", + [ + ("< 5%", "missing_percent", "missing_percent(status) < 5%"), + ("< 10", "missing_count", "missing_count(status) < 10"), + ], +) +def test_check_property_null_values_metric_selection(threshold, expected_metric, expected_check): + """Test that check_property_null_values selects correct metric based on threshold.""" + check = check_property_null_values("orders", "status", threshold) + + assert check.model == "orders" + assert check.field == "status" + assert check.type == "field_null_values" + assert expected_metric in check.name + + impl = yaml.safe_load(check.implementation) + checks = impl["checks for orders"] + check_keys = list(checks[0].keys()) + assert check_keys == [expected_check] + + +@pytest.mark.parametrize( + "threshold,expected_metric,expected_check", + [ + ("= 0%", "invalid_percent", "invalid_percent(status) = 0%"), + ("= 0", "invalid_count", "invalid_count(status) = 
0"), + ], +) +def test_check_property_invalid_values_metric_selection(threshold, expected_metric, expected_check): + """Test that check_property_invalid_values selects correct metric based on threshold.""" + check = check_property_invalid_values("orders", "status", threshold, valid_values=["active", "inactive"]) + + assert check.model == "orders" + assert check.field == "status" + assert check.type == "field_invalid_values" + assert expected_metric in check.name + + impl = yaml.safe_load(check.implementation) + checks = impl["checks for orders"] + check_keys = list(checks[0].keys()) + assert check_keys == [expected_check] + + +@pytest.mark.parametrize( + "threshold,expected_metric,expected_check", + [ + ("<= 2%", "missing_percent", "missing_percent(status) <= 2%"), + ("<= 5", "missing_count", "missing_count(status) <= 5"), + ], +) +def test_check_property_missing_values_metric_selection(threshold, expected_metric, expected_check): + """Test that check_property_missing_values selects correct metric based on threshold.""" + check = check_property_missing_values("orders", "status", threshold, missing_values=["n/a", "null"]) + + assert check.model == "orders" + assert check.field == "status" + assert check.type == "field_missing_values" + assert expected_metric in check.name + + impl = yaml.safe_load(check.implementation) + checks = impl["checks for orders"] + check_keys = list(checks[0].keys()) + assert check_keys == [expected_check] + + +@pytest.mark.parametrize( + "threshold", + [ + "> 100%", + "between 10% and 50%", + ], +) +def test_check_row_count_with_percent_returns_none(threshold, caplog): + """Test that check_row_count returns None when threshold contains '%'.""" + with caplog.at_level(logging.WARNING): + result = check_row_count("orders", threshold, QuotingConfig()) + assert result is None + assert "Row count threshold cannot be specified as a percentage." 
in caplog.text + + +def test_check_row_count_without_percent_works(): + """Test that check_row_count works normally without '%'.""" + check = check_row_count("orders", "> 1000", QuotingConfig()) + + assert check is not None + assert check.model == "orders" + assert check.type == "row_count" + + # Parse the implementation to verify the SodaCL check + impl = yaml.safe_load(check.implementation) + checks = impl["checks for orders"] + check_keys = list(checks[0].keys()) + assert check_keys == ["row_count > 1000"]