From 9b44b51ee3096c475b6e02512a4656c5d7bff520 Mon Sep 17 00:00:00 2001
From: HSPU_nngithub <HSPU@novonordisk.com>
Date: Wed, 20 May 2026 15:32:52 +0200
Subject: [PATCH 1/7] Add unit detection to to_sodacl_threshold and update
 check functions

---
 datacontract/engines/data_contract_checks.py | 41 +++++++++++++-------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py
index 43009436..e6b2e76d 100644
--- a/datacontract/engines/data_contract_checks.py
+++ b/datacontract/engines/data_contract_checks.py
@@ -556,8 +556,10 @@ def check_property_regex(
     )
 
 
-def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()):
+def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()):   
     check_type = "row_count"
+    if "%" in threshold:
+        logger.warning("Row count threshold cannot be specified as a percentage.")
     check_key = f"{model_name}__{check_type}"
     sodacl_check_dict = {
         checks_for(model_name, quoting_config, check_type): [
@@ -586,10 +588,11 @@ def check_model_duplicate_values(
     check_type = "model_duplicate_values"
     check_key = f"{model_name}__{check_type}"
     col_joined = ", ".join(_quote_field_name(col, quoting_config) for col in cols)
+    metric = "duplicate_count" if not threshold.endswith("%") else "duplicate_percent"
     sodacl_check_dict = {
         checks_for(model_name, quoting_config, check_type): [
             {
-                f"duplicate_count({col_joined}) {threshold}": {"name": check_key},
+                f"{metric}({col_joined}) {threshold}": {"name": check_key},
             }
         ],
     }
@@ -614,10 +617,11 @@ def check_property_duplicate_values(
 
     check_type = "field_duplicate_values"
     check_key = f"{model_name}__{field_name}__{check_type}"
+    metric = "duplicate_count" if not threshold.endswith("%") else "duplicate_percent"
     sodacl_check_dict = {
         checks_for(model_name, quoting_config, check_type): [
             {
-                f"duplicate_count({field_name_for_soda}) {threshold}": {
+                f"{metric}({field_name_for_soda}) {threshold}": {
                     "name": check_key,
                 },
             }
@@ -644,10 +648,11 @@ def check_property_null_values(
 
     check_type = "field_null_values"
     check_key = f"{model_name}__{field_name}__{check_type}"
+    metric = "missing_count" if not threshold.endswith("%") else "missing_percent"
     sodacl_check_dict = {
         checks_for(model_name, quoting_config, check_type): [
             {
-                f"missing_count({field_name_for_soda}) {threshold}": {
+                f"{metric}({field_name_for_soda}) {threshold}": {
                     "name": check_key,
                 },
             }
@@ -685,11 +690,11 @@ def check_property_invalid_values(
 
     if valid_values is not None:
         sodacl_check_config["valid values"] = _escape_sql_string_values(valid_values)
-
+    metric = "invalid_count" if not threshold.endswith("%") else "invalid_percent"
     sodacl_check_dict = {
         checks_for(model_name, quoting_config, check_type): [
             {
-                f"invalid_count({field_name_for_soda}) {threshold}": sodacl_check_config,
+                f"{metric}({field_name_for_soda}) {threshold}": sodacl_check_config,
             }
         ],
     }
@@ -728,10 +733,11 @@ def check_property_missing_values(
         if filtered_missing_values:
             sodacl_check_config["missing values"] = _escape_sql_string_values(filtered_missing_values)
 
+    metric = "missing_count" if not threshold.endswith("%") else "missing_percent"
     sodacl_check_dict = {
         checks_for(model_name, quoting_config, check_type): [
             {
-                f"missing_count({field_name_for_soda}) {threshold}": sodacl_check_config,
+                f"{metric}({field_name_for_soda}) {threshold}": sodacl_check_config,
             }
         ],
     }
@@ -922,32 +928,37 @@ def prepare_query(
 
 
 def to_sodacl_threshold(quality: DataQuality) -> str | None:
+    if quality.unit is not None and quality.unit not in ("rows", "percent"):
+        logger.warning(f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' or 'percent' or None")
+        return None
+    threshold_suffix = "%" if quality.unit == "percent" else ""
+    
     if quality.mustBe is not None:
-        return f"= {quality.mustBe}"
+        return f"= {quality.mustBe}{threshold_suffix}"
     if quality.mustNotBe is not None:
-        return f"!= {quality.mustNotBe}"
+        return f"!= {quality.mustNotBe}{threshold_suffix}"
     if quality.mustBeGreaterThan is not None:
-        return f"> {quality.mustBeGreaterThan}"
+        return f"> {quality.mustBeGreaterThan}{threshold_suffix}"
     if quality.mustBeGreaterOrEqualTo is not None:
-        return f">= {quality.mustBeGreaterOrEqualTo}"
+        return f">= {quality.mustBeGreaterOrEqualTo}{threshold_suffix}"
     if quality.mustBeLessThan is not None:
-        return f"< {quality.mustBeLessThan}"
+        return f"< {quality.mustBeLessThan}{threshold_suffix}"
     if quality.mustBeLessOrEqualTo is not None:
-        return f"<= {quality.mustBeLessOrEqualTo}"
+        return f"<= {quality.mustBeLessOrEqualTo}{threshold_suffix}"
     if quality.mustBeBetween is not None:
         if len(quality.mustBeBetween) != 2:
             logger.warning(
                 f"Quality check has invalid mustBeBetween, must have exactly 2 integers in an array: {quality.mustBeBetween}"
             )
             return None
-        return f"between {quality.mustBeBetween[0]} and {quality.mustBeBetween[1]}"
+        return f"between {quality.mustBeBetween[0]}{threshold_suffix} and {quality.mustBeBetween[1]}{threshold_suffix}"
     if quality.mustNotBeBetween is not None:
         if len(quality.mustNotBeBetween) != 2:
             logger.warning(
                 f"Quality check has invalid mustNotBeBetween, must have exactly 2 integers in an array: {quality.mustNotBeBetween}"
             )
             return None
-        return f"not between {quality.mustNotBeBetween[0]} and {quality.mustNotBeBetween[1]}"
+        return f"not between {quality.mustNotBeBetween[0]}{threshold_suffix} and {quality.mustNotBeBetween[1]}{threshold_suffix}"
     return None
 
 

From 4f718d8299389ea235a7050e2634c38173d7de0a Mon Sep 17 00:00:00 2001
From: HSPU_nngithub <HSPU@novonordisk.com>
Date: Wed, 20 May 2026 15:44:05 +0200
Subject: [PATCH 2/7] ruff formatting

---
 datacontract/engines/data_contract_checks.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py
index e6b2e76d..90bc6425 100644
--- a/datacontract/engines/data_contract_checks.py
+++ b/datacontract/engines/data_contract_checks.py
@@ -556,7 +556,7 @@ def check_property_regex(
     )
 
 
-def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()):   
+def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()):
     check_type = "row_count"
     if "%" in threshold:
         logger.warning("Row count threshold cannot be specified as a percentage.")
@@ -929,10 +929,12 @@ def prepare_query(
 
 def to_sodacl_threshold(quality: DataQuality) -> str | None:
     if quality.unit is not None and quality.unit not in ("rows", "percent"):
-        logger.warning(f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' or 'percent' or None")
+        logger.warning(
+            f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' or 'percent' or None"
+        )
         return None
     threshold_suffix = "%" if quality.unit == "percent" else ""
-    
+
     if quality.mustBe is not None:
         return f"= {quality.mustBe}{threshold_suffix}"
     if quality.mustNotBe is not None:

From 101896550d556d70e5d09df81494d4654ef4cfc3 Mon Sep 17 00:00:00 2001
From: HSPU_nngithub <HSPU@novonordisk.com>
Date: Mon, 1 Jun 2026 07:59:54 +0200
Subject: [PATCH 3/7] improved unit handling in to_sodacl_threshold

---
 datacontract/engines/data_contract_checks.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py
index 5708f0b9..160fedbf 100644
--- a/datacontract/engines/data_contract_checks.py
+++ b/datacontract/engines/data_contract_checks.py
@@ -967,12 +967,12 @@ def prepare_query(
 
 
 def to_sodacl_threshold(quality: DataQuality) -> str | None:
-    if quality.unit is not None and quality.unit not in ("rows", "percent"):
+    if quality.unit is not None and quality.unit.lower() not in ("rows", "percent"):
         logger.warning(
-            f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' or 'percent' or None"
+            f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' (default) or 'percent'"
         )
         return None
-    threshold_suffix = "%" if quality.unit == "percent" else ""
+    threshold_suffix = "%" if quality.unit.lower() == "percent" else ""
 
     if quality.mustBe is not None:
         return f"= {quality.mustBe}{threshold_suffix}"

From a915de7d25914261363d74ef500f28e3bce4f6f6 Mon Sep 17 00:00:00 2001
From: HSPU_nngithub <HSPU@novonordisk.com>
Date: Mon, 1 Jun 2026 08:01:44 +0200
Subject: [PATCH 4/7] check_row_count returns none on percentage unit

---
 datacontract/engines/data_contract_checks.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py
index 160fedbf..1432807c 100644
--- a/datacontract/engines/data_contract_checks.py
+++ b/datacontract/engines/data_contract_checks.py
@@ -599,6 +599,7 @@ def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConf
     check_type = "row_count"
     if "%" in threshold:
         logger.warning("Row count threshold cannot be specified as a percentage.")
+        return None
     check_key = f"{model_name}__{check_type}"
     sodacl_check_dict = {
         checks_for(model_name, quoting_config, check_type): [
@@ -640,7 +641,7 @@ def check_model_duplicate_values(
         key=check_key,
         category="quality",
         type=check_type,
-        name=f"Check that model {model_name} has duplicate_count {threshold} for columns {col_joined}",
+        name=f"Check that model {model_name} has {metric} {threshold} for columns {col_joined}",
         model=model_name,
         field=None,
         engine="soda",

From 484640b3f3da1ee4398abe5d3e7bd53f81f2aa78 Mon Sep 17 00:00:00 2001
From: HSPU_nngithub <HSPU@novonordisk.com>
Date: Mon, 1 Jun 2026 08:56:35 +0200
Subject: [PATCH 5/7] update check name to reflect percentage/rows metrics

---
 datacontract/engines/data_contract_checks.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py
index 1432807c..07c5bf4f 100644
--- a/datacontract/engines/data_contract_checks.py
+++ b/datacontract/engines/data_contract_checks.py
@@ -672,7 +672,7 @@ def check_property_duplicate_values(
         key=check_key,
         category="quality",
         type=check_type,
-        name=f"Check that field {field_name} has duplicate_count {threshold}",
+        name=f"Check that field {field_name} has {metric} {threshold}",
         model=model_name,
         field=field_name,
         engine="soda",
@@ -703,7 +703,7 @@ def check_property_null_values(
         key=check_key,
         category="quality",
         type=check_type,
-        name=f"Check that field {field_name} has missing_count {threshold}",
+        name=f"Check that field {field_name} has {metric} {threshold}",
         model=model_name,
         field=field_name,
         engine="soda",
@@ -743,7 +743,7 @@ def check_property_invalid_values(
         key=check_key,
         category="quality",
         type=check_type,
-        name=f"Check that field {field_name} has invalid_count {threshold}",
+        name=f"Check that field {field_name} has {metric} {threshold}",
         model=model_name,
         field=field_name,
         engine="soda",
@@ -786,7 +786,7 @@ def check_property_missing_values(
         key=check_key,
         category="quality",
         type=check_type,
-        name=f"Check that field {field_name} has missing_count {threshold}",
+        name=f"Check that field {field_name} has {metric} {threshold}",
         model=model_name,
         field=field_name,
         engine="soda",

From 7ed47b7797e73d871322ecf9e9e19042e64e0e55 Mon Sep 17 00:00:00 2001
From: HSPU_nngithub <HSPU@novonordisk.com>
Date: Mon, 1 Jun 2026 10:32:53 +0200
Subject: [PATCH 6/7] Fixed lowering of unit which can be none

---
 datacontract/engines/data_contract_checks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datacontract/engines/data_contract_checks.py b/datacontract/engines/data_contract_checks.py
index 07c5bf4f..f133268d 100644
--- a/datacontract/engines/data_contract_checks.py
+++ b/datacontract/engines/data_contract_checks.py
@@ -973,7 +973,7 @@ def to_sodacl_threshold(quality: DataQuality) -> str | None:
             f"Unsupported quality.unit ={quality.unit} in quality check, must be 'rows' (default) or 'percent'"
         )
         return None
-    threshold_suffix = "%" if quality.unit.lower() == "percent" else ""
+    threshold_suffix = "%" if (quality.unit or "").lower() == "percent" else ""
 
     if quality.mustBe is not None:
         return f"= {quality.mustBe}{threshold_suffix}"

From 5e949a582d5ad646f8d0d5f7c8f572274806f38e Mon Sep 17 00:00:00 2001
From: HSPU_nngithub <HSPU@novonordisk.com>
Date: Mon, 1 Jun 2026 10:52:35 +0200
Subject: [PATCH 7/7] add checks for to_sodacl_treshold and checks functions

---
 tests/test_data_contract_checks.py | 148 +++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)

diff --git a/tests/test_data_contract_checks.py b/tests/test_data_contract_checks.py
index 8fc8a77d..01c989b9 100644
--- a/tests/test_data_contract_checks.py
+++ b/tests/test_data_contract_checks.py
@@ -1,3 +1,6 @@
+import logging
+
+import pytest
 import yaml
 from open_data_contract_standard.model import (
     DataQuality,
@@ -17,13 +20,16 @@
     check_property_is_present,
     check_property_missing_values,
     check_property_not_equal,
+    check_property_null_values,
     check_property_required,
     check_property_type,
     check_property_unique,
+    check_row_count,
     create_checks,
     prepare_query,
     to_schema_checks,
     to_sla_freshness_check,
+    to_sodacl_threshold,
 )
 
 
@@ -616,3 +622,145 @@ def test_to_schema_checks_databricks_struct_with_nested_varchar_skips_type_check
     types = [c.type for c in checks]
     assert "field_is_present" in types
     assert "field_type" not in types
+
+
+# --- Tests for to_sodacl_threshold function ---
+
+
+@pytest.mark.parametrize(
+    "unit,quality_kwargs,expected",
+    [
+        # Test percent unit appends %
+        ("percent", {"mustBe": 5}, "= 5%"),
+        ("percent", {"mustBeGreaterThan": 10}, "> 10%"),
+        ("percent", {"mustBeGreaterOrEqualTo": 10}, ">= 10%"),
+        ("percent", {"mustBeLessThan": 20}, "< 20%"),
+        ("percent", {"mustBeLessOrEqualTo": 30}, "<= 30%"),
+        ("percent", {"mustNotBe": 0}, "!= 0%"),
+        ("percent", {"mustBeBetween": [5, 15]}, "between 5% and 15%"),
+        ("percent", {"mustNotBeBetween": [80, 100]}, "not between 80% and 100%"),
+        # Test case insensitivity
+        ("PERCENT", {"mustBeLessThan": 10}, "< 10%"),
+        ("Percent", {"mustBe": 15}, "= 15%"),
+        # Test rows unit does not append %
+        ("rows", {"mustBe": 100}, "= 100"),
+        ("rows", {"mustBeGreaterThan": 50}, "> 50"),
+        # Test default (None) does not append %
+        (None, {"mustBe": 50}, "= 50"),
+        (None, {"mustBeLessThan": 75}, "< 75"),
+    ],
+)
+def test_to_sodacl_threshold_with_units(unit, quality_kwargs, expected):
+    """Test that to_sodacl_threshold correctly handles different units and operators."""
+    quality = DataQuality(unit=unit, **quality_kwargs)
+    result = to_sodacl_threshold(quality)
+    assert result == expected
+
+
+def test_to_sodacl_threshold_invalid_unit(caplog):
+    """Test that invalid unit returns None and raises a warning."""
+    quality = DataQuality(unit="invalid", mustBe=5)
+    with caplog.at_level(logging.WARNING):
+        result = to_sodacl_threshold(quality)
+
+    assert result is None
+    assert "Unsupported quality.unit" in caplog.text
+
+
+# --- Tests for unit handling in check_* functions ---
+
+
+@pytest.mark.parametrize(
+    "threshold,expected_metric,expected_check",
+    [
+        ("< 5%", "missing_percent", "missing_percent(status) < 5%"),
+        ("< 10", "missing_count", "missing_count(status) < 10"),
+    ],
+)
+def test_check_property_null_values_metric_selection(threshold, expected_metric, expected_check):
+    """Test that check_property_null_values selects correct metric based on threshold."""
+    check = check_property_null_values("orders", "status", threshold)
+
+    assert check.model == "orders"
+    assert check.field == "status"
+    assert check.type == "field_null_values"
+    assert expected_metric in check.name
+
+    impl = yaml.safe_load(check.implementation)
+    checks = impl["checks for orders"]
+    check_keys = list(checks[0].keys())
+    assert check_keys == [expected_check]
+
+
+@pytest.mark.parametrize(
+    "threshold,expected_metric,expected_check",
+    [
+        ("= 0%", "invalid_percent", "invalid_percent(status) = 0%"),
+        ("= 0", "invalid_count", "invalid_count(status) = 0"),
+    ],
+)
+def test_check_property_invalid_values_metric_selection(threshold, expected_metric, expected_check):
+    """Test that check_property_invalid_values selects correct metric based on threshold."""
+    check = check_property_invalid_values("orders", "status", threshold, valid_values=["active", "inactive"])
+
+    assert check.model == "orders"
+    assert check.field == "status"
+    assert check.type == "field_invalid_values"
+    assert expected_metric in check.name
+
+    impl = yaml.safe_load(check.implementation)
+    checks = impl["checks for orders"]
+    check_keys = list(checks[0].keys())
+    assert check_keys == [expected_check]
+
+
+@pytest.mark.parametrize(
+    "threshold,expected_metric,expected_check",
+    [
+        ("<= 2%", "missing_percent", "missing_percent(status) <= 2%"),
+        ("<= 5", "missing_count", "missing_count(status) <= 5"),
+    ],
+)
+def test_check_property_missing_values_metric_selection(threshold, expected_metric, expected_check):
+    """Test that check_property_missing_values selects correct metric based on threshold."""
+    check = check_property_missing_values("orders", "status", threshold, missing_values=["n/a", "null"])
+
+    assert check.model == "orders"
+    assert check.field == "status"
+    assert check.type == "field_missing_values"
+    assert expected_metric in check.name
+
+    impl = yaml.safe_load(check.implementation)
+    checks = impl["checks for orders"]
+    check_keys = list(checks[0].keys())
+    assert check_keys == [expected_check]
+
+
+@pytest.mark.parametrize(
+    "threshold",
+    [
+        "> 100%",
+        "between 10% and 50%",
+    ],
+)
+def test_check_row_count_with_percent_returns_none(threshold, caplog):
+    """Test that check_row_count returns None when threshold contains '%'."""
+    with caplog.at_level(logging.WARNING):
+        result = check_row_count("orders", threshold, QuotingConfig())
+    assert result is None
+    assert "Row count threshold cannot be specified as a percentage." in caplog.text
+
+
+def test_check_row_count_without_percent_works():
+    """Test that check_row_count works normally without '%'."""
+    check = check_row_count("orders", "> 1000", QuotingConfig())
+
+    assert check is not None
+    assert check.model == "orders"
+    assert check.type == "row_count"
+
+    # Parse the implementation to verify the SodaCL check
+    impl = yaml.safe_load(check.implementation)
+    checks = impl["checks for orders"]
+    check_keys = list(checks[0].keys())
+    assert check_keys == ["row_count > 1000"]