test refactor for floating point failures + mypy fix (#1169)

armaan-dhillon · web-flow · commit 96febab2a9c0 · 2025-02-06T10:11:00.000-05:00
* test refactor for floating point failures + 1 mypy fix

* adding inline ignores for mypy and TODOs for context

* ran black for changes in previous commit
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ repos:
   # Flake8: complexity and style checking
   # https://flake8.pycqa.org/en/latest/user/using-hooks.html
   - repo: https://github.com/pycqa/flake8
-    rev: 4.0.1
+    rev: 5.0.4
     hooks:
       - id: flake8
         additional_dependencies: [flake8-docstrings]
diff --git a/dataprofiler/data_readers/parquet_data.py b/dataprofiler/data_readers/parquet_data.py
@@ -68,7 +68,7 @@ def __init__(
             self._load_data(data)
 
     @property
-    def file_encoding(self) -> None:
+    def file_encoding(self) -> Optional[str]:
         """Set file encoding to None since not detected for avro."""
         return None
 
diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py
@@ -194,8 +194,11 @@ def load_from_dict(cls, data, config: dict | None = None):
 
         return profile
 
+    # TODO: refactor BaseColumnProfiler.profile to not be an @property
+    # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to
+    # NumericStatsMixin.profile() results in a breaking change - ignoring [override]
     @property
-    def profile(self) -> dict:
+    def profile(self) -> dict:  # type: ignore[override]
         """
         Return the profile of the column.
 
diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py
@@ -92,8 +92,11 @@ def load_from_dict(cls, data, config: dict | None = None):
         profile._reformat_numeric_stats_types_on_serialized_profiles()
         return profile
 
+    # TODO: refactor BaseColumnProfiler.profile to not be an @property
+    # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to
+    # NumericStatsMixin.profile() results in a breaking change - ignoring [override]
     @property
-    def profile(self) -> dict:
+    def profile(self) -> dict:  # type: ignore[override]
         """
         Return the profile of the column.
 
diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
@@ -365,7 +365,10 @@ def _add_helper(
             other1._median_abs_dev_is_enabled and other2._median_abs_dev_is_enabled
         )
 
-    def profile(self) -> dict:
+    # TODO: refactor BaseColumnProfiler.profile to not be an @property
+    # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to
+    # NumericStatsMixin.profile() results in a breaking change - ignoring [override]
+    def profile(self) -> dict:  # type: ignore[override]
         """
         Return profile of the column.
 
diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py
@@ -84,8 +84,11 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
 
         return profile
 
+    # TODO: refactor BaseColumnProfiler.profile to not be an @property
+    # NumericStatsMixin inherits from BaseColumnProfiler and adding @property to
+    # NumericStatsMixin.profile() results in a breaking change - ignoring [override]
     @property
-    def profile(self) -> dict:
+    def profile(self) -> dict:  # type: ignore[override]
         """
         Return the profile of the column.
 
diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py
@@ -1,4 +1,5 @@
 import json
+import math
 import os
 import unittest
 from collections import defaultdict
@@ -731,7 +732,44 @@ def test_categorical_diff(self):
             },
         }
         actual_diff = profile.diff(profile2)
-        self.assertDictEqual(expected_diff, actual_diff)
+
+        assert expected_diff["categorical"] == actual_diff["categorical"]
+        assert (
+            expected_diff["statistics"]["unique_count"]
+            == actual_diff["statistics"]["unique_count"]
+        )
+        assert math.isclose(
+            expected_diff["statistics"]["unique_ratio"],
+            actual_diff["statistics"]["unique_ratio"],
+        )
+        assert (
+            expected_diff["statistics"]["categories"]
+            == actual_diff["statistics"]["categories"]
+        )
+        assert math.isclose(
+            expected_diff["statistics"]["gini_impurity"],
+            actual_diff["statistics"]["gini_impurity"],
+        )
+        assert math.isclose(
+            expected_diff["statistics"]["unalikeability"],
+            actual_diff["statistics"]["unalikeability"],
+        )
+        assert (
+            expected_diff["statistics"]["categorical_count"]
+            == actual_diff["statistics"]["categorical_count"]
+        )
+        assert math.isclose(
+            expected_diff["statistics"]["chi2-test"]["chi2-statistic"],
+            actual_diff["statistics"]["chi2-test"]["chi2-statistic"],
+        )
+        assert (
+            expected_diff["statistics"]["chi2-test"]["deg_of_free"]
+            == actual_diff["statistics"]["chi2-test"]["deg_of_free"]
+        )
+        assert math.isclose(
+            expected_diff["statistics"]["chi2-test"]["p-value"],
+            actual_diff["statistics"]["chi2-test"]["p-value"],
+        )
 
         # Test with one categorical column matching
         df_not_categorical = pd.Series(
diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import math
 import os
 import random
 import re
@@ -2162,8 +2163,18 @@ def test_diff_categorical_chi2_test(self, *mocks):
             "deg_of_free": 2,
             "p-value": 0.3099238764710244,
         }
-        self.assertDictEqual(
-            expected_chi2_test_dict, diff["data_stats"][0]["statistics"]["chi2-test"]
+        actual_chi2_test_dict = diff["data_stats"][0]["statistics"]["chi2-test"]
+
+        assert math.isclose(
+            expected_chi2_test_dict["chi2-statistic"],
+            actual_chi2_test_dict["chi2-statistic"],
+        )
+        assert (
+            expected_chi2_test_dict["deg_of_free"]
+            == actual_chi2_test_dict["deg_of_free"]
+        )
+        assert math.isclose(
+            expected_chi2_test_dict["p-value"], actual_chi2_test_dict["p-value"]
         )
 
     @mock.patch(
diff --git a/setup.cfg b/setup.cfg
@@ -17,7 +17,6 @@ warn_unused_configs = True
 ignore_missing_imports = True
 no_implicit_optional = False
 exclude = ^dataprofiler/tests/|^resources/|^examples|venv*/
-disable_error_code = override
 
 [check-manifest]
 ignore-default-rules=True