diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index 26280f545..324d04ff2 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -23,6 +23,8 @@ The CHANGELOG for the current development version is available at - Fixes an edge-case bug where decision regions plots didn't have unique colors ([#1157](https://github.com/rasbt/mlxtend/issues/1157) via [mariam851](https://github.com/mariam851)) +- Fix `preprocessing.standardize` so a constant column is mapped to all-zeros (as the docstring promises) instead of `-mean(column)` ([#1058](https://github.com/rasbt/mlxtend/issues/1058) via [jbbqqf](https://github.com/jbbqqf)) + - Reject `min_support` values outside the documented `(0, 1]` interval in `apriori`, `fpgrowth`, `fpmax`, and `hmine`. The previous check only caught `<= 0`, so passing e.g. `min_support=2` silently returned an empty result ([#864](https://github.com/rasbt/mlxtend/issues/864) via [jbbqqf](https://github.com/jbbqqf)) - Add a `top_k` argument to `ExhaustiveFeatureSelector.get_metric_dict()` so callers can request only the highest-scoring subsets before converting the result to a DataFrame ([#610](https://github.com/rasbt/mlxtend/issues/610) via [jbbqqf](https://github.com/jbbqqf)) diff --git a/mlxtend/preprocessing/scaling.py b/mlxtend/preprocessing/scaling.py index e7d1bb9b2..3ee266bf9 100644 --- a/mlxtend/preprocessing/scaling.py +++ b/mlxtend/preprocessing/scaling.py @@ -139,9 +139,16 @@ def standardize(array, columns=None, ddof=0, return_params=False, params=None): } are_constant = np.all(ary_newt[:, columns] == ary_newt[0, columns], axis=0) + # For constant columns the standard deviation is 0 (or NaN with some ddof + # values), so dividing by it would propagate NaNs / Infs. 
Forcing std to + # 1.0 means the subtraction (col - mean) below collapses the column to + # 0.0 (up to rounding of the mean), matching the contract in the "Notes" section + # ("If all values in a given column are the same, these values are all + # set to 0.0"). The previous version also pre-zeroed the column before + # the divide, but that turned (0 - mean) / 1 into -mean instead of 0 + # -- see issue #1058. for c, b in zip(columns, are_constant): if b: - ary_newt[:, c] = np.zeros(dim[0]) parameters["stds"][c] = 1.0 ary_newt[:, columns] = (ary_newt[:, columns] - parameters["avgs"]) / parameters[ diff --git a/mlxtend/preprocessing/tests/test__scaling__standardizing.py b/mlxtend/preprocessing/tests/test__scaling__standardizing.py index 29c04bf61..8b718cb59 100644 --- a/mlxtend/preprocessing/tests/test__scaling__standardizing.py +++ b/mlxtend/preprocessing/tests/test__scaling__standardizing.py @@ -164,3 +164,37 @@ def test_standardize_all_columns_pandas(): ] ) np.testing.assert_allclose(df_out1.values, ary_out1, rtol=1e-03) + + +def test_standardize_constant_column_numpy_issue_1058(): + # Regression test for #1058: a constant column was being mapped to + # `-mean(column)` instead of `0.0`. The "Notes" docstring promises that + # constant columns are set to 0.0. + ary = np.array([[0, 1, 2, 5], [1, 2, 3, 5], [3, 1, 2, 5]], dtype=float) + ary_actu = standardize(ary) + # The 4th column is the constant one and must be all-zero. + np.testing.assert_allclose(ary_actu[:, 3], np.zeros(3)) + # Sanity check: the non-constant columns stay close to z-score scale + # (mean 0, std 1 with ddof=0). We only assert mean ~= 0 to avoid + # depending on the exact std implementation. + np.testing.assert_allclose(ary_actu[:, :3].mean(axis=0), np.zeros(3), atol=1e-9) + + +def test_standardize_constant_column_pandas_issue_1058(): + # Same regression as the numpy variant, exercised through the pandas + # branch of standardize(). 
+ df = pd.DataFrame( + {"x": [0.0, 1.0, 3.0], "y": [1.0, 2.0, 1.0], "k": [5.0, 5.0, 5.0]} + ) + df_actu = standardize(df, ["x", "y", "k"]) + np.testing.assert_allclose(df_actu["k"].values, np.zeros(3)) + + +def test_standardize_constant_column_returns_unit_std_param_issue_1058(): + # The contract from the "Notes" docstring is that the std for a constant + # column ends up as 1.0 in the returned `params` dict, so the params can + # be reused by a subsequent call without dividing by zero. + ary = np.array([[5.0, 1.0], [5.0, 2.0], [5.0, 3.0]]) + out, params = standardize(ary, return_params=True) + np.testing.assert_allclose(out[:, 0], np.zeros(3)) + assert params["stds"][0] == 1.0