Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/sources/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ The CHANGELOG for the current development version is available at

- Fixes an edge-case bug where decision region plots didn't have unique colors ([#1157](https://github.com/rasbt/mlxtend/issues/1157) via [mariam851](https://github.com/mariam851))

- Fix `preprocessing.standardize` so a constant column is mapped to all-zeros (as the docstring promises) instead of `-mean(column)` ([#1058](https://github.com/rasbt/mlxtend/issues/1058) via [jbbqqf](https://github.com/jbbqqf))

- Reject `min_support` values outside the documented `(0, 1]` interval in `apriori`, `fpgrowth`, `fpmax`, and `hmine`. The previous check only caught `<= 0`, so passing e.g. `min_support=2` silently returned an empty result ([#864](https://github.com/rasbt/mlxtend/issues/864) via [jbbqqf](https://github.com/jbbqqf))

- Add a `top_k` argument to `ExhaustiveFeatureSelector.get_metric_dict()` so callers can request only the highest-scoring subsets before converting the result to a DataFrame ([#610](https://github.com/rasbt/mlxtend/issues/610) via [jbbqqf](https://github.com/jbbqqf))
Expand Down
9 changes: 8 additions & 1 deletion mlxtend/preprocessing/scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,16 @@ def standardize(array, columns=None, ddof=0, return_params=False, params=None):
}
are_constant = np.all(ary_newt[:, columns] == ary_newt[0, columns], axis=0)

# For constant columns the standard deviation is 0 (or NaN with some ddof
# values), so dividing by it would propagate NaNs / Infs. Forcing std to
# 1.0 means the subtraction (col - mean) below collapses the column to
# exactly 0.0, matching the contract documented in the "Notes" section
# ("If all values in a given column are the same, these values are all
# set to 0.0"). The previous version also pre-zeroed the column before
# the divide, but that turned (0 - mean) / 1 into -mean instead of 0
# -- see issue #1058.
for c, b in zip(columns, are_constant):
if b:
ary_newt[:, c] = np.zeros(dim[0])
parameters["stds"][c] = 1.0

ary_newt[:, columns] = (ary_newt[:, columns] - parameters["avgs"]) / parameters[
Expand Down
34 changes: 34 additions & 0 deletions mlxtend/preprocessing/tests/test__scaling__standardizing.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,37 @@ def test_standardize_all_columns_pandas():
]
)
np.testing.assert_allclose(df_out1.values, ary_out1, rtol=1e-03)


def test_standardize_constant_column_numpy_issue_1058():
    """Regression test for #1058 using a plain NumPy array.

    A column whose values are all identical used to come back as
    ``-mean(column)``; it must come back as all zeros instead.
    """
    data = np.array(
        [
            [0, 1, 2, 5],
            [1, 2, 3, 5],
            [3, 1, 2, 5],
        ],
        dtype=float,
    )
    scaled = standardize(data)

    # Only the last column (index 3) is constant, so it must be zeroed out.
    constant_column = scaled[:, 3]
    np.testing.assert_allclose(constant_column, np.zeros(3))

    # The remaining columns only need to be centred; the exact spread is
    # deliberately not asserted so the test does not depend on how the
    # standard deviation is computed.
    varying_means = scaled[:, :3].mean(axis=0)
    np.testing.assert_allclose(varying_means, np.zeros(3), atol=1e-9)


def test_standardize_constant_column_pandas_issue_1058():
    """Regression test for #1058 using a pandas DataFrame.

    Mirrors the NumPy variant: the constant column ``"k"`` must be all
    zeros after standardization.
    """
    column_names = ["x", "y", "k"]
    frame = pd.DataFrame(
        {
            "x": [0.0, 1.0, 3.0],
            "y": [1.0, 2.0, 1.0],
            "k": [5.0, 5.0, 5.0],
        }
    )

    scaled = standardize(frame, column_names)

    constant_values = scaled["k"].values
    np.testing.assert_allclose(constant_values, np.zeros(3))


def test_standardize_constant_column_returns_unit_std_param_issue_1058():
    """Check the params returned for a constant column (issue #1058).

    With ``return_params=True`` the constant first column must be scaled
    to zeros, and the reported standard deviation for it must be exactly
    1.0 rather than 0, so that reusing the params does not divide by zero.
    """
    data = np.array(
        [
            [5.0, 1.0],
            [5.0, 2.0],
            [5.0, 3.0],
        ]
    )

    scaled, fitted_params = standardize(data, return_params=True)

    np.testing.assert_allclose(scaled[:, 0], np.zeros(3))
    first_column_std = fitted_params["stds"][0]
    assert first_column_std == 1.0
Loading