Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/sources/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ The CHANGELOG for the current development version is available at

- Fixes an edge-case bug where decision regions plots didn't have unique colors ([#1157](https://github.com/rasbt/mlxtend/issues/1157) via [mariam851](https://github.com/mariam851))

- Adds a `top_k` argument to `ExhaustiveFeatureSelector.get_metric_dict()` so callers can request only the highest-scoring subsets before converting the result to a DataFrame ([#610](https://github.com/rasbt/mlxtend/issues/610) via [jbbqqf](https://github.com/jbbqqf))

- `minmax_scaling` no longer returns silent NaNs for constant columns; constant columns are now collapsed to `min_val`, mirroring the existing contract of `standardize`. ([#1167](https://github.com/rasbt/mlxtend/issues/1167) via [jbbqqf](https://github.com/jbbqqf))

- `bias_variance_decomp` now accepts pandas DataFrames and Series as input, in addition to NumPy arrays. ([#1070](https://github.com/rasbt/mlxtend/issues/1070) via [berns722](https://github.com/berns722))
Expand Down
30 changes: 28 additions & 2 deletions mlxtend/feature_selection/exhaustive_feature_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,14 +556,25 @@ def fit_transform(self, X, y, groups=None, **fit_params):
self.fit(X, y, groups=groups, **fit_params)
return self.transform(X)

def get_metric_dict(self, confidence_interval=0.95):
def get_metric_dict(self, confidence_interval=0.95, top_k=None):
"""Return metric dictionary

Parameters
----------
confidence_interval : float (default: 0.95)
A positive float between 0.0 and 1.0 to compute the confidence
interval bounds of the CV score averages.
top_k : int or None (default: None)
If a positive integer, restrict the returned dictionary to the
top-`top_k` feature subsets ranked by `avg_score` descending.
ExhaustiveFeatureSelector can produce a very large number of
evaluated subsets, and downstream consumers (notably
``pd.DataFrame.from_dict(..., orient='index')``) often only need
the highest-scoring entries. ``top_k`` lets callers cap the
returned dictionary before such conversions without
re-implementing the ranking themselves (issue #610).
``None`` (default) preserves the historical behaviour and
returns all subsets.

Returns
----------
Expand All @@ -580,7 +591,22 @@ def get_metric_dict(self, confidence_interval=0.95):

"""
self._check_fitted()
fdict = deepcopy(self.subsets_)
if top_k is not None:
if not isinstance(top_k, (int, np.integer)) or top_k <= 0:
raise ValueError(
"`top_k` must be a positive integer or None. " "Got %r." % (top_k,)
)
# Preserve the original iteration keys so downstream code can
# still cross-reference `subsets_` using the same keys.
subset_keys = sorted(
self.subsets_,
key=lambda k: self.subsets_[k]["avg_score"],
reverse=True,
)[:top_k]
fdict = {k: deepcopy(self.subsets_[k]) for k in subset_keys}
else:
fdict = deepcopy(self.subsets_)

for k in fdict:
std_dev = np.std(self.subsets_[k]["cv_scores"])
bound, std_err = self._calc_confidence(
Expand Down
118 changes: 118 additions & 0 deletions mlxtend/feature_selection/tests/test_exhaustive_feature_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,3 +843,121 @@ def test_logistic_regression_compatibility():

assert efs.best_idx_ == (3,)
assert efs.best_score_ > 0.90


def test_get_metric_dict_top_k_returns_top_subsets_issue_610():
    """Regression test for #610: ``top_k`` keeps only the best subsets.

    Fits an exhaustive selector on iris, then checks that
    ``get_metric_dict(top_k=3)`` returns exactly the three entries with the
    highest ``avg_score`` and that every scalar metric of those entries is
    identical to the corresponding entry of the unrestricted dictionary.
    """
    knn = KNeighborsClassifier(n_neighbors=4)
    iris = load_iris()
    X, y = iris.data, iris.target
    efs = EFS(
        knn,
        min_features=1,
        max_features=3,
        scoring="accuracy",
        cv=3,
        clone_estimator=False,
        print_progress=False,
        n_jobs=1,
    )
    efs.fit(X, y)

    full = efs.get_metric_dict()
    assert len(full) > 3, "test setup expects more than 3 subsets evaluated"

    top3 = efs.get_metric_dict(top_k=3)
    assert len(top3) == 3, "top_k=3 should return exactly 3 entries"

    # The returned subsets are exactly the 3 highest-scoring ones from `full`.
    # `sorted` is stable, so ties are broken by insertion order here just as
    # they are inside `get_metric_dict`.
    expected_top = sorted(full, key=lambda k: full[k]["avg_score"], reverse=True)[:3]
    assert set(top3) == set(
        expected_top
    ), "top_k did not return the highest-scoring subsets"

    # Every scalar metric must match the corresponding entry in the full
    # dict. `cv_scores` is left out on purpose: it is array-valued, so a
    # plain `==` would not yield a single boolean.
    compared_fields = ("feature_idx", "avg_score", "ci_bound", "std_dev", "std_err")
    for k in top3:
        for field in compared_fields:
            assert (
                top3[k][field] == full[k][field]
            ), "field %r differs for subset key %r" % (field, k)
        # `feature_names` is only compared when the selector recorded it.
        if "feature_names" in full[k]:
            assert top3[k]["feature_names"] == full[k]["feature_names"]


def test_get_metric_dict_top_k_none_preserves_default_behavior_issue_610():
    """Passing ``top_k=None`` explicitly yields the same keys as omitting it."""
    dataset = load_iris()
    features, target = dataset.data, dataset.target
    classifier = KNeighborsClassifier(n_neighbors=4)

    selector = EFS(
        classifier,
        min_features=1,
        max_features=2,
        scoring="accuracy",
        cv=2,
        clone_estimator=False,
        print_progress=False,
        n_jobs=1,
    )
    selector.fit(features, target)

    # Compare the key views of the implicit-default and explicit-None calls.
    keys_without_arg = selector.get_metric_dict().keys()
    keys_with_none = selector.get_metric_dict(top_k=None).keys()
    assert keys_without_arg == keys_with_none


def test_get_metric_dict_top_k_invalid_raises_issue_610():
    """Zero, negative and non-integer ``top_k`` values raise ``ValueError``."""
    dataset = load_iris()
    features, target = dataset.data, dataset.target
    classifier = KNeighborsClassifier(n_neighbors=4)

    selector = EFS(
        classifier,
        min_features=1,
        max_features=2,
        scoring="accuracy",
        cv=2,
        clone_estimator=False,
        print_progress=False,
        n_jobs=1,
    )
    selector.fit(features, target)

    # Same three rejected values as before, checked in the same order.
    expected_message = "`top_k` must be a positive integer or None"
    for invalid_top_k in (0, -2, 1.5):
        assert_raises(
            ValueError,
            expected_message,
            selector.get_metric_dict,
            top_k=invalid_top_k,
        )


def test_get_metric_dict_top_k_larger_than_total_returns_all_issue_610():
    """A ``top_k`` above the number of evaluated subsets returns every entry.

    Boundary case: an oversized ``top_k`` is not an error.
    """
    dataset = load_iris()
    features, target = dataset.data, dataset.target
    classifier = KNeighborsClassifier(n_neighbors=4)

    selector = EFS(
        classifier,
        min_features=1,
        max_features=2,
        scoring="accuracy",
        cv=2,
        clone_estimator=False,
        print_progress=False,
        n_jobs=1,
    )
    selector.fit(features, target)

    all_entries = selector.get_metric_dict()
    capped_entries = selector.get_metric_dict(top_k=10**6)
    assert capped_entries.keys() == all_entries.keys()
Loading