diff --git a/python/interpret-core/interpret/glassbox/_ebm/_merge_ebms.py b/python/interpret-core/interpret/glassbox/_ebm/_merge_ebms.py
index ca2cd54a5..165a62666 100644
--- a/python/interpret-core/interpret/glassbox/_ebm/_merge_ebms.py
+++ b/python/interpret-core/interpret/glassbox/_ebm/_merge_ebms.py
@@ -4,7 +4,7 @@
 import logging
 import warnings
 from itertools import chain, count
-from math import isnan
+from math import isnan, prod
 
 import numpy as np
 
@@ -47,15 +47,15 @@ def _harmonize_tensor(
     # greater than the old model's lowest cut.
     # eg:  new:    |    |    |    |    |
     #      old:    |        |    |
-    #      other1: |  |     proprotion   |
+    #      other1: |  |     proportion   |
     #      other2: |     proportion      |
     # One wrinkle is that for pairs, we'll be using the pair cuts and we need to
-    # one-dimensionalize any existing pair weights onto their respective 1D axies
-    # before proportionating them. Annother issue is that we might not even have
+    # one-dimensionalize any existing pair weights onto their respective 1D axes
+    # before proportioning them. Another issue is that we might not even have
     # another term_feature that uses some particular feature that we use in our model
     # so we don't have any weights. We can solve that issue by dropping any feature's
     # bins for terms that we have no information for. After we do this we'll have
-    # guaranteed that we only have new bin cuts for feature axies that we have inside
+    # guaranteed that we only have new bin cuts for feature axes that we have inside
     # the bin level that we're handling!
 
     old_feature_idxs = list(old_feature_idxs)
@@ -241,7 +241,7 @@ def _harmonize_tensor(
         map_bins[bin_idx]
         for map_bins, bin_idx in zip(mapping, old_reversed_bin_idxs)
     ]
-    n_cells2 = np.prod([len(x) for x in cell_map])
+    n_cells2 = prod(map(len, cell_map))
    val = 0 if n_multiclasses == 1 else np.zeros(n_multiclasses, np.float64)
     total_weight = 0.0
     for cell2_idx in range(n_cells2):
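The np.prod to math.prod swap in the @@ -241 hunk is more than style: math.prod consumes the generator directly, skipping the temporary list and array, and it returns an exact arbitrary-precision Python int, whereas np.prod goes through a fixed-width numpy integer that can wrap silently for very large products. The cell counts here are far too small to overflow; the snippet below just illustrates the semantic difference:

from math import prod
import numpy as np

lengths = [100_000] * 4
print(np.prod(lengths))  # 7766279631452241920 on a typical 64-bit build (silent int64 wraparound)
print(prod(lengths))     # 100000000000000000000, an exact arbitrary-precision Python int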
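For readers new to this file, the "proportion" diagrams in the @@ -47 hunk describe spreading each old bin's weight across whichever new bins overlap it. Below is a deliberately simplified 1D sketch of that idea; redistribute_weights is a hypothetical helper that proportions purely by interval length, while the real _harmonize_tensor also handles pairs, missing weights, and the other cases the comment describes:

import numpy as np

def redistribute_weights(old_cuts, old_weights, new_cuts, low, high):
    # Build explicit bin edges from the cut points and the feature bounds,
    # then give each new bin the fraction of every old bin's weight that
    # corresponds to their overlapping interval length.
    old_edges = np.concatenate(([low], old_cuts, [high]))
    new_edges = np.concatenate(([low], new_cuts, [high]))
    new_weights = np.zeros(len(new_edges) - 1, np.float64)
    for i, w in enumerate(old_weights):
        o_lo, o_hi = old_edges[i], old_edges[i + 1]
        for j in range(len(new_weights)):
            overlap = min(o_hi, new_edges[j + 1]) - max(o_lo, new_edges[j])
            if overlap > 0.0:
                new_weights[j] += w * overlap / (o_hi - o_lo)
    return new_weights

# Old bins [0, 5) and [5, 10) with weights 40/60, re-cut at 2.5/5/7.5:
# redistribute_weights([5.0], [40.0, 60.0], [2.5, 5.0, 7.5], 0.0, 10.0)
# -> array([20., 20., 30., 30.])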
@@ -416,7 +416,7 @@ def merge_ebms(models):
 
     # TODO: every time we merge models we fragment the bins more and more and this is undesirable
     # especially for pairs. When we build models, we store the feature bin cuts for pairs even
-    # if we have no pairs that use that paritcular feature as a pair. We can eliminate these useless
+    # if we have no pairs that use that particular feature as a pair. We can eliminate these useless
     # pair feature cuts before merging the bins and that'll give us less resulting cuts. Having less
     # cuts reduces the number of estimates that we need to make and reduces the complexity of the
     # tensors, so it's good to have this reduction.
@@ -470,7 +470,7 @@ def merge_ebms(models):
             # order and also handling merged categories (where two categories map to a single score)
             # We should first try to progress in order along each set of keys and see if we can
             # establish the perfect order which might work if there are isolated missing categories
-            # and if we can't get a unique guaranteed sorted order that way then examime all the
+            # and if we can't get a unique guaranteed sorted order that way then examine all the
             # different known sort order and figure out if any of the possible orderings match
             merged_bins = dict(zip(merged_keys, count(1)))
         else:
@@ -550,7 +550,7 @@ def merge_ebms(models):
     ):
         if hasattr(ebm, "feature_bounds_"):
             # TODO: estimate the histogram bin counts by taking the min of the mins and the max of the maxes
-            # and re-apportioning the counts based on the distributions of the previous histograms. Proprotion
+            # and re-apportioning the counts based on the distributions of the previous histograms. Proportion
             # them to the floor of their counts and then assign any remaining integers based on how much
             # they reduce the RMSE of the integer counts from the ideal floating point counts.
             pass
@@ -623,7 +623,7 @@ def merge_ebms(models):
 
     # TODO: in the future we might at this point try and figure out the most
     # common feature ordering within the terms. Take the mode first
-    # and amonst the orderings that tie, choose the one that's best sorted by
+    # and amongst the orderings that tie, choose the one that's best sorted by
     # feature indexes
     ebm.term_features_ = sorted_fgs
 
@@ -634,26 +634,26 @@ def merge_ebms(models):
     # interaction mismatches where an interaction will be in one model, but not the other.
     # We need to estimate the bin_weight_ tensors that would have existed in this case.
     # We'll use the interaction terms that we do have in other models to estimate the
-    # distribution in the essense of the data, which should be roughly consistent or you
+    # distribution in the essence of the data, which should be roughly consistent or you
     # shouldn't be attempting to merge the models in the first place. We'll then scale
-    # the percentage distribution by the total weight of the model that we're fillin in the
+    # the percentage distribution by the total weight of the model that we're filling in the
     # details for.
     #
     # TODO: this algorithm has some problems. The estimated tensor that we get by taking the
     # model weight and distributing it by a per-cell percentage measure means that we get
-    # inconsistent weight distibutions along the axis. We can take our resulting weight tensor
+    # inconsistent weight distributions along the axis. We can take our resulting weight tensor
     # and sum the columns/rows to get the weights on each individual feature axis. Our model
     # however comes with a known set of weights on each feature, and the result of our operation
     # will not match the existing distribution in almost all cases. I think there might be
     # some algorithm where we start with the per-feature weights and use the distribution hints
     # from the other models to inform where we place our exact weights that we know about in our
-    # model from each axis. The problem is that the sums in both axies need to agree, and each
+    # model from each axis. The problem is that the sums in both axes need to agree, and each
     # change we make influences both. I'm not sure we can even guarantee that there is an answer
     # and if there was one I'm not sure how we'd go about generating it. I'm going to leave
     # this problem for YOU: a future person who is smarter than me and has more time to solve this.
     # One hint: I think a possible place to start would be an iterative algorithm that's similar
     # to purification where you randomly select a row/column and try to get closer at each step
-    # to the rigth answer. Good luck!
+    # to the right answer. Good luck!
     #
     # Oh, there's also another deeper problem.. let's say you had a crazy 5 way interaction in the
     # model eg: (0,1,2,3,4) and you had 2 and 3 way interactions that either overlap or not.
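The histogram TODO in the @@ -550 hunk is describing largest-remainder apportionment: floor the ideal floating-point counts, then hand the leftover integer units to the cells where an extra unit most reduces the squared error, which are exactly the cells with the largest fractional remainders. A minimal sketch with a hypothetical apportion helper, assuming the ideal counts already sum to total and breaking ties arbitrarily:

import numpy as np

def apportion(ideal, total):
    # Floor the ideal (floating point) counts, then assign the remaining
    # integers to the largest fractional remainders: bumping a cell with
    # remainder r changes its squared error by (1 - r)**2 - r**2 = 1 - 2*r,
    # so the largest r gives the biggest RMSE reduction.
    counts = np.floor(ideal).astype(np.int64)
    remaining = int(total) - int(counts.sum())
    largest_first = np.argsort(ideal - counts)[::-1]
    counts[largest_first[:remaining]] += 1
    return counts

# apportion(np.array([2.6, 1.7, 0.7]), 5) -> array([2, 2, 1])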
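The iterative row/column idea hinted at in the @@ -634 hunk resembles iterative proportional fitting (raking). Below is a minimal 2D sketch with a hypothetical fit_marginals helper, under the assumption that a feasible tensor exists; it does not answer the existence and convergence questions the comment raises:

import numpy as np

def fit_marginals(seed, row_totals, col_totals, n_iter=100):
    # Iterative proportional fitting: start from the estimated weight tensor
    # and alternately rescale rows then columns toward the known per-feature
    # weights. Each pass fixes one set of marginal sums while disturbing the
    # other; with compatible marginals the disturbance shrinks toward zero.
    w = np.array(seed, np.float64)
    for _ in range(n_iter):
        rows = w.sum(axis=1, keepdims=True)
        w = np.divide(w * row_totals[:, None], rows, out=np.zeros_like(w), where=rows > 0)
        cols = w.sum(axis=0, keepdims=True)
        w = np.divide(w * col_totals[None, :], cols, out=np.zeros_like(w), where=cols > 0)
    return w

# fit_marginals(np.ones((2, 2)), np.array([1.0, 3.0]), np.array([2.0, 2.0]))
# -> rows sum to [1, 3] and columns sum to [2, 2]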
@@ -731,7 +731,7 @@ def merge_ebms(models):
                     model.bagged_scores_[term_idx][bag_idx],
                     model.bin_weights_[
                         term_idx
-                    ],  # we use these to weigh distribution of scores for mulple bins
+                    ],  # we use these to weigh distribution of scores for multiple bins
                 )
                 new_bagged_scores.append(harmonized_bagged_scores)
         ebm.bin_weights_.append(np.sum(new_bin_weights, axis=0))
@@ -768,7 +768,7 @@ def merge_ebms(models):
 
     # TODO: we might be able to do these operations earlier
     remove_extra_bins(ebm.term_features_, ebm.bins_)
 
-    # dependent attributes (can be re-derrived after serialization)
+    # dependent attributes (can be re-derived after serialization)
     ebm.n_features_in_ = len(ebm.bins_)  # scikit-learn specified name
     ebm.term_names_ = generate_term_names(ebm.feature_names_in_, ebm.term_features_)